easy_ml 0.2.0.pre.rc76 → 0.2.0.pre.rc78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/models_controller.rb +3 -2
- data/app/frontend/components/ModelForm.tsx +16 -0
- data/app/frontend/components/ScheduleModal.tsx +0 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +7 -6
- data/app/jobs/easy_ml/application_job.rb +1 -0
- data/app/jobs/easy_ml/batch_job.rb +47 -6
- data/app/jobs/easy_ml/compute_feature_job.rb +10 -10
- data/app/jobs/easy_ml/reaper.rb +14 -10
- data/app/jobs/easy_ml/refresh_dataset_job.rb +2 -0
- data/app/jobs/easy_ml/sync_datasource_job.rb +1 -0
- data/app/models/concerns/easy_ml/dataframe_serialization.rb +1 -17
- data/app/models/easy_ml/column/imputers/base.rb +1 -1
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -5
- data/app/models/easy_ml/column/imputers/today.rb +1 -1
- data/app/models/easy_ml/column/selector.rb +0 -8
- data/app/models/easy_ml/column.rb +1 -1
- data/app/models/easy_ml/dataset/learner/base.rb +2 -2
- data/app/models/easy_ml/dataset/learner/eager.rb +3 -1
- data/app/models/easy_ml/dataset/learner/lazy.rb +4 -1
- data/app/models/easy_ml/dataset/refresh_reasons.rb +12 -0
- data/app/models/easy_ml/dataset.rb +29 -76
- data/app/models/easy_ml/datasource.rb +0 -6
- data/app/models/easy_ml/feature.rb +27 -38
- data/app/models/easy_ml/model.rb +20 -2
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +3 -2
- data/app/models/easy_ml/models/xgboost.rb +52 -36
- data/app/models/easy_ml/retraining_run.rb +1 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +1 -1
- data/app/serializers/easy_ml/model_serializer.rb +1 -0
- data/lib/easy_ml/core/tuner.rb +7 -4
- data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
- data/lib/easy_ml/data/dataset_manager/reader/base.rb +80 -0
- data/lib/easy_ml/data/dataset_manager/reader/batch.rb +106 -0
- data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb +23 -0
- data/lib/easy_ml/data/dataset_manager/reader/file.rb +75 -0
- data/lib/easy_ml/data/dataset_manager/reader.rb +58 -0
- data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +67 -0
- data/lib/easy_ml/data/dataset_manager/writer/base.rb +139 -0
- data/lib/easy_ml/data/dataset_manager/writer/named.rb +14 -0
- data/lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb +15 -0
- data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +150 -0
- data/lib/easy_ml/data/dataset_manager/writer.rb +80 -0
- data/lib/easy_ml/data/dataset_manager.rb +140 -0
- data/lib/easy_ml/data/partition/boundaries.rb +60 -0
- data/lib/easy_ml/data/partition.rb +7 -0
- data/lib/easy_ml/data/polars_column.rb +19 -5
- data/lib/easy_ml/data/synced_directory.rb +1 -2
- data/lib/easy_ml/data.rb +2 -0
- data/lib/easy_ml/engine.rb +16 -14
- data/lib/easy_ml/feature_store.rb +21 -188
- data/lib/easy_ml/reasons.rb +41 -0
- data/lib/easy_ml/support/lockable.rb +1 -5
- data/lib/easy_ml/version.rb +1 -1
- data/lib/easy_ml.rb +1 -1
- data/public/easy_ml/assets/.vite/manifest.json +1 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Bbf3mD_b.js +522 -0
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-B1qLZuyu.js.map → Application.tsx-Bbf3mD_b.js.map} +1 -1
- metadata +24 -9
- data/app/models/easy_ml/datasources/polars_datasource.rb +0 -69
- data/lib/easy_ml/data/filter_extensions.rb +0 -31
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-B1qLZuyu.js +0 -522
- /data/app/models/{lineage_history.rb → easy_ml/lineage_history.rb} +0 -0
data/lib/easy_ml/data/dataset_manager/reader.rb (new file):

```diff
@@ -0,0 +1,58 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        require_relative "reader/base"
+        require_relative "reader/file"
+        require_relative "reader/data_frame"
+
+        ADAPTERS = [
+          File,
+          DataFrame,
+        ]
+
+        def self.query(input, **kwargs, &block)
+          adapter(input).new(
+            kwargs.merge!(input: input), &block
+          ).query
+        end
+
+        def self.schema(input, **kwargs, &block)
+          adapter(input).new(
+            kwargs.merge!(input: input), &block
+          ).schema
+        end
+
+        def self.files(dir)
+          Dir.glob(::File.join(dir, "**/*.{parquet}"))
+        end
+
+        def self.sha
+          files = sha.sort
+
+          file_hashes = files.map do |file|
+            meta = Polars.read_parquet_schema(file)
+            row_count = Polars.scan_parquet(file).select(Polars.col("*").count).collect[0, 0]
+
+            Digest::SHA256.hexdigest([
+              meta.to_json,
+              row_count.to_s,
+            ].join("|"))
+          end
+
+          Digest::SHA256.hexdigest(file_hashes.join)
+        end
+
+        private
+
+        def self.adapter(input)
+          if input.is_a?(Polars::DataFrame) || input.is_a?(Polars::LazyFrame)
+            DataFrame
+          else
+            File
+          end
+        end
+      end
+    end
+  end
+end
```
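The Reader routes each call through an adapter: Polars `DataFrame`/`LazyFrame` inputs go to the `DataFrame` adapter, everything else (a directory of parquet files) to the `File` adapter. A minimal usage sketch, assuming the easy_ml gem and its Polars dependency are loaded and that `./dataset` holds parquet files previously written by a `DatasetManager`:

```ruby
require "easy_ml"

reader = EasyML::Data::DatasetManager::Reader

# A directory path falls through to the File adapter
lazy_frame = reader.query("./dataset", lazy: true)

# A Polars frame routes to the DataFrame adapter instead
df = Polars::DataFrame.new({ "id" => [1, 2, 3] })
reader.schema(df) # column-to-dtype mapping for the frame
```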
data/lib/easy_ml/data/dataset_manager/writer/append_only.rb (new file):

```diff
@@ -0,0 +1,67 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Writer
+        class AppendOnly < Base
+          attr_accessor :primary_key
+
+          def initialize(options)
+            super
+            @primary_key = options.dig(:primary_key)
+            raise "primary_key required for append_only writer" if primary_key.nil?
+            raise "filenames required: specify the prefix to uuse for unique new files" unless filenames.present?
+          end
+
+          def store
+            # If there are no existing files, just store as normal
+            return super if files.empty?
+
+            # Get existing data lazily
+            existing_keys = query(lazy: true)
+              .select(primary_key)
+              .collect[primary_key]
+              .to_a
+
+            # Convert input to lazy if it isn't already
+            input_data = df.is_a?(Polars::LazyFrame) ? df : df.lazy
+
+            # Filter out records that already exist
+            new_records = input_data.filter(
+              Polars.col(primary_key).is_in(existing_keys).not_
+            )
+
+            # If we have new records, store them
+            if new_records.clone.select(Polars.length).collect[0, 0] > 0
+              @df = new_records
+              store_to_unique_file
+            end
+          end
+
+          def compact
+            files = self.files
+            return if files.empty?
+
+            clear_unique_id
+
+            # Mv existing compacted parquet to a temp file, so it doesn't conflict with write,
+            # but can still be queried
+            compacted_file = File.join(root_dir, "compacted.parquet")
+            if File.exist?(compacted_file)
+              tmp_file = File.join(root_dir, "compacted.orig.parquet")
+              FileUtils.mv(compacted_file, tmp_file)
+            end
+            files = self.files
+
+            compacted_file.tap do |target_file|
+              compacted_data = query(lazy: true).sort(primary_key)
+
+              safe_write(compacted_data, target_file)
+              FileUtils.rm(files)
+              clear_unique_id
+            end
+          end
+        end
+      end
+    end
+  end
+end
```
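In short, the append-only writer loads the primary keys already on disk, filters the incoming frame down to unseen keys, and only then writes a new uniquely named file; `compact` later merges everything into `compacted.parquet`. A hedged sketch of that flow, assuming a Redis-backed Suo client is configured for the `Support::Lockable` sequence locks (paths and values are illustrative):

```ruby
manager = EasyML::Data::DatasetManager.new(
  root_dir: "./events",
  append_only: true,
  primary_key: "id",
  filenames: "events",
)

manager.store(Polars::DataFrame.new({ "id" => [1, 2], "value" => ["a", "b"] }))
# Only id = 3 is new, so only that row is appended
manager.store(Polars::DataFrame.new({ "id" => [2, 3], "value" => ["b", "c"] }))
manager.compact # merge the per-store files into compacted.parquet, sorted by id
```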
data/lib/easy_ml/data/dataset_manager/writer/base.rb (new file):

```diff
@@ -0,0 +1,139 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Writer
+        class Base
+          attr_accessor :filenames, :root_dir, :options, :append_only, :df
+
+          def initialize(options)
+            @root_dir = options.dig(:root_dir)
+            @filenames = options.dig(:filenames)
+            @append_only = options.dig(:append_only)
+            @options = options
+            @df = options.dig(:df)
+          end
+
+          def wipe
+            clear_unique_id
+            FileUtils.rm_rf(root_dir)
+          end
+
+          def store
+            store_to_unique_file
+          end
+
+          def compact
+            files = self.files
+
+            clear_unique_id
+            File.join(root_dir, "compacted.parquet").tap do |target_file|
+              safe_write(
+                query(lazy: true),
+                target_file
+              )
+              FileUtils.rm(files)
+            end
+            clear_unique_id
+          end
+
+          def unlock!
+            clear_all_keys
+          end
+
+          private
+
+          def files
+            DatasetManager.new(options).files
+          end
+
+          def query(**kwargs, &block)
+            DatasetManager.new(options).query(root_dir, **kwargs, &block)
+          end
+
+          def store_to_unique_file(subdir: nil)
+            safe_write(df, unique_path(subdir: subdir))
+          end
+
+          def acquire_lock(key, &block)
+            Support::Lockable.with_lock("#{key}:lock", wait_timeout: 2, &block)
+          end
+
+          def unique_path(subdir: nil)
+            filename = [filenames, unique_id(subdir: subdir), "parquet"].compact.join(".")
+
+            File.join(root_dir, subdir.to_s, filename)
+          end
+
+          def safe_write(df, path)
+            FileUtils.mkdir_p(File.dirname(path))
+            df.is_a?(Polars::LazyFrame) ? df.sink_parquet(path) : df.write_parquet(path)
+            path
+          end
+
+          def clear_all_keys
+            list_keys.each { |key| unlock_file(key) }
+          end
+
+          def unlock_file(key)
+            acquire_lock(key) do |suo|
+              suo.client.del(key)
+            end
+          end
+
+          def clear_unique_id(subdir: nil)
+            key = unique_id_key(subdir: subdir)
+            acquire_lock(key) do |suo|
+              suo.client.del(key)
+            end
+          end
+
+          def unique_id_key(subdir: nil)
+            File.join("dataset_managers", root_dir, subdir.to_s, "sequence")
+          end
+
+          def add_key(key)
+            keylist = unique_id_key(subdir: "keylist")
+
+            acquire_lock(keylist) do |suo|
+              suo.client.sadd(keylist, key)
+            end
+          end
+
+          def list_keys
+            keylist = unique_id_key(subdir: "keylist")
+
+            acquire_lock(keylist) do |suo|
+              if suo.client.type(keylist) == "set"
+                suo.client.smembers(keylist)
+              else
+                suo.client.del(keylist)
+                []
+              end
+            end
+          end
+
+          def key_exists?(key)
+            keylist = unique_id_key(subdir: "keylist")
+
+            acquire_lock(keylist) do |suo|
+              suo.client.sismember(keylist, key)
+            end
+          end
+
+          def unique_id(subdir: nil)
+            key = unique_id_key(subdir: subdir)
+            add_key(key)
+
+            acquire_lock(key) do |suo|
+              redis = suo.client
+
+              seq = (redis.get(key) || "0").to_i
+              redis.set(key, (seq + 1).to_s)
+              seq + 1
+            end
+          end
+        end
+      end
+    end
+  end
+end
```
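Base is where file naming and locking live: each store asks Redis (through `Support::Lockable`/Suo) for the next sequence number and writes `<filenames>.<sequence>.parquet` under `root_dir`, sinking lazy frames and writing eager ones. A small sketch of the resulting layout, assuming Redis is reachable for the sequence counter; the directory name is illustrative:

```ruby
manager = EasyML::Data::DatasetManager.new(root_dir: "./dataset", filenames: "file")

manager.store(Polars::DataFrame.new({ "a" => [1] })) # => ./dataset/file.1.parquet
manager.store(Polars::DataFrame.new({ "a" => [2] })) # => ./dataset/file.2.parquet
manager.compact                                      # => ./dataset/compacted.parquet
```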
data/lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb (new file):

```diff
@@ -0,0 +1,15 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Writer
+        class Partitioned < Base
+          class PartitionReasons < EasyML::Reasons
+            add_reason "Missing primary key", -> { primary_key.nil? }
+            add_reason "Df does not contain primary key", -> { df.columns.exclude?(primary_key) }
+            add_reason "Primary key is not numeric", -> { !numeric_primary_key? }
+          end
+        end
+      end
+    end
+  end
+end
```
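EasyML::Reasons (also new in this release, data/lib/easy_ml/reasons.rb above) is not shown in this diff, but from its use in Partitioned below, `add_reason` appears to register a label plus a predicate evaluated against the writer, with `none?` and `explain` as the query methods. Roughly, at the call site:

```ruby
# Hypothetical illustration based on how Partitioned#store consumes these reasons
reasons = PartitionReasons.new(partitioned_writer) # a Writer::Partitioned instance
if reasons.none?
  # all checks pass: safe to partition by primary key
else
  puts reasons.explain # e.g. "Missing primary key"
end
```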
data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb (new file):

```diff
@@ -0,0 +1,150 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Writer
+        class Partitioned < Base
+          require_relative "partitioned/partition_reasons"
+
+          attr_accessor :partition_size, :partition, :primary_key, :df
+
+          def initialize(options)
+            super
+            @partition_size = options.dig(:partition_size)
+            @partition = options.dig(:partition)
+            @primary_key = options.dig(:primary_key)
+
+            raise "filenames required: specify the prefix to use for unique new files" unless filenames.present?
+          end
+
+          def wipe
+            partitions.each do |partition|
+              FileUtils.rm_rf(File.join(root_dir, partition))
+            end
+            clear_all_keys
+          end
+
+          def store
+            unless can_partition?
+              puts cannot_partition_reasons.explain
+              return Base.new(options).store
+            end
+
+            store_each_partition
+          end
+
+          def compact
+            files = self.files
+            @df = query(lazy: true)
+
+            clear_unique_id(subdir: "compacted")
+            compact_each_partition.tap do
+              FileUtils.rm(files)
+              clear_unique_id
+            end
+          end
+
+          private
+
+          def partitions
+            Dir.glob(File.join(root_dir, "**/*")).map { |f| f.split("/").last }
+          end
+
+          def compact_each_partition
+            with_each_partition do |partition_df, _|
+              safe_write(
+                partition_df.sort(Polars.col(primary_key)),
+                unique_path(subdir: "compacted")
+              )
+            end
+          end
+
+          def with_each_partition(&block)
+            partition_boundaries.map do |partition|
+              partition_start = partition[:partition_start]
+              partition_end = partition[:partition_end]
+              partition_df = df.filter(Polars.col(primary_key).is_between(partition_start, partition_end))
+              num_rows = lazy? ? partition_df.select(Polars.length).collect[0, 0] : partition_df.shape[0]
+
+              binding.pry if num_rows == 0
+              next if num_rows == 0
+              yield partition_df, partition
+            end
+          end
+
+          def store_each_partition
+            with_each_partition do |partition_df, partition|
+              safe_write(
+                partition_df,
+                unique_path(subdir: partition[:partition])
+              )
+            end
+          end
+
+          def partition_boundaries
+            EasyML::Data::Partition::Boundaries.new(df, primary_key, partition_size).to_a
+          end
+
+          def cannot_partition_reasons
+            @cannot_partition_reasons ||= PartitionReasons.new(self)
+          end
+
+          def can_partition?
+            @partitioned ||= cannot_partition_reasons.none?
+          end
+
+          def lazy?
+            df.is_a?(Polars::LazyFrame)
+          end
+
+          def cast_primary_key
+            case dtype_primary_key
+            when Polars::Categorical
+              Polars.col(primary_key).cast(Polars::String)
+            else
+              Polars.col(primary_key)
+            end
+          end
+
+          def dtype_primary_key
+            @dtype_primary_key ||= schema[primary_key]
+          end
+
+          def schema
+            @schema ||= df.schema
+          end
+
+          def min_key
+            return @min_key if @min_key
+
+            if lazy?
+              @min_key = df.select(cast_primary_key).min.collect.to_a[0].dig(primary_key)
+            else
+              @min_key = df[primary_key].min
+            end
+          end
+
+          def max_key
+            return @max_key if @max_key
+
+            if lazy?
+              @max_key = df.select(cast_primary_key).max.collect.to_a[0].dig(primary_key)
+            else
+              @max_key = df[primary_key].max
+            end
+          end
+
+          def numeric_primary_key?
+            begin
+              # We are intentionally not using to_i, so it will raise an error for keys like "A1"
+              min = min_key.is_a?(String) ? Integer(min_key) : min_key
+              max = max_key.is_a?(String) ? Integer(max_key) : max_key
+              min.is_a?(Integer) && max.is_a?(Integer)
+            rescue ArgumentError
+              false
+            end
+          end
+        end
+      end
+    end
+  end
+end
```
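Taken together: Partitioned splits the frame into primary-key ranges computed by `EasyML::Data::Partition::Boundaries`, writes each slice under a per-partition subdirectory, and falls back to the plain Base writer (printing the explanation) when the key is missing or non-numeric. A hedged sketch with an integer primary key; the partition size and paths are illustrative:

```ruby
manager = EasyML::Data::DatasetManager.new(
  root_dir: "./rows",
  primary_key: "id",
  partition_size: 10_000,
  filenames: "rows",
)

df = Polars::DataFrame.new({ "id" => (1..25_000).to_a, "value" => Array.new(25_000, "x") })
manager.store(df) # each key range lands in its own subdirectory under ./rows
manager.compact   # rewrites the partitions under a "compacted" subdir, sorted by id
```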
data/lib/easy_ml/data/dataset_manager/writer.rb (new file):

```diff
@@ -0,0 +1,80 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Writer
+        require_relative "writer/base"
+        require_relative "writer/partitioned"
+        require_relative "writer/append_only"
+        require_relative "writer/named"
+
+        ADAPTERS = [
+          Base,
+          Partitioned,
+          AppendOnly,
+          Named,
+        ]
+
+        attr_accessor :filenames, :root_dir, :partition,
+          :primary_key, :options, :append_only, :named
+
+        def initialize(options)
+          @root_dir = options.dig(:root_dir)
+          @filenames = options.dig(:filenames)
+          @partition = options.dig(:partition) || (options.dig(:partition_size).present? && options.dig(:primary_key).present?)
+          @append_only = options.dig(:append_only)
+          @primary_key = options.dig(:primary_key)
+          @named = options.dig(:named) || false
+          @options = options
+        end
+
+        def unlock!
+          adapter_class.new(options).unlock!
+        end
+
+        def store(df, *args)
+          adapter_class.new(options.merge!(df: df)).store(*args)
+        end
+
+        def wipe
+          adapter_class.new(options).wipe
+        end
+
+        def compact
+          adapter_class.new(options).compact
+        end
+
+        def inspect
+          keys = %w(root_dir append_only partition primary_key)
+          attrs = keys.map { |k| "#{k}=#{send(k)}" unless send(k).nil? }.compact
+          "#<#{self.class.name} #{attrs.join(" ")}>"
+        end
+
+        private
+
+        def adapter_class
+          if partition?
+            Partitioned
+          elsif append_only?
+            AppendOnly
+          elsif named?
+            Named
+          else
+            Base
+          end
+        end
+
+        def named?
+          @named
+        end
+
+        def partition?
+          @partition
+        end
+
+        def append_only?
+          @append_only
+        end
+      end
+    end
+  end
+end
```
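The Writer itself holds no write logic; it inspects the options hash and instantiates the matching adapter on every `store`, `compact`, `wipe`, or `unlock!` call. Illustrative option shapes and where they route:

```ruby
base_opts        = { root_dir: "./a", filenames: "f" }
append_only_opts = base_opts.merge(append_only: true, primary_key: "id")
partitioned_opts = base_opts.merge(primary_key: "id", partition_size: 5_000)

# store/compact on these writers are delegated to Base, AppendOnly, and
# Partitioned respectively (Named is selected via the named: option).
EasyML::Data::DatasetManager::Writer.new(base_opts)
EasyML::Data::DatasetManager::Writer.new(append_only_opts)
EasyML::Data::DatasetManager::Writer.new(partitioned_opts)
```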
data/lib/easy_ml/data/dataset_manager.rb (new file):

```diff
@@ -0,0 +1,140 @@
+module EasyML
+  module Data
+    class DatasetManager
+      require_relative "dataset_manager/writer"
+      require_relative "dataset_manager/reader"
+
+      attr_accessor :root_dir, :partition, :append_only, :filenames, :primary_key,
+        :partition_size, :s3_bucket, :s3_prefix, :s3_access_key_id,
+        :s3_secret_access_key, :polars_args, :source_of_truth,
+        :options
+
+      def initialize(options = {})
+        @root_dir = options.dig(:root_dir)
+        @partition = options.dig(:partition) || (options.dig(:partition_size).present? && options.dig(:primary_key).present?)
+        @append_only = options.dig(:append_only) || false
+        @filenames = options.dig(:filenames) || "file"
+        @primary_key = options.dig(:primary_key)
+        @partition_size = options.dig(:partition_size) || nil
+        @s3_bucket = options.dig(:s3_bucket) || EasyML::Configuration.s3_bucket
+        @s3_prefix = options.dig(:s3_prefix) || nil
+        @s3_access_key_id = options.dig(:s3_access_key_id) || EasyML::Configuration.s3_access_key_id
+        @s3_secret_access_key = options.dig(:s3_secret_access_key) || EasyML::Configuration.s3_secret_access_key
+        @polars_args = options.dig(:polars_args) || {}
+        @source_of_truth = options.dig(:source_of_truth) || :local
+        @options = options
+
+        raise "primary_key required: how should we divide partitions?" if partition && primary_key.nil?
+        raise "partition_size required: specify number of rows in each partition" if partition && partition_size.nil?
+        raise "root_dir required: specify the root_dir of the dataset" unless root_dir.present?
+      end
+
+      def inspect
+        keys = %w(root append_only partition primary_key)
+        attrs = keys.map { |k| "#{k}=#{send(k)}" unless send(k).nil? }.compact
+        "#<#{self.class.name} #{attrs.join("\n\t")}>"
+      end
+
+      class << self
+        def query(input = nil, **kwargs, &block)
+          Reader.query(input, **kwargs, &block)
+        end
+
+        def schema(input = nil, **kwargs, &block)
+          Reader.schema(input, **kwargs, &block)
+        end
+
+        def num_rows
+          Reader.num_rows
+        end
+      end
+
+      def num_rows
+        Reader.num_rows(root_dir)
+      end
+
+      def query(input = nil, **kwargs, &block)
+        input = root_dir if input.nil?
+        DatasetManager.query(input, **kwargs, &block)
+      end
+
+      def schema(input = nil, **kwargs, &block)
+        input = root_dir if input.nil?
+        DatasetManager.schema(input, **kwargs, &block)
+      end
+
+      def sha
+        Reader.sha(root_dir)
+      end
+
+      # Transform CSV files into Parquet files, of all the same datatype.
+      # Learn datatypes of columns and store schema.
+      def normalize
+        Normalizer.normalize(root_dir)
+      end
+
+      def data
+        query
+      end
+
+      def unlock!
+        writer.unlock!
+      end
+
+      def compact
+        writer.compact
+      end
+
+      def store(df, *args)
+        writer.store(df, *args)
+      end
+
+      def cp(from, to)
+        writer.cp(from, to)
+      end
+
+      def empty?
+        files.empty? || query(limit: 1).empty?
+      end
+
+      def files
+        Reader.files(root_dir)
+      end
+
+      def wipe
+        writer.wipe
+      end
+
+      def upload
+        synced_directory.upload
+      end
+
+      def download
+        synced_directory.download
+      end
+
+      private
+
+      def root
+        root_dir.gsub(/^#{Rails.root.to_s}/, "")
+      end
+
+      def writer
+        Writer.new(options)
+      end
+
+      def synced_directory
+        @synced_dir ||= EasyML::Data::SyncedDirectory.new(
+          root_dir: root_dir,
+          source_of_truth: source_of_truth,
+          s3_bucket: s3_bucket,
+          s3_prefix: s3_prefix,
+          s3_access_key_id: s3_access_key_id,
+          s3_secret_access_key: s3_secret_access_key,
+          polars_args: polars_args,
+          cache_for: 0,
+        )
+      end
+    end
+  end
+end
```
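DatasetManager is the public entry point that ties the pieces together: reads go through Reader, writes through Writer, and S3 sync through SyncedDirectory. An end-to-end sketch, assuming S3 credentials are already set on `EasyML::Configuration` (the defaults used above) and Redis is available for the writer locks; the directory and column names are illustrative:

```ruby
manager = EasyML::Data::DatasetManager.new(root_dir: "datasets/sales")

manager.store(Polars::DataFrame.new({ "id" => [1, 2], "amount" => [9.99, 4.5] }))
totals = manager.query(lazy: true).select(Polars.col("amount").sum).collect

manager.compact # consolidate the per-store parquet files
manager.upload  # push the directory to S3 via SyncedDirectory
```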