easy_ml 0.2.0.pre.rc76 → 0.2.0.pre.rc78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/models_controller.rb +3 -2
- data/app/frontend/components/ModelForm.tsx +16 -0
- data/app/frontend/components/ScheduleModal.tsx +0 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +7 -6
- data/app/jobs/easy_ml/application_job.rb +1 -0
- data/app/jobs/easy_ml/batch_job.rb +47 -6
- data/app/jobs/easy_ml/compute_feature_job.rb +10 -10
- data/app/jobs/easy_ml/reaper.rb +14 -10
- data/app/jobs/easy_ml/refresh_dataset_job.rb +2 -0
- data/app/jobs/easy_ml/sync_datasource_job.rb +1 -0
- data/app/models/concerns/easy_ml/dataframe_serialization.rb +1 -17
- data/app/models/easy_ml/column/imputers/base.rb +1 -1
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -5
- data/app/models/easy_ml/column/imputers/today.rb +1 -1
- data/app/models/easy_ml/column/selector.rb +0 -8
- data/app/models/easy_ml/column.rb +1 -1
- data/app/models/easy_ml/dataset/learner/base.rb +2 -2
- data/app/models/easy_ml/dataset/learner/eager.rb +3 -1
- data/app/models/easy_ml/dataset/learner/lazy.rb +4 -1
- data/app/models/easy_ml/dataset/refresh_reasons.rb +12 -0
- data/app/models/easy_ml/dataset.rb +29 -76
- data/app/models/easy_ml/datasource.rb +0 -6
- data/app/models/easy_ml/feature.rb +27 -38
- data/app/models/easy_ml/model.rb +20 -2
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +3 -2
- data/app/models/easy_ml/models/xgboost.rb +52 -36
- data/app/models/easy_ml/retraining_run.rb +1 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +1 -1
- data/app/serializers/easy_ml/model_serializer.rb +1 -0
- data/lib/easy_ml/core/tuner.rb +7 -4
- data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
- data/lib/easy_ml/data/dataset_manager/reader/base.rb +80 -0
- data/lib/easy_ml/data/dataset_manager/reader/batch.rb +106 -0
- data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb +23 -0
- data/lib/easy_ml/data/dataset_manager/reader/file.rb +75 -0
- data/lib/easy_ml/data/dataset_manager/reader.rb +58 -0
- data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +67 -0
- data/lib/easy_ml/data/dataset_manager/writer/base.rb +139 -0
- data/lib/easy_ml/data/dataset_manager/writer/named.rb +14 -0
- data/lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb +15 -0
- data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +150 -0
- data/lib/easy_ml/data/dataset_manager/writer.rb +80 -0
- data/lib/easy_ml/data/dataset_manager.rb +140 -0
- data/lib/easy_ml/data/partition/boundaries.rb +60 -0
- data/lib/easy_ml/data/partition.rb +7 -0
- data/lib/easy_ml/data/polars_column.rb +19 -5
- data/lib/easy_ml/data/synced_directory.rb +1 -2
- data/lib/easy_ml/data.rb +2 -0
- data/lib/easy_ml/engine.rb +16 -14
- data/lib/easy_ml/feature_store.rb +21 -188
- data/lib/easy_ml/reasons.rb +41 -0
- data/lib/easy_ml/support/lockable.rb +1 -5
- data/lib/easy_ml/version.rb +1 -1
- data/lib/easy_ml.rb +1 -1
- data/public/easy_ml/assets/.vite/manifest.json +1 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Bbf3mD_b.js +522 -0
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-B1qLZuyu.js.map → Application.tsx-Bbf3mD_b.js.map} +1 -1
- metadata +24 -9
- data/app/models/easy_ml/datasources/polars_datasource.rb +0 -69
- data/lib/easy_ml/data/filter_extensions.rb +0 -31
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-B1qLZuyu.js +0 -522
- /data/app/models/{lineage_history.rb → easy_ml/lineage_history.rb} +0 -0
data/app/models/easy_ml/datasource.rb CHANGED

@@ -22,7 +22,6 @@ module EasyML
     DATASOURCE_OPTIONS = {
       "s3" => "EasyML::Datasources::S3Datasource",
       "file" => "EasyML::Datasources::FileDatasource",
-      "polars" => "EasyML::Datasources::PolarsDatasource",
     }
     DATASOURCE_TYPES = [
       {
@@ -35,11 +34,6 @@ module EasyML
         label: "Local Files",
         description: "Connect to data stored in local files",
       },
-      {
-        value: "polars",
-        label: "Polars DataFrame",
-        description: "In-memory dataframe storage using Polars",
-      },
     ].freeze
     DATASOURCE_NAMES = DATASOURCE_OPTIONS.keys.freeze
     DATASOURCE_CONSTANTS = DATASOURCE_OPTIONS.values.map(&:constantize)
data/app/models/easy_ml/feature.rb CHANGED

@@ -88,6 +88,7 @@ module EasyML
     before_save :update_sha
     after_find :update_from_feature_class
     before_save :update_from_feature_class
+    before_destroy :wipe

     def feature_klass
       feature_class.constantize
@@ -190,34 +191,23 @@ module EasyML
       reader = dataset.raw

       if adapter.respond_to?(:batch)
-
-
-        max_id = array.max
+        series = adapter.batch(reader, self)
+        primary_key = series.name
       else
-
-        begin
-          unless primary_key.present?
-            raise "Couldn't find primary key for feature #{feature_class}, check your feature class"
-          end
-          df = reader.query(select: primary_key)
-        rescue => e
-          raise "Couldn't find primary key #{primary_key.first} for feature #{feature_class}: #{e.message}"
-        end
-        return [] if df.nil?
-
-        min_id = df[primary_key.first].min
-        max_id = df[primary_key.last].max
+        primary_key = self.primary_key
       end

-
-
+      EasyML::Data::Partition::Boundaries.new(
+        reader.data(lazy: true, all_columns: true),
+        primary_key,
+        batch_size
+      ).to_a.map.with_index do |partition, idx|
         {
           feature_id: id,
-          batch_start:
-          batch_end:
+          batch_start: partition[:partition_start],
+          batch_end: partition[:partition_end],
           batch_number: feature_position,
           subbatch_number: idx,
-          parent_batch_id: Random.uuid,
         }
       end
     end
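The batch-building logic now delegates boundary computation to EasyML::Data::Partition::Boundaries (added in this release as data/lib/easy_ml/data/partition/boundaries.rb, not shown in this diff). As a rough illustration of the general idea only, and not the gem's implementation, batch boundaries over a numeric primary key in a Polars lazy frame can be derived like this:

```ruby
require "polars-df"

# Illustrative sketch only (not EasyML's code): compute batch boundaries
# over a numeric primary key in a Polars lazy frame.
def partition_boundaries(lazy_df, primary_key, batch_size)
  stats = lazy_df.select([
    Polars.col(primary_key).min.alias("min_id"),
    Polars.col(primary_key).max.alias("max_id"),
  ]).collect
  min_id = stats["min_id"][0]
  max_id = stats["max_id"][0]

  (min_id..max_id).step(batch_size).map do |start|
    { partition_start: start, partition_end: [start + batch_size - 1, max_id].min }
  end
end
```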
@@ -228,9 +218,16 @@ module EasyML

     def fit(features: [self], async: false)
       ordered_features = features.sort_by(&:feature_position)
-
+      parent_batch_id = Random.uuid
+      jobs = ordered_features.map do |feature|
+        feature.build_batches.map do |batch_args|
+          batch_args.merge(parent_batch_id: parent_batch_id)
+        end
+      end
       job_count = jobs.dup.flatten.size

+      ordered_features.each(&:wipe)
+
       # This is very important! For whatever reason, Resque BatchJob does not properly
       # handle batch finished callbacks for batch size = 1
       if async && job_count > 1
@@ -325,6 +322,7 @@ module EasyML
         params = {
           select: select,
           filter: filter,
+          sort: primary_key,
         }.compact
       else
         params = {}
@@ -438,24 +436,10 @@ module EasyML
     end

     def feature_store
-
-    end
-
-    def upload_remote_files
-      feature_store.upload_remote_files
-    end
-
-    def files
-      feature_store.list_partitions
-    end
-
-    def query(**kwargs)
-      feature_store.query(**kwargs)
+      EasyML::FeatureStore.new(self)
     end

-
-      feature_store.store(df)
-    end
+    delegate :files, :query, :store, :compact, to: :feature_store

     def batch_size
       read_attribute(:batch_size) ||
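The removed wrapper methods are folded into a single ActiveSupport `delegate` call. Roughly, the macro generates forwarding methods equivalent to the sketch below (an illustrative expansion, not the gem's source):

```ruby
# Rough equivalent of: delegate :files, :query, :store, :compact, to: :feature_store
def files(*args, &block)
  feature_store.files(*args, &block)
end

def query(*args, &block)
  feature_store.query(*args, &block)
end

def store(*args, &block)
  feature_store.store(*args, &block)
end

def compact(*args, &block)
  feature_store.compact(*args, &block)
end
```

Because feature_store now returns a fresh EasyML::FeatureStore.new(self), each delegated call operates on a newly built store instance rather than a shared one.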
@@ -466,6 +450,7 @@ module EasyML
     def after_fit
       update_sha

+      feature_store.compact
       updates = {
         fit_at: Time.current,
         needs_fit: false,
@@ -474,6 +459,10 @@ module EasyML
       update!(updates)
     end

+    def unlock!
+      feature_store.unlock!
+    end
+
     UNCONFIGURABLE_COLUMNS = %w(
       id
       dataset_id
data/app/models/easy_ml/model.rb CHANGED
@@ -45,7 +45,7 @@ module EasyML
     MODEL_NAMES = MODEL_OPTIONS.keys.freeze
     MODEL_CONSTANTS = MODEL_OPTIONS.values.map(&:constantize)

-    add_configuration_attributes :task, :objective, :hyperparameters, :callbacks, :metrics
+    add_configuration_attributes :task, :objective, :hyperparameters, :callbacks, :metrics, :weights_column
     MODEL_CONSTANTS.flat_map(&:configuration_attributes).each do |attribute|
       add_configuration_attributes attribute
     end
@@ -179,6 +179,8 @@ module EasyML
     end

     def actually_train(&progress_block)
+      raise untrainable_error unless trainable?
+
       lock_model do
         run = pending_run
         run.wrap_training do
@@ -258,7 +260,7 @@ module EasyML

     def formatted_version
       return nil unless version
-
+      UTC.parse(version).in_time_zone(EasyML::Configuration.timezone).strftime("%B %-d, %Y at %-l:%M %p")
     end

     def last_run_at
@@ -277,6 +279,22 @@ module EasyML
     alias_method :latest_version, :inference_version
     alias_method :deployed, :inference_version

+    def trainable?
+      adapter.trainable?
+    end
+
+    def untrainable_columns
+      adapter.untrainable_columns
+    end
+
+    def untrainable_error
+      %Q(
+        Cannot train dataset containing null values!
+        Apply preprocessing to the following columns:
+        #{untrainable_columns.join(", ")}
+      )
+    end
+
     def predict(xs)
       load_model!
       unless xs.is_a?(XGBoost::DMatrix)
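The new trainable?/untrainable_columns checks are delegated to the model adapter; the XGBoost adapter later in this diff implements them as a per-column null count over the processed dataset. An illustrative, standalone version of that null-count query with plain Polars (column names here are made up):

```ruby
require "polars-df"

# Illustrative: find columns that still contain nulls after preprocessing.
df = Polars::DataFrame.new({
  "age" => [21, nil, 35],
  "income" => [50_000, 60_000, 70_000],
}).lazy

null_counts = df.select(df.columns.map { |c| Polars.col(c).null_count.alias(c) }).collect
columns_with_nulls = null_counts.to_hashes.first.select { |_, count| count > 0 }.keys
# => ["age"]
```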
data/app/models/easy_ml/models/xgboost/evals_callback.rb CHANGED

@@ -36,7 +36,7 @@ module EasyML
        if tuner.present?
          [tuner.x_valid, tuner.y_valid]
        else
-         model.dataset.valid(split_ys: true)
+         model.dataset.valid(split_ys: true, lazy: true)
        end
      end

@@ -47,7 +47,8 @@ module EasyML
        if epoch % log_frequency == 0
          model.adapter.external_model = booster
          x_valid, y_valid = valid_dataset
-
+          x_valid = x_valid.select(model.dataset.col_order(inference: true))
+          @preprocessed ||= model.preprocess(x_valid, y_valid)
          y_pred = model.predict(@preprocessed)
          dataset = model.dataset.valid(all_columns: true)

data/app/models/easy_ml/models/xgboost.rb CHANGED

@@ -421,11 +421,11 @@ module EasyML
      def prepare_data
        if @d_train.nil?
          col_order = dataset.col_order
-         x_sample, y_sample = dataset.train(split_ys: true, limit: 5, select: col_order)
+         x_sample, y_sample = dataset.train(split_ys: true, limit: 5, select: col_order, lazy: true)
          preprocess(x_sample, y_sample) # Ensure we fail fast if the dataset is misconfigured
-         x_train, y_train = dataset.train(split_ys: true, select: col_order)
-         x_valid, y_valid = dataset.valid(split_ys: true, select: col_order)
-         x_test, y_test = dataset.test(split_ys: true, select: col_order)
+         x_train, y_train = dataset.train(split_ys: true, select: col_order, lazy: true)
+         x_valid, y_valid = dataset.valid(split_ys: true, select: col_order, lazy: true)
+         x_test, y_test = dataset.test(split_ys: true, select: col_order, lazy: true)
          @d_train = preprocess(x_train, y_train)
          @d_valid = preprocess(x_valid, y_valid)
          @d_test = preprocess(x_test, y_test)
@@ -434,21 +434,60 @@ module EasyML
        [@d_train, @d_valid, @d_test]
      end

+     def trainable?
+       untrainable_columns.empty?
+     end
+
+     def untrainable_columns
+       df = model.dataset.processed.data(lazy: true)
+
+       columns = df.columns
+       selects = columns.map do |col|
+         Polars.col(col).null_count.alias(col)
+       end
+       null_info = df.select(selects).collect
+       null_info.to_hashes.first.compact
+       col_list = null_info.to_hashes.first.transform_values { |v| v > 0 ? v : nil }.compact.keys
+
+       model.dataset.regular_columns(col_list)
+     end
+
      def preprocess(xs, ys = nil)
        return xs if xs.is_a?(::XGBoost::DMatrix)
+       weights_col = model.weights_column || nil
+
+       if weights_col == model.dataset.target
+         raise ArgumentError, "Weight column cannot be the target column"
+       end
+
+       # Extract feature columns (all columns except label and weight)
+       feature_cols = xs.columns
+       feature_cols -= [weights_col] if weights_col
+       lazy = xs.is_a?(Polars::LazyFrame)
+
+       # Get features, labels and weights
+       features = lazy ? xs.select(feature_cols).collect.to_numo : xs.select(feature_cols).to_numo
+       weights = weights_col ? (lazy ? xs.select(weights_col).collect.to_numo : xs.select(weights_col).to_numo) : nil
+       weights = weights.flatten if weights
+       if ys.present?
+         ys = ys.is_a?(Array) ? Polars::Series.new(ys) : ys
+         labels = lazy ? ys.collect.to_numo.flatten : ys.to_numo.flatten
+       else
+         labels = nil
+       end
+
+       kwargs = {
+         label: labels,
+         weight: weights,
+       }.compact

-       orig_xs = xs.dup
-       column_names = xs.columns
-       xs = _preprocess(xs)
-       ys = ys.nil? ? nil : _preprocess(ys).flatten
-       kwargs = { label: ys }.compact
        begin
-         ::XGBoost::DMatrix.new(
-
+         ::XGBoost::DMatrix.new(features, **kwargs).tap do |dmatrix|
+           dmatrix.feature_names = feature_cols
         end
       rescue StandardError => e
-         problematic_columns =
-         problematic_xs =
+         problematic_columns = xs.schema.select { |k, v| [Polars::Categorical, Polars::String].include?(v) }
+         problematic_xs = lazy ? xs.lazy.select(problematic_columns.keys).collect : xs.select(problematic_columns.keys)
          raise %(
            Error building data for XGBoost.
            Apply preprocessing to columns
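The rewritten preprocess builds the DMatrix straight from Numo arrays, with optional instance weights taken from weights_column, instead of the old row-by-row Ruby conversion. A minimal sketch of the same pattern outside the gem, assuming the polars-df and xgb gems (the column names here are hypothetical):

```ruby
require "polars-df"
require "xgb"

df = Polars::DataFrame.new({
  "feature_a" => [1.0, 2.0, 3.0],
  "feature_b" => [0.5, 0.25, 0.125],
  "sample_weight" => [1.0, 2.0, 1.0],  # hypothetical weights column
  "label" => [0, 1, 0],
})

feature_cols = df.columns - ["label", "sample_weight"]
features = df.select(feature_cols).to_numo      # rows x columns Numo array
labels   = df["label"].to_numo.flatten
weights  = df["sample_weight"].to_numo.flatten

# Weights and labels are passed straight through to the DMatrix.
dmatrix = XGBoost::DMatrix.new(features, label: labels, weight: weights)
dmatrix.feature_names = feature_cols
```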
@@ -501,29 +540,6 @@ module EasyML
        cb_container.after_iteration(@booster, current_iteration, d_train, evals)
      end

-     def _preprocess(df)
-       return df if df.is_a?(Array)
-
-       df.to_a.map do |row|
-         row.values.map do |value|
-           case value
-           when Time
-             value.to_i # Convert Time to Unix timestamp
-           when Date
-             value.to_time.to_i # Convert Date to Unix timestamp
-           when String
-             value
-           when TrueClass, FalseClass
-             value ? 1.0 : 0.0 # Convert booleans to 1.0 and 0.0
-           when Integer
-             value
-           else
-             value.to_f # Ensure everything else is converted to a float
-           end
-         end
-       end
-     end
-
      def initialize_model
        @xgboost_model = model_class.new(n_estimators: @hyperparameters.to_h.dig(:n_estimators))
        if block_given?
data/app/models/easy_ml/retraining_run.rb CHANGED

@@ -150,7 +150,7 @@ module EasyML

      training_model.dataset.refresh
      evaluator = retraining_job.evaluator.symbolize_keys
-     x_test, y_test = training_model.dataset.test(split_ys: true)
+     x_test, y_test = training_model.dataset.test(split_ys: true, all_columns: true)
      y_pred = training_model.predict(x_test)

      metric = evaluator[:metric].to_sym
data/lib/easy_ml/core/tuner.rb CHANGED
@@ -8,7 +8,7 @@ module EasyML
                     :metrics, :objective, :n_trials, :direction, :evaluator,
                     :study, :results, :adapter, :tune_started_at, :x_valid, :y_valid,
                     :project_name, :job, :current_run, :trial_enumerator, :progress_block,
-                    :tuner_job, :dataset
+                    :tuner_job, :dataset, :x_normalized

      def initialize(options = {})
        @model = options[:model]
@@ -73,9 +73,12 @@ module EasyML
        model.task = task

        model.dataset.refresh if model.dataset.needs_refresh?
-       x_valid, y_valid = model.dataset.valid(split_ys: true,
+       x_valid, y_valid = model.dataset.valid(split_ys: true, all_columns: true)
+       x_normalized = model.dataset.normalize(x_valid, inference: true)
+       x_normalized = model.preprocess(x_normalized)
        self.x_valid = x_valid
        self.y_valid = y_valid
+       self.x_normalized = x_normalized
        self.dataset = model.dataset.valid(all_columns: true)
        adapter.tune_started_at = tune_started_at
        adapter.x_valid = x_valid
@@ -99,7 +102,7 @@ module EasyML
        @study.tell(@current_trial, result)
      rescue StandardError => e
        puts EasyML::Event.easy_ml_context(e.backtrace)
-       @tuner_run.update!(status: :failed, hyperparameters:
+       @tuner_run.update!(status: :failed, hyperparameters: model.hyperparameters.to_h)
        puts "Optuna failed with: #{e.message}"
        raise e
      end
@@ -138,7 +141,7 @@ module EasyML
          end
        end

-       y_pred = model.predict(
+       y_pred = model.predict(x_normalized)
        model.metrics = metrics
        metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
        metric = metrics.symbolize_keys.dig(model.evaluator[:metric].to_sym)
data/lib/easy_ml/data/dataset_manager/normalizer.rb  File without changes
data/lib/easy_ml/data/dataset_manager/reader/base.rb ADDED

@@ -0,0 +1,80 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        class Base
+          DEFAULTS = {
+            drop_cols: [],
+            filter: nil,
+            limit: nil,
+            select: nil,
+            unique: nil,
+            sort: nil,
+            descending: false,
+            batch_size: nil,
+            batch_start: nil,
+            batch_key: nil,
+            lazy: false,
+          }
+
+          DEFAULTS.each do |k, _|
+            attr_accessor k
+          end
+          attr_accessor :block, :options, :input
+          attr_accessor :options
+
+          def initialize(options, &block)
+            options = apply_defaults(options)
+            @block = block
+            @options = options
+          end
+
+          def query
+            raise "Not implemented"
+          end
+
+          private
+
+          def apply_defaults(kwargs)
+            options = kwargs.dup
+
+            DEFAULTS.each do |k, default|
+              unless options.key?(k)
+                options[k] = default
+              end
+            end
+
+            options.each do |k, v|
+              send("#{k}=", v)
+            end
+
+            options
+          end
+
+          def query_dataframes(df, schema)
+            num_rows = df.is_a?(Polars::LazyFrame) ? df.select(Polars.length).collect[0, 0] : df.shape[0]
+            return df if num_rows == 0
+
+            # Apply the predicate filter if given
+            df = df.filter(filter) if filter
+            # Apply select columns if provided
+            df = df.select(select) if select.present?
+            df = df.unique if unique
+
+            # Apply sorting if provided
+            df = df.sort(sort, reverse: descending) if sort
+
+            # Apply drop columns
+            drop_cols = self.drop_cols
+            drop_cols &= schema.keys
+            df = df.drop(drop_cols) unless drop_cols.empty?
+
+            # Collect the DataFrame (execute the lazy operations)
+            df = df.limit(limit) if limit
+            lazy ? df : df.collect
+          end
+        end
+      end
+    end
+  end
+end
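Reader::Base mostly normalizes the query options listed in DEFAULTS and applies them to a frame in query_dataframes. The same filter/select/unique/sort/drop/limit pipeline, written directly against a Polars lazy frame for illustration (the path and column names are hypothetical):

```ruby
require "polars-df"

lf = Polars.scan_parquet("datasets/events/*.parquet")  # hypothetical path

result = lf
  .filter(Polars.col("age") > 21)       # filter:
  .select(["id", "age"])                # select:
  .unique                               # unique:
  .sort("id", reverse: false)           # sort: / descending:
  .limit(100)                           # limit:
  .collect                              # lazy: false collects the plan
```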
data/lib/easy_ml/data/dataset_manager/reader/batch.rb ADDED

@@ -0,0 +1,106 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        class Batch < File
+          def query
+            return batch_enumerator unless block.present?
+            return process_batches
+          end
+
+          private
+
+          def batch_enumerator
+            Enumerator.new do |yielder|
+              process_batches do |batch|
+                yielder << batch
+              end
+            end
+          end
+
+          def process_batches(&b)
+            raise "When using batch_size, sort must match primary key (#{batch_key})" if sort.present? && batch_key != sort
+            block = b || self.block
+
+            sort = batch_key
+
+            current_start = get_batch_start
+            final_value = get_final_value
+
+            while current_start < final_value
+              filter = Polars.col(sort) >= current_start
+              batch = query_files(filter: filter, limit: batch_size, lazy: true, sort: sort, descending: descending)
+              block.yield(batch)
+              current_start = File.new(input: input, lazy: true)
+                                  .query
+                                  .filter(filter)
+                                  .sort(sort, reverse: descending)
+                                  .limit(batch_size + 1)
+                                  .sort(sort, reverse: !descending)
+                                  .limit(1)
+                                  .select(sort)
+                                  .collect
+                                  .to_a.first&.dig(sort) || final_value
+            end
+          end
+
+          def query_files(overrides = {})
+            query = options.deep_dup.merge!(overrides).except(:batch_size, :batch_start, :batch_key)
+            File.new(query).query
+          end
+
+          def get_batch_start
+            if batch_start.present?
+              batch_start
+            else
+              get_sorted_batch_keys(descending)
+            end
+          end
+
+          def get_final_value
+            get_sorted_batch_keys(!descending)
+          end
+
+          def get_sorted_batch_keys(descending, filter: nil)
+            query = query_files(lazy: true)
+            query = query.filter(filter) if filter
+            query.sort(batch_key, reverse: descending).limit(1).select(batch_key).collect.to_a.last.dig(batch_key)
+          end
+
+          def batch_key
+            return @batch_key if @batch_key
+
+            lazy_df = lazy_frames([files.first]).first
+            if select
+              # Lazily filter only the selected columns
+              lazy_df = lazy_df.select(select)
+
+              # Lazily compute the unique count for each column and compare with total row count
+              primary_keys = select.select do |col|
+                lazy_df.select(col).unique.collect.height == lazy_df.collect.height
+              end
+            else
+              primary_keys = lazy_df.collect.columns.select do |col|
+                # Lazily count unique values and compare with the total row count
+                lazy_df.select(col).unique.collect.height == lazy_df.collect.height
+              end
+            end
+
+            if primary_keys.count > 1
+              key = primary_keys.detect { |key| key.underscore.split("_").any? { |k| k.match?(/id/) } }
+              if key
+                primary_keys = [key]
+              end
+            end
+
+            if primary_keys.count != 1
+              raise "Unable to determine primary key for dataset"
+            end
+
+            @batch_key = primary_keys.first
+          end
+        end
+      end
+    end
+  end
+end
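Reader::Batch pages through the data by key: it takes batch_size rows at or above the current key, then looks one row past the batch to find the next starting key. An illustrative keyset-pagination sketch of the same idea with plain Polars (eager per batch, unlike the gem's lazy batches, and assuming the key is unique, which is also what the gem's batch_key detection requires):

```ruby
require "polars-df"

# Illustrative only: yield successive batches of `batch_size` rows ordered by `key`.
def each_batch(lf, key, batch_size)
  remaining = lf.sort(key)
  loop do
    batch = remaining.limit(batch_size).collect
    break if batch.height.zero?
    yield batch
    last_key = batch[key].to_a.last
    remaining = remaining.filter(Polars.col(key) > last_key)
  end
end

each_batch(Polars.scan_parquet("datasets/events/*.parquet"), "id", 10_000) do |batch|
  puts batch.height
end
```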
data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb ADDED

@@ -0,0 +1,23 @@
+
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        class DataFrame < File
+          def query
+            return query_dataframes(lazy_frames, schema)
+          end
+
+          def schema
+            input.schema
+          end
+
+          private
+          def lazy_frames
+            input.lazy
+          end
+        end
+      end
+    end
+  end
+end
data/lib/easy_ml/data/dataset_manager/reader/file.rb ADDED

@@ -0,0 +1,75 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        class File < Base
+          attr_accessor :file_filter
+
+          def initialize(options = {})
+            super
+            @file_filter = options.dig(:file_filter) || ->(file) { true }
+          end
+
+          def query
+            return query_dataframes(dataframe, schema) unless batch_size.present?
+            return Batch.new(options, &block).query
+          end
+
+          def schema
+            @schema ||= files.any? ? Polars.read_parquet_schema(files.first) : nil
+          end
+
+          def files
+            filter_files do
+              if is_file?
+                @files ||= [input]
+              elsif is_dir?
+                @files ||= Dir.glob(::File.join(root_dir, "**/*.{parquet}"))
+              else
+                @files ||= []
+              end
+            end
+          end
+
+          private
+
+          def filter_files(&block)
+            yield
+            @files = @files.select(&file_filter)
+          end
+
+          def is_dir?
+            path.directory?
+          end
+
+          def is_file?
+            path.file?
+          end
+
+          def root_dir
+            path if is_dir?
+          end
+
+          def path
+            @path ||= input.is_a?(Pathname) ? input : Pathname.new(input)
+          end
+
+          def dataframe
+            @dataframe = lazy_frames.any? ? Polars.concat(lazy_frames) : Polars::LazyFrame.new
+          end
+
+          def lazy_frames(files = nil)
+            return @lazy_frames if @lazy_frames
+
+            files ||= self.files
+            @lazy_frames = files.map do |file|
+              Polars.scan_parquet(file)
+            end
+          end
+        end
+      end
+    end
+  end
+end
+
+require_relative "batch"
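Reader::File resolves its input to either a single parquet file or every parquet file under a directory, then concatenates the lazy scans. A short usage-style sketch of that concat pattern with plain Polars (the directory path is hypothetical):

```ruby
require "polars-df"

files = Dir.glob(File.join("datasets/raw", "**/*.parquet"))  # hypothetical directory
lazy_frames = files.map { |file| Polars.scan_parquet(file) }
df = lazy_frames.any? ? Polars.concat(lazy_frames) : Polars::LazyFrame.new

puts df.limit(5).collect
```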