easy_ml 0.2.0.pre.rc76 → 0.2.0.pre.rc78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/models_controller.rb +3 -2
  3. data/app/frontend/components/ModelForm.tsx +16 -0
  4. data/app/frontend/components/ScheduleModal.tsx +0 -2
  5. data/app/frontend/components/dataset/PreprocessingConfig.tsx +7 -6
  6. data/app/jobs/easy_ml/application_job.rb +1 -0
  7. data/app/jobs/easy_ml/batch_job.rb +47 -6
  8. data/app/jobs/easy_ml/compute_feature_job.rb +10 -10
  9. data/app/jobs/easy_ml/reaper.rb +14 -10
  10. data/app/jobs/easy_ml/refresh_dataset_job.rb +2 -0
  11. data/app/jobs/easy_ml/sync_datasource_job.rb +1 -0
  12. data/app/models/concerns/easy_ml/dataframe_serialization.rb +1 -17
  13. data/app/models/easy_ml/column/imputers/base.rb +1 -1
  14. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -5
  15. data/app/models/easy_ml/column/imputers/today.rb +1 -1
  16. data/app/models/easy_ml/column/selector.rb +0 -8
  17. data/app/models/easy_ml/column.rb +1 -1
  18. data/app/models/easy_ml/dataset/learner/base.rb +2 -2
  19. data/app/models/easy_ml/dataset/learner/eager.rb +3 -1
  20. data/app/models/easy_ml/dataset/learner/lazy.rb +4 -1
  21. data/app/models/easy_ml/dataset/refresh_reasons.rb +12 -0
  22. data/app/models/easy_ml/dataset.rb +29 -76
  23. data/app/models/easy_ml/datasource.rb +0 -6
  24. data/app/models/easy_ml/feature.rb +27 -38
  25. data/app/models/easy_ml/model.rb +20 -2
  26. data/app/models/easy_ml/models/xgboost/evals_callback.rb +3 -2
  27. data/app/models/easy_ml/models/xgboost.rb +52 -36
  28. data/app/models/easy_ml/retraining_run.rb +1 -1
  29. data/app/serializers/easy_ml/dataset_serializer.rb +1 -1
  30. data/app/serializers/easy_ml/model_serializer.rb +1 -0
  31. data/lib/easy_ml/core/tuner.rb +7 -4
  32. data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
  33. data/lib/easy_ml/data/dataset_manager/reader/base.rb +80 -0
  34. data/lib/easy_ml/data/dataset_manager/reader/batch.rb +106 -0
  35. data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb +23 -0
  36. data/lib/easy_ml/data/dataset_manager/reader/file.rb +75 -0
  37. data/lib/easy_ml/data/dataset_manager/reader.rb +58 -0
  38. data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +67 -0
  39. data/lib/easy_ml/data/dataset_manager/writer/base.rb +139 -0
  40. data/lib/easy_ml/data/dataset_manager/writer/named.rb +14 -0
  41. data/lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb +15 -0
  42. data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +150 -0
  43. data/lib/easy_ml/data/dataset_manager/writer.rb +80 -0
  44. data/lib/easy_ml/data/dataset_manager.rb +140 -0
  45. data/lib/easy_ml/data/partition/boundaries.rb +60 -0
  46. data/lib/easy_ml/data/partition.rb +7 -0
  47. data/lib/easy_ml/data/polars_column.rb +19 -5
  48. data/lib/easy_ml/data/synced_directory.rb +1 -2
  49. data/lib/easy_ml/data.rb +2 -0
  50. data/lib/easy_ml/engine.rb +16 -14
  51. data/lib/easy_ml/feature_store.rb +21 -188
  52. data/lib/easy_ml/reasons.rb +41 -0
  53. data/lib/easy_ml/support/lockable.rb +1 -5
  54. data/lib/easy_ml/version.rb +1 -1
  55. data/lib/easy_ml.rb +1 -1
  56. data/public/easy_ml/assets/.vite/manifest.json +1 -1
  57. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Bbf3mD_b.js +522 -0
  58. data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-B1qLZuyu.js.map → Application.tsx-Bbf3mD_b.js.map} +1 -1
  59. metadata +24 -9
  60. data/app/models/easy_ml/datasources/polars_datasource.rb +0 -69
  61. data/lib/easy_ml/data/filter_extensions.rb +0 -31
  62. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-B1qLZuyu.js +0 -522
  63. /data/app/models/{lineage_history.rb → easy_ml/lineage_history.rb} +0 -0
data/app/models/easy_ml/datasource.rb

@@ -22,7 +22,6 @@ module EasyML
     DATASOURCE_OPTIONS = {
       "s3" => "EasyML::Datasources::S3Datasource",
       "file" => "EasyML::Datasources::FileDatasource",
-      "polars" => "EasyML::Datasources::PolarsDatasource",
     }
     DATASOURCE_TYPES = [
       {
@@ -35,11 +34,6 @@ module EasyML
         label: "Local Files",
         description: "Connect to data stored in local files",
       },
-      {
-        value: "polars",
-        label: "Polars DataFrame",
-        description: "In-memory dataframe storage using Polars",
-      },
     ].freeze
     DATASOURCE_NAMES = DATASOURCE_OPTIONS.keys.freeze
     DATASOURCE_CONSTANTS = DATASOURCE_OPTIONS.values.map(&:constantize)
data/app/models/easy_ml/feature.rb

@@ -88,6 +88,7 @@ module EasyML
     before_save :update_sha
     after_find :update_from_feature_class
     before_save :update_from_feature_class
+    before_destroy :wipe

     def feature_klass
       feature_class.constantize
@@ -190,34 +191,23 @@ module EasyML
       reader = dataset.raw

       if adapter.respond_to?(:batch)
-        array = adapter.batch(reader, self)
-        min_id = array.min
-        max_id = array.max
+        series = adapter.batch(reader, self)
+        primary_key = series.name
       else
-        # Get all primary keys
-        begin
-          unless primary_key.present?
-            raise "Couldn't find primary key for feature #{feature_class}, check your feature class"
-          end
-          df = reader.query(select: primary_key)
-        rescue => e
-          raise "Couldn't find primary key #{primary_key.first} for feature #{feature_class}: #{e.message}"
-        end
-        return [] if df.nil?
-
-        min_id = df[primary_key.first].min
-        max_id = df[primary_key.last].max
+        primary_key = self.primary_key
       end

-      (min_id..max_id).step(batch_size).map.with_index do |batch_start, idx|
-        batch_end = [batch_start + batch_size, max_id + 1].min - 1
+      EasyML::Data::Partition::Boundaries.new(
+        reader.data(lazy: true, all_columns: true),
+        primary_key,
+        batch_size
+      ).to_a.map.with_index do |partition, idx|
         {
           feature_id: id,
-          batch_start: batch_start,
-          batch_end: batch_end,
+          batch_start: partition[:partition_start],
+          batch_end: partition[:partition_end],
           batch_number: feature_position,
           subbatch_number: idx,
-          parent_batch_id: Random.uuid,
         }
       end
     end
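Partition::Boundaries is new in this release (data/lib/easy_ml/data/partition/boundaries.rb, not shown in this excerpt). From the call site above it takes a lazy frame, a primary-key column, and a batch size, and yields hashes with :partition_start and :partition_end. A minimal standalone sketch of equivalent boundary math, assuming integer keys; the polars-df gem and the 25-row frame are illustrative only, not the library's internals:

    require "polars-df"

    df = Polars::DataFrame.new({ "id" => (1..25).to_a }).lazy
    primary_key = "id"
    batch_size = 10

    # Find the key range with a single lazy scan
    bounds = df.select([
      Polars.col(primary_key).min.alias("min"),
      Polars.col(primary_key).max.alias("max"),
    ]).collect
    min_id = bounds["min"][0]
    max_id = bounds["max"][0]

    # Step through the range in batch_size chunks, clamping the last partition
    partitions = (min_id..max_id).step(batch_size).map do |start|
      { partition_start: start, partition_end: [start + batch_size - 1, max_id].min }
    end
    # => [{partition_start: 1,  partition_end: 10},
    #     {partition_start: 11, partition_end: 20},
    #     {partition_start: 21, partition_end: 25}]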
@@ -228,9 +218,16 @@ module EasyML

     def fit(features: [self], async: false)
       ordered_features = features.sort_by(&:feature_position)
-      jobs = ordered_features.map(&:build_batches)
+      parent_batch_id = Random.uuid
+      jobs = ordered_features.map do |feature|
+        feature.build_batches.map do |batch_args|
+          batch_args.merge(parent_batch_id: parent_batch_id)
+        end
+      end
       job_count = jobs.dup.flatten.size

+      ordered_features.each(&:wipe)
+
       # This is very important! For whatever reason, Resque BatchJob does not properly
       # handle batch finished callbacks for batch size = 1
       if async && job_count > 1
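The net effect of this hunk plus the build_batches change above: every sub-batch across all features in one fit call now shares a single parent_batch_id, where previously each batch minted its own UUID. A small runnable sketch of the grouping, with SecureRandom.uuid standing in for Random.uuid and batch args trimmed to the relevant keys:

    require "securerandom"

    parent_batch_id = SecureRandom.uuid

    batches_per_feature = [
      [{ feature_id: 1, subbatch_number: 0 }, { feature_id: 1, subbatch_number: 1 }],
      [{ feature_id: 2, subbatch_number: 0 }],
    ]

    jobs = batches_per_feature.map do |batches|
      batches.map { |args| args.merge(parent_batch_id: parent_batch_id) }
    end

    # Every job in this fit call shares one id, so the batch-finished
    # callback can treat the whole fit as a single unit of work.
    jobs.flatten.map { |j| j[:parent_batch_id] }.uniq.size # => 1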
@@ -325,6 +322,7 @@ module EasyML
         params = {
           select: select,
           filter: filter,
+          sort: primary_key,
         }.compact
       else
         params = {}
@@ -438,24 +436,10 @@ module EasyML
     end

     def feature_store
-      @feature_store ||= EasyML::FeatureStore.new(self)
-    end
-
-    def upload_remote_files
-      feature_store.upload_remote_files
-    end
-
-    def files
-      feature_store.list_partitions
-    end
-
-    def query(**kwargs)
-      feature_store.query(**kwargs)
+      EasyML::FeatureStore.new(self)
     end

-    def store(df)
-      feature_store.store(df)
-    end
+    delegate :files, :query, :store, :compact, to: :feature_store

     def batch_size
       read_attribute(:batch_size) ||
@@ -466,6 +450,7 @@ module EasyML
     def after_fit
       update_sha

+      feature_store.compact
       updates = {
        fit_at: Time.current,
        needs_fit: false,
@@ -474,6 +459,10 @@ module EasyML
       update!(updates)
     end

+    def unlock!
+      feature_store.unlock!
+    end
+
     UNCONFIGURABLE_COLUMNS = %w(
       id
       dataset_id
data/app/models/easy_ml/model.rb

@@ -45,7 +45,7 @@ module EasyML
     MODEL_NAMES = MODEL_OPTIONS.keys.freeze
     MODEL_CONSTANTS = MODEL_OPTIONS.values.map(&:constantize)

-    add_configuration_attributes :task, :objective, :hyperparameters, :callbacks, :metrics
+    add_configuration_attributes :task, :objective, :hyperparameters, :callbacks, :metrics, :weights_column
     MODEL_CONSTANTS.flat_map(&:configuration_attributes).each do |attribute|
       add_configuration_attributes attribute
     end
@@ -179,6 +179,8 @@ module EasyML
     end

     def actually_train(&progress_block)
+      raise untrainable_error unless trainable?
+
       lock_model do
         run = pending_run
         run.wrap_training do
@@ -258,7 +260,7 @@ module EasyML

     def formatted_version
       return nil unless version
-      Time.strptime(version, "%Y%m%d%H%M%S").strftime("%B %-d, %Y at %-l:%M %p")
+      UTC.parse(version).in_time_zone(EasyML::Configuration.timezone).strftime("%B %-d, %Y at %-l:%M %p")
     end

     def last_run_at
@@ -277,6 +279,22 @@ module EasyML
     alias_method :latest_version, :inference_version
     alias_method :deployed, :inference_version

+    def trainable?
+      adapter.trainable?
+    end
+
+    def untrainable_columns
+      adapter.untrainable_columns
+    end
+
+    def untrainable_error
+      %Q(
+        Cannot train dataset containing null values!
+        Apply preprocessing to the following columns:
+        #{untrainable_columns.join(", ")}
+      )
+    end
+
     def predict(xs)
       load_model!
       unless xs.is_a?(XGBoost::DMatrix)
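A hedged usage sketch of the new guard. The model lookup and the column names in the sample output are hypothetical; trainable?, untrainable_columns, untrainable_error, and actually_train are the methods added or guarded in this diff:

    model = EasyML::Model.find_by(name: "my_model") # hypothetical lookup

    if model.trainable?
      model.actually_train # also raises untrainable_error itself as a safety net
    else
      puts model.untrainable_error
      # Cannot train dataset containing null values!
      # Apply preprocessing to the following columns:
      # age, income
    end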
data/app/models/easy_ml/models/xgboost/evals_callback.rb

@@ -36,7 +36,7 @@ module EasyML
       if tuner.present?
         [tuner.x_valid, tuner.y_valid]
       else
-        model.dataset.valid(split_ys: true)
+        model.dataset.valid(split_ys: true, lazy: true)
       end
     end

@@ -47,7 +47,8 @@ module EasyML
       if epoch % log_frequency == 0
         model.adapter.external_model = booster
         x_valid, y_valid = valid_dataset
-        @preprocessed ||= model.preprocess(x_valid)
+        x_valid = x_valid.select(model.dataset.col_order(inference: true))
+        @preprocessed ||= model.preprocess(x_valid, y_valid)
         y_pred = model.predict(@preprocessed)
         dataset = model.dataset.valid(all_columns: true)

data/app/models/easy_ml/models/xgboost.rb

@@ -421,11 +421,11 @@ module EasyML
     def prepare_data
       if @d_train.nil?
         col_order = dataset.col_order
-        x_sample, y_sample = dataset.train(split_ys: true, limit: 5, select: col_order)
+        x_sample, y_sample = dataset.train(split_ys: true, limit: 5, select: col_order, lazy: true)
         preprocess(x_sample, y_sample) # Ensure we fail fast if the dataset is misconfigured
-        x_train, y_train = dataset.train(split_ys: true, select: col_order)
-        x_valid, y_valid = dataset.valid(split_ys: true, select: col_order)
-        x_test, y_test = dataset.test(split_ys: true, select: col_order)
+        x_train, y_train = dataset.train(split_ys: true, select: col_order, lazy: true)
+        x_valid, y_valid = dataset.valid(split_ys: true, select: col_order, lazy: true)
+        x_test, y_test = dataset.test(split_ys: true, select: col_order, lazy: true)
         @d_train = preprocess(x_train, y_train)
         @d_valid = preprocess(x_valid, y_valid)
         @d_test = preprocess(x_test, y_test)
@@ -434,21 +434,60 @@ module EasyML
       [@d_train, @d_valid, @d_test]
     end

+    def trainable?
+      untrainable_columns.empty?
+    end
+
+    def untrainable_columns
+      df = model.dataset.processed.data(lazy: true)
+
+      columns = df.columns
+      selects = columns.map do |col|
+        Polars.col(col).null_count.alias(col)
+      end
+      null_info = df.select(selects).collect
+      null_info.to_hashes.first.compact
+      col_list = null_info.to_hashes.first.transform_values { |v| v > 0 ? v : nil }.compact.keys
+
+      model.dataset.regular_columns(col_list)
+    end
+
     def preprocess(xs, ys = nil)
       return xs if xs.is_a?(::XGBoost::DMatrix)
+      weights_col = model.weights_column || nil
+
+      if weights_col == model.dataset.target
+        raise ArgumentError, "Weight column cannot be the target column"
+      end
+
+      # Extract feature columns (all columns except label and weight)
+      feature_cols = xs.columns
+      feature_cols -= [weights_col] if weights_col
+      lazy = xs.is_a?(Polars::LazyFrame)
+
+      # Get features, labels and weights
+      features = lazy ? xs.select(feature_cols).collect.to_numo : xs.select(feature_cols).to_numo
+      weights = weights_col ? (lazy ? xs.select(weights_col).collect.to_numo : xs.select(weights_col).to_numo) : nil
+      weights = weights.flatten if weights
+      if ys.present?
+        ys = ys.is_a?(Array) ? Polars::Series.new(ys) : ys
+        labels = lazy ? ys.collect.to_numo.flatten : ys.to_numo.flatten
+      else
+        labels = nil
+      end
+
+      kwargs = {
+        label: labels,
+        weight: weights,
+      }.compact

-      orig_xs = xs.dup
-      column_names = xs.columns
-      xs = _preprocess(xs)
-      ys = ys.nil? ? nil : _preprocess(ys).flatten
-      kwargs = { label: ys }.compact
      begin
-        ::XGBoost::DMatrix.new(xs, **kwargs).tap do |dmat|
-          dmat.feature_names = column_names
+        ::XGBoost::DMatrix.new(features, **kwargs).tap do |dmatrix|
+          dmatrix.feature_names = feature_cols
         end
       rescue StandardError => e
-        problematic_columns = orig_xs.schema.select { |k, v| [Polars::Categorical, Polars::String].include?(v) }
-        problematic_xs = orig_xs.select(problematic_columns.keys)
+        problematic_columns = xs.schema.select { |k, v| [Polars::Categorical, Polars::String].include?(v) }
+        problematic_xs = lazy ? xs.lazy.select(problematic_columns.keys).collect : xs.select(problematic_columns.keys)
         raise %(
           Error building data for XGBoost.
           Apply preprocessing to columns
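The rewritten preprocess goes straight from Polars to Numo arrays and threads the optional weights_column through as the DMatrix weight. A standalone sketch of those mechanics under assumed data; the column names are hypothetical, while to_numo, the weight: keyword, and the feature_names= setter are the same calls the diff itself uses (polars-df and the xgb gem):

    require "polars-df"
    require "xgb"

    df = Polars::DataFrame.new({
      "x1" => [1.0, 2.0, 3.0],
      "y" => [0, 1, 0],
      "sample_weight" => [1.0, 5.0, 1.0], # upweight the second row
    })

    # Features are everything except the label and the weight column
    feature_cols = df.columns - ["y", "sample_weight"]

    dmatrix = XGBoost::DMatrix.new(
      df.select(feature_cols).to_numo,
      label: df["y"].to_numo.flatten,
      weight: df["sample_weight"].to_numo.flatten,
    )
    dmatrix.feature_names = feature_cols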
@@ -501,29 +540,6 @@ module EasyML
       cb_container.after_iteration(@booster, current_iteration, d_train, evals)
     end

-    def _preprocess(df)
-      return df if df.is_a?(Array)
-
-      df.to_a.map do |row|
-        row.values.map do |value|
-          case value
-          when Time
-            value.to_i # Convert Time to Unix timestamp
-          when Date
-            value.to_time.to_i # Convert Date to Unix timestamp
-          when String
-            value
-          when TrueClass, FalseClass
-            value ? 1.0 : 0.0 # Convert booleans to 1.0 and 0.0
-          when Integer
-            value
-          else
-            value.to_f # Ensure everything else is converted to a float
-          end
-        end
-      end
-    end
-
     def initialize_model
       @xgboost_model = model_class.new(n_estimators: @hyperparameters.to_h.dig(:n_estimators))
       if block_given?
data/app/models/easy_ml/retraining_run.rb

@@ -150,7 +150,7 @@ module EasyML

     training_model.dataset.refresh
     evaluator = retraining_job.evaluator.symbolize_keys
-    x_test, y_test = training_model.dataset.test(split_ys: true)
+    x_test, y_test = training_model.dataset.test(split_ys: true, all_columns: true)
     y_pred = training_model.predict(x_test)

     metric = evaluator[:metric].to_sym
data/app/serializers/easy_ml/dataset_serializer.rb

@@ -84,7 +84,7 @@ module EasyML
     end

     attribute :needs_refresh do |dataset|
-      dataset.needs_refresh?(exclude: [:datasource_needs_refresh])
+      dataset.needs_refresh?(except: [:datasource_needs_refresh])
     end

     attribute :stacktrace do |object|
data/app/serializers/easy_ml/model_serializer.rb

@@ -27,6 +27,7 @@ module EasyML
       :model_type,
       :task,
       :objective,
+      :weights_column,
       :metrics,
       :dataset_id,
       :status,
data/lib/easy_ml/core/tuner.rb

@@ -8,7 +8,7 @@ module EasyML
       :metrics, :objective, :n_trials, :direction, :evaluator,
       :study, :results, :adapter, :tune_started_at, :x_valid, :y_valid,
       :project_name, :job, :current_run, :trial_enumerator, :progress_block,
-      :tuner_job, :dataset
+      :tuner_job, :dataset, :x_normalized

     def initialize(options = {})
       @model = options[:model]
@@ -73,9 +73,12 @@ module EasyML
       model.task = task

       model.dataset.refresh if model.dataset.needs_refresh?
-      x_valid, y_valid = model.dataset.valid(split_ys: true, select: model.dataset.col_order)
+      x_valid, y_valid = model.dataset.valid(split_ys: true, all_columns: true)
+      x_normalized = model.dataset.normalize(x_valid, inference: true)
+      x_normalized = model.preprocess(x_normalized)
       self.x_valid = x_valid
       self.y_valid = y_valid
+      self.x_normalized = x_normalized
       self.dataset = model.dataset.valid(all_columns: true)
       adapter.tune_started_at = tune_started_at
       adapter.x_valid = x_valid
@@ -99,7 +102,7 @@ module EasyML
         @study.tell(@current_trial, result)
       rescue StandardError => e
         puts EasyML::Event.easy_ml_context(e.backtrace)
-        @tuner_run.update!(status: :failed, hyperparameters: {})
+        @tuner_run.update!(status: :failed, hyperparameters: model.hyperparameters.to_h)
         puts "Optuna failed with: #{e.message}"
         raise e
       end
@@ -138,7 +141,7 @@ module EasyML
         end
       end

-      y_pred = model.predict(x_valid)
+      y_pred = model.predict(x_normalized)
       model.metrics = metrics
       metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
       metric = metrics.symbolize_keys.dig(model.evaluator[:metric].to_sym)
data/lib/easy_ml/data/dataset_manager/normalizer.rb (file without changes)
data/lib/easy_ml/data/dataset_manager/reader/base.rb (new file)

@@ -0,0 +1,80 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        class Base
+          DEFAULTS = {
+            drop_cols: [],
+            filter: nil,
+            limit: nil,
+            select: nil,
+            unique: nil,
+            sort: nil,
+            descending: false,
+            batch_size: nil,
+            batch_start: nil,
+            batch_key: nil,
+            lazy: false,
+          }
+
+          DEFAULTS.each do |k, _|
+            attr_accessor k
+          end
+          attr_accessor :block, :options, :input
+          attr_accessor :options
+
+          def initialize(options, &block)
+            options = apply_defaults(options)
+            @block = block
+            @options = options
+          end
+
+          def query
+            raise "Not implemented"
+          end
+
+          private
+
+          def apply_defaults(kwargs)
+            options = kwargs.dup
+
+            DEFAULTS.each do |k, default|
+              unless options.key?(k)
+                options[k] = default
+              end
+            end
+
+            options.each do |k, v|
+              send("#{k}=", v)
+            end
+
+            options
+          end
+
+          def query_dataframes(df, schema)
+            num_rows = df.is_a?(Polars::LazyFrame) ? df.select(Polars.length).collect[0, 0] : df.shape[0]
+            return df if num_rows == 0
+
+            # Apply the predicate filter if given
+            df = df.filter(filter) if filter
+            # Apply select columns if provided
+            df = df.select(select) if select.present?
+            df = df.unique if unique
+
+            # Apply sorting if provided
+            df = df.sort(sort, reverse: descending) if sort
+
+            # Apply drop columns
+            drop_cols = self.drop_cols
+            drop_cols &= schema.keys
+            df = df.drop(drop_cols) unless drop_cols.empty?
+
+            # Collect the DataFrame (execute the lazy operations)
+            df = df.limit(limit) if limit
+            lazy ? df : df.collect
+          end
+        end
+      end
+    end
+  end
+end
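query_dataframes applies the reader options in a fixed order: filter, select, unique, sort, drop_cols, limit, then collect unless lazy. To make that order concrete, here is the equivalent sequence written as plain polars-df calls, with hypothetical data and column names:

    require "polars-df"

    df = Polars::DataFrame.new({
      "id" => [3, 1, 2, 2],
      "amount" => [30.0, 10.0, 20.0, 20.0],
      "tmp" => ["a", "b", "c", "c"],
    }).lazy

    result = df
      .filter(Polars.col("amount") > 5.0)  # filter:
      .select(["id", "amount", "tmp"])     # select:
      .unique                              # unique: true
      .sort("id", reverse: false)          # sort: "id", descending: false
      .drop(["tmp"])                       # drop_cols: ["tmp"]
      .limit(3)                            # limit: 3
      .collect                             # lazy: false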
data/lib/easy_ml/data/dataset_manager/reader/batch.rb (new file)

@@ -0,0 +1,106 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        class Batch < File
+          def query
+            return batch_enumerator unless block.present?
+            return process_batches
+          end
+
+          private
+
+          def batch_enumerator
+            Enumerator.new do |yielder|
+              process_batches do |batch|
+                yielder << batch
+              end
+            end
+          end
+
+          def process_batches(&b)
+            raise "When using batch_size, sort must match primary key (#{batch_key})" if sort.present? && batch_key != sort
+            block = b || self.block
+
+            sort = batch_key
+
+            current_start = get_batch_start
+            final_value = get_final_value
+
+            while current_start < final_value
+              filter = Polars.col(sort) >= current_start
+              batch = query_files(filter: filter, limit: batch_size, lazy: true, sort: sort, descending: descending)
+              block.yield(batch)
+              current_start = File.new(input: input, lazy: true)
+                                  .query
+                                  .filter(filter)
+                                  .sort(sort, reverse: descending)
+                                  .limit(batch_size + 1)
+                                  .sort(sort, reverse: !descending)
+                                  .limit(1)
+                                  .select(sort)
+                                  .collect
+                                  .to_a.first&.dig(sort) || final_value
+            end
+          end
+
+          def query_files(overrides = {})
+            query = options.deep_dup.merge!(overrides).except(:batch_size, :batch_start, :batch_key)
+            File.new(query).query
+          end
+
+          def get_batch_start
+            if batch_start.present?
+              batch_start
+            else
+              get_sorted_batch_keys(descending)
+            end
+          end
+
+          def get_final_value
+            get_sorted_batch_keys(!descending)
+          end
+
+          def get_sorted_batch_keys(descending, filter: nil)
+            query = query_files(lazy: true)
+            query = query.filter(filter) if filter
+            query.sort(batch_key, reverse: descending).limit(1).select(batch_key).collect.to_a.last.dig(batch_key)
+          end
+
+          def batch_key
+            return @batch_key if @batch_key
+
+            lazy_df = lazy_frames([files.first]).first
+            if select
+              # Lazily filter only the selected columns
+              lazy_df = lazy_df.select(select)
+
+              # Lazily compute the unique count for each column and compare with total row count
+              primary_keys = select.select do |col|
+                lazy_df.select(col).unique.collect.height == lazy_df.collect.height
+              end
+            else
+              primary_keys = lazy_df.collect.columns.select do |col|
+                # Lazily count unique values and compare with the total row count
+                lazy_df.select(col).unique.collect.height == lazy_df.collect.height
+              end
+            end
+
+            if primary_keys.count > 1
+              key = primary_keys.detect { |key| key.underscore.split("_").any? { |k| k.match?(/id/) } }
+              if key
+                primary_keys = [key]
+              end
+            end
+
+            if primary_keys.count != 1
+              raise "Unable to determine primary key for dataset"
+            end
+
+            @batch_key = primary_keys.first
+          end
+        end
+      end
+    end
+  end
+end
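Without a block, query returns an Enumerator of lazy batches; with one, each batch is yielded as it is cut. A hedged usage sketch through the File reader, which dispatches to Batch whenever batch_size is set (see file.rb later in this diff); it assumes the easy_ml gem is loaded, and the directory path is hypothetical:

    reader = EasyML::Data::DatasetManager::Reader::File.new(
      input: "datasets/my_dataset/train", # directory of .parquet files
      batch_size: 10_000,                 # triggers the Batch reader
    )

    # Each batch is a Polars::LazyFrame spanning roughly batch_size rows of
    # the inferred primary key; materialize one at a time to bound memory.
    reader.query.each do |batch|
      puts batch.collect.shape.inspect
    end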
data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb (new file)

@@ -0,0 +1,23 @@
+
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        class DataFrame < File
+          def query
+            return query_dataframes(lazy_frames, schema)
+          end
+
+          def schema
+            input.schema
+          end
+
+          private
+          def lazy_frames
+            input.lazy
+          end
+        end
+      end
+    end
+  end
+end
data/lib/easy_ml/data/dataset_manager/reader/file.rb (new file)

@@ -0,0 +1,75 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Reader
+        class File < Base
+          attr_accessor :file_filter
+
+          def initialize(options = {})
+            super
+            @file_filter = options.dig(:file_filter) || ->(file) { true }
+          end
+
+          def query
+            return query_dataframes(dataframe, schema) unless batch_size.present?
+            return Batch.new(options, &block).query
+          end
+
+          def schema
+            @schema ||= files.any? ? Polars.read_parquet_schema(files.first) : nil
+          end
+
+          def files
+            filter_files do
+              if is_file?
+                @files ||= [input]
+              elsif is_dir?
+                @files ||= Dir.glob(::File.join(root_dir, "**/*.{parquet}"))
+              else
+                @files ||= []
+              end
+            end
+          end
+
+          private
+
+          def filter_files(&block)
+            yield
+            @files = @files.select(&file_filter)
+          end
+
+          def is_dir?
+            path.directory?
+          end
+
+          def is_file?
+            path.file?
+          end
+
+          def root_dir
+            path if is_dir?
+          end
+
+          def path
+            @path ||= input.is_a?(Pathname) ? input : Pathname.new(input)
+          end
+
+          def dataframe
+            @dataframe = lazy_frames.any? ? Polars.concat(lazy_frames) : Polars::LazyFrame.new
+          end
+
+          def lazy_frames(files = nil)
+            return @lazy_frames if @lazy_frames
+
+            files ||= self.files
+            @lazy_frames = files.map do |file|
+              Polars.scan_parquet(file)
+            end
+          end
+        end
+      end
+    end
+  end
+end
+
+require_relative "batch"
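One knob worth noting in the File reader is file_filter: it prunes the partition file list before any parquet file is scanned. A hedged sketch; the directory layout and date-prefixed file naming are hypothetical:

    reader = EasyML::Data::DatasetManager::Reader::File.new(
      input: "datasets/my_dataset/train",
      file_filter: ->(file) { ::File.basename(file).start_with?("2024") },
    )

    reader.files # => only the matching .parquet files are considered
    reader.query # => eager Polars::DataFrame built from just those files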