easy_ml 0.2.0.pre.rc57 → 0.2.0.pre.rc60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/apis_controller.rb +8 -0
  3. data/app/controllers/easy_ml/application_controller.rb +4 -0
  4. data/app/controllers/easy_ml/datasets_controller.rb +32 -1
  5. data/app/controllers/easy_ml/models_controller.rb +3 -0
  6. data/app/controllers/easy_ml/predictions_controller.rb +10 -5
  7. data/app/frontend/components/DatasetPreview.tsx +50 -19
  8. data/app/frontend/components/ModelForm.tsx +1 -1
  9. data/app/frontend/components/SearchableSelect.tsx +0 -1
  10. data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
  11. data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
  12. data/app/frontend/components/dataset/ColumnList.tsx +14 -2
  13. data/app/frontend/components/dataset/PreprocessingConfig.tsx +82 -21
  14. data/app/frontend/pages/DatasourcesPage.tsx +0 -2
  15. data/app/frontend/types/dataset.ts +3 -0
  16. data/app/jobs/easy_ml/compute_feature_job.rb +0 -2
  17. data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
  18. data/app/models/easy_ml/column/imputers/base.rb +89 -0
  19. data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
  20. data/app/models/easy_ml/column/imputers/clip.rb +30 -0
  21. data/app/models/easy_ml/column/imputers/constant.rb +27 -0
  22. data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
  23. data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
  24. data/app/models/easy_ml/column/imputers/mean.rb +27 -0
  25. data/app/models/easy_ml/column/imputers/median.rb +27 -0
  26. data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
  27. data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
  28. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
  29. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
  30. data/app/models/easy_ml/column/imputers/today.rb +20 -0
  31. data/app/models/easy_ml/column/imputers.rb +126 -0
  32. data/app/models/easy_ml/column/learner.rb +18 -0
  33. data/app/models/easy_ml/column/learners/base.rb +103 -0
  34. data/app/models/easy_ml/column/learners/boolean.rb +11 -0
  35. data/app/models/easy_ml/column/learners/categorical.rb +51 -0
  36. data/app/models/easy_ml/column/learners/datetime.rb +19 -0
  37. data/app/models/easy_ml/column/learners/null.rb +22 -0
  38. data/app/models/easy_ml/column/learners/numeric.rb +33 -0
  39. data/app/models/easy_ml/column/learners/string.rb +15 -0
  40. data/app/models/easy_ml/column/lineage/base.rb +22 -0
  41. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
  42. data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
  43. data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
  44. data/app/models/easy_ml/column/lineage.rb +28 -0
  45. data/app/models/easy_ml/column/selector.rb +96 -0
  46. data/app/models/easy_ml/column.rb +344 -39
  47. data/app/models/easy_ml/column_history.rb +31 -20
  48. data/app/models/easy_ml/column_list.rb +79 -62
  49. data/app/models/easy_ml/dataset.rb +156 -104
  50. data/app/models/easy_ml/dataset_history.rb +23 -23
  51. data/app/models/easy_ml/datasource.rb +4 -0
  52. data/app/models/easy_ml/datasource_history.rb +1 -0
  53. data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
  54. data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
  55. data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
  56. data/app/models/easy_ml/feature.rb +29 -10
  57. data/app/models/easy_ml/feature_history.rb +12 -0
  58. data/app/models/easy_ml/feature_list.rb +15 -0
  59. data/app/models/easy_ml/model.rb +25 -4
  60. data/app/models/easy_ml/model_history.rb +1 -0
  61. data/app/models/easy_ml/retraining_run.rb +1 -0
  62. data/app/serializers/easy_ml/column_serializer.rb +11 -1
  63. data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
  64. data/config/initializers/enumerable.rb +17 -0
  65. data/config/initializers/inflections.rb +2 -0
  66. data/config/routes.rb +3 -0
  67. data/lib/easy_ml/core/tuner.rb +1 -1
  68. data/lib/easy_ml/data/date_converter.rb +137 -30
  69. data/lib/easy_ml/data/polars_column.rb +17 -0
  70. data/lib/easy_ml/data/polars_in_memory.rb +30 -0
  71. data/lib/easy_ml/data/polars_reader.rb +20 -1
  72. data/lib/easy_ml/data/splits/in_memory_split.rb +7 -5
  73. data/lib/easy_ml/data/splits/split.rb +2 -1
  74. data/lib/easy_ml/data/synced_directory.rb +5 -3
  75. data/lib/easy_ml/data.rb +1 -2
  76. data/lib/easy_ml/feature_store.rb +33 -22
  77. data/lib/easy_ml/predict.rb +13 -2
  78. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +7 -0
  79. data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +18 -0
  80. data/lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt +6 -0
  81. data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
  82. data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
  83. data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
  84. data/lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt +20 -0
  85. data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
  86. data/lib/easy_ml/version.rb +1 -1
  87. data/lib/tasks/profile.rake +40 -0
  88. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  89. data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
  90. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
  91. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
  92. metadata +45 -10
  93. data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
  94. data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
  95. data/lib/easy_ml/data/preprocessor.rb +0 -383
  96. data/lib/easy_ml/data/simple_imputer.rb +0 -255
  97. data/lib/easy_ml/data/statistics_learner.rb +0 -128
  98. data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
  99. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DTZ2348z.js +0 -474
  100. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DTZ2348z.js.map +0 -1
@@ -1,24 +1,24 @@
1
- # == Schetuma Information
1
+ # == Schema Information
2
2
  #
3
3
  # Table name: easy_ml_datasets
4
4
  #
5
- # id :bigint not null, primary key
6
- # name :string not null
7
- # description :string
8
- # dataset_type :string
9
- # status :string
10
- # version :string
11
- # datasource_id :bigint
12
- # root_dir :string
13
- # configuration :json
14
- # num_rows :bigint
15
- # workflow_status :string
16
- # statistics :json
17
- # preprocessor_statistics :json
18
- # schema :json
19
- # refreshed_at :datetime
20
- # created_at :datetime not null
21
- # updated_at :datetime not null
5
+ # id :bigint not null, primary key
6
+ # name :string not null
7
+ # description :string
8
+ # dataset_type :string
9
+ # status :string
10
+ # version :string
11
+ # datasource_id :bigint
12
+ # root_dir :string
13
+ # configuration :json
14
+ # num_rows :bigint
15
+ # workflow_status :string
16
+ # statistics :json
17
+ # schema :json
18
+ # refreshed_at :datetime
19
+ # created_at :datetime not null
20
+ # updated_at :datetime not null
21
+ # last_datasource_sha :string
22
22
  #
23
23
  module EasyML
24
24
  class Dataset < ActiveRecord::Base
@@ -45,7 +45,7 @@ module EasyML
45
45
  has_many :columns, class_name: "EasyML::Column", dependent: :destroy, extend: EasyML::ColumnList
46
46
  accepts_nested_attributes_for :columns, allow_destroy: true, update_only: true
47
47
 
48
- has_many :features, dependent: :destroy, class_name: "EasyML::Feature"
48
+ has_many :features, dependent: :destroy, class_name: "EasyML::Feature", extend: EasyML::FeatureList
49
49
  accepts_nested_attributes_for :features, allow_destroy: true
50
50
 
51
51
  has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy
@@ -80,7 +80,7 @@ module EasyML
80
80
  column_types: EasyML::Data::PolarsColumn::TYPE_MAP.keys.map do |type|
81
81
  { value: type.to_s, label: type.to_s.titleize }
82
82
  end,
83
- preprocessing_strategies: EasyML::Data::Preprocessor.constants[:preprocessing_strategies],
83
+ preprocessing_strategies: EasyML::Column::Imputers.constants[:preprocessing_strategies],
84
84
  feature_options: EasyML::Features::Registry.list_flat,
85
85
  splitter_constants: EasyML::Splitter.constants,
86
86
  }
@@ -119,13 +119,6 @@ module EasyML
119
119
  processed.data(limit: 1)&.schema || raw.data(limit: 1)&.schema
120
120
  end
121
121
 
122
- def refresh_datatypes
123
- return unless columns_need_refresh?
124
-
125
- cleanup
126
- datasource.reread(columns)
127
- end
128
-
129
122
  def num_rows
130
123
  if datasource&.num_rows.nil?
131
124
  datasource.after_sync
@@ -140,6 +133,12 @@ module EasyML
140
133
  EasyML::RefreshDatasetJob.perform_later(id)
141
134
  end
142
135
 
136
+ def best_segment
137
+ [processed, raw].detect do |segment|
138
+ segment.send(:data, all_columns: true, limit: 1)&.columns
139
+ end
140
+ end
141
+
143
142
  def raw
144
143
  return @raw if @raw && @raw.dataset
145
144
 
@@ -162,22 +161,36 @@ module EasyML
162
161
  save
163
162
  end
164
163
 
164
+ def refreshed_datasource?
165
+ last_datasource_sha_changed?
166
+ end
167
+
168
+ def prepare_features
169
+ features.update_all(workflow_status: "ready")
170
+ end
171
+
165
172
  def prepare!
173
+ prepare_features
166
174
  cleanup
167
175
  refresh_datasource!
168
176
  split_data
177
+ process_data
169
178
  end
170
179
 
171
180
  def prepare
181
+ prepare_features
172
182
  refresh_datasource
173
183
  split_data
184
+ process_data
174
185
  end
175
186
 
176
187
  def actually_refresh
177
188
  refreshing do
189
+ learn(delete: false) # After syncing datasource, learn new statistics + sync columns
178
190
  process_data
179
191
  fully_reload
180
192
  learn
193
+ learn_statistics(type: :processed) # After processing data, we learn any new statistics
181
194
  now = UTC.now
182
195
  update(workflow_status: "ready", refreshed_at: now, updated_at: now)
183
196
  fully_reload
@@ -245,19 +258,57 @@ module EasyML
245
258
  features_need_fit.any?
246
259
  end
247
260
 
248
- def refresh_reasons
261
+ # Some of these are expensive to calculate, so we only want to include
262
+ # them in the refresh reasons if they are actually needed.
263
+ #
264
+ # During dataset_serializer for instance, we don't want to check s3,
265
+ # we only do that during background jobs.
266
+ #
267
+ # So yes this is an annoying way to structure a method, but it's helpful for performance
268
+ #
269
+ def refresh_reasons(exclude: [])
249
270
  {
250
- "Not split" => not_split?,
251
- "Refreshed at is nil" => refreshed_at.nil?,
252
- "Columns need refresh" => columns_need_refresh?,
253
- "Features need refresh" => features_need_fit?,
254
- "Datasource needs refresh" => datasource_needs_refresh?,
255
- "Datasource was refreshed" => datasource_was_refreshed?,
256
- }.select { |k, v| v }.map { |k, v| k }
271
+ not_split: {
272
+ name: "Not split",
273
+ check: -> { not_split? },
274
+ },
275
+ refreshed_at_is_nil: {
276
+ name: "Refreshed at is nil",
277
+ check: -> { refreshed_at.nil? },
278
+ },
279
+ columns_need_refresh: {
280
+ name: "Columns need refresh",
281
+ check: -> { columns_need_refresh? },
282
+ },
283
+ features_need_fit: {
284
+ name: "Features need fit",
285
+ check: -> { features_need_fit? },
286
+ },
287
+ datasource_needs_refresh: {
288
+ name: "Datasource needs refresh",
289
+ check: -> { datasource_needs_refresh? },
290
+ },
291
+ refreshed_datasource: {
292
+ name: "Refreshed datasource",
293
+ check: -> { refreshed_datasource? },
294
+ },
295
+ datasource_was_refreshed: {
296
+ name: "Datasource was refreshed",
297
+ check: -> { datasource_was_refreshed? },
298
+ },
299
+ }.except(*exclude).select do |k, config|
300
+ config[:check].call
301
+ end.map do |k, config|
302
+ config[:name]
303
+ end
257
304
  end
258
305
 
259
- def needs_refresh?
260
- refresh_reasons.any?
306
+ def needs_refresh?(exclude: [])
307
+ refresh_reasons(exclude: exclude).any?
308
+ end
309
+
310
+ def processed?
311
+ !needs_refresh?
261
312
  end
262
313
 
263
314
  def not_split?
@@ -274,7 +325,6 @@ module EasyML
274
325
 
275
326
  def learn(delete: true)
276
327
  learn_schema
277
- learn_statistics
278
328
  columns.sync(delete: delete)
279
329
  end
280
330
 
@@ -326,6 +376,8 @@ module EasyML
326
376
 
327
377
  def learn_schema
328
378
  data = processed.data(limit: 1).to_a.any? ? processed.data : raw.data
379
+ return nil if data.nil?
380
+
329
381
  schema = data.schema.reduce({}) do |h, (k, v)|
330
382
  h.tap do
331
383
  h[k] = EasyML::Data::PolarsColumn.polars_to_sym(v)
@@ -334,23 +386,23 @@ module EasyML
334
386
  write_attribute(:schema, schema)
335
387
  end
336
388
 
337
- def learn_statistics
338
- stats = {
339
- raw: EasyML::Data::StatisticsLearner.learn(raw, self),
340
- }
341
- stats.merge!(processed: EasyML::Data::StatisticsLearner.learn(processed, self)) if processed.data.present?
389
+ def learn_statistics(type: :raw, computed: false)
390
+ columns.learn(type: type, computed: computed)
391
+ update(
392
+ statistics: columns.reload.statistics,
393
+ )
394
+ end
342
395
 
343
- update(statistics: stats)
396
+ def statistics
397
+ (read_attribute(:statistics) || {}).with_indifferent_access
344
398
  end
345
399
 
346
400
  def process_data
347
- split_data
348
401
  fit
349
402
  normalize_all
350
- # alert_nulls
351
403
  end
352
404
 
353
- def needs_learn?(df)
405
+ def needs_learn?
354
406
  return true if columns_need_refresh?
355
407
 
356
408
  never_learned = columns.none?
@@ -359,6 +411,7 @@ module EasyML
359
411
  new_features = features.any? { |f| f.updated_at > columns.maximum(:updated_at) }
360
412
  return true if new_features
361
413
 
414
+ df = raw.query(limit: 1)
362
415
  new_cols = df.present? ? (df.columns - columns.map(&:name)) : []
363
416
  new_cols = columns.syncable
364
417
 
@@ -390,22 +443,23 @@ module EasyML
390
443
  { differing_columns: differing_columns, differences: differences }
391
444
  end
392
445
 
393
- def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features, idx: nil)
394
- df = apply_features(df, features)
395
- df = drop_nulls(df)
396
- df = apply_missing_features(df, inference: inference)
397
- df = preprocessor.postprocess(df, inference: inference)
446
+ def validate_input(df)
447
+ fields = missing_required_fields(df)
448
+ return fields.empty?, fields
449
+ end
398
450
 
399
- # Learn will update columns, so if any features have been added
400
- # since the last time columns were learned, we should re-learn the schema
401
- learn(delete: false) if idx == 1 && needs_learn?(df)
451
+ def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
452
+ df = apply_missing_features(df, inference: inference)
453
+ df = drop_nulls(df)
454
+ df = columns.transform(df, inference: inference)
455
+ df = apply_features(df, features)
456
+ df = columns.transform(df, inference: inference, computed: true)
402
457
  df = apply_column_mask(df, inference: inference) unless all_columns
403
- raise_on_nulls(df) if inference
404
458
  df, = processed.split_features_targets(df, true, target) if split_ys
405
459
  df
406
460
  end
407
461
 
408
- def raise_on_nulls(df)
462
+ def missing_required_fields(df)
409
463
  desc_df = df.describe
410
464
 
411
465
  # Get the 'null_count' row
@@ -416,8 +470,10 @@ module EasyML
416
470
  null_count_row[col][0].to_i > 0
417
471
  end
418
472
 
419
- if columns_with_nulls.any?
420
- raise "Null values found in columns: #{columns_with_nulls.join(", ")}"
473
+ # This is a history class, because this only occurs on prediction
474
+ required_columns = columns.current.required.map(&:name)
475
+ required_columns.select do |col|
476
+ columns_with_nulls.include?(col) || df.columns.map(&:to_s).exclude?(col.to_s)
421
477
  end
422
478
  end
423
479
 
@@ -478,16 +534,15 @@ module EasyML
478
534
  result.empty? ? nil : result
479
535
  end
480
536
 
481
- def processed?
482
- !should_split?
483
- end
484
-
485
537
  def decode_labels(ys, col: nil)
486
- preprocessor.decode_labels(ys, col: col.nil? ? target : col)
538
+ if col.nil?
539
+ col = target
540
+ end
541
+ preloaded_columns.find_by(name: col).decode_labels(ys)
487
542
  end
488
543
 
489
544
  def preprocessing_steps
490
- return if columns.nil? || (columns.respond_to?(:empty?) && columns.empty?)
545
+ return {} if preloaded_columns.nil? || (preloaded_columns.respond_to?(:empty?) && preloaded_columns.empty?)
491
546
  return @preprocessing_steps if @preprocessing_steps.present?
492
547
 
493
548
  training = standardize_preprocessing_steps(:training)
@@ -499,23 +554,16 @@ module EasyML
499
554
  }.compact.deep_symbolize_keys
500
555
  end
501
556
 
502
- def preprocessor
503
- @preprocessor ||= initialize_preprocessor
504
- return @preprocessor if @preprocessor.preprocessing_steps == preprocessing_steps
505
-
506
- @preprocessor = initialize_preprocessor
507
- end
508
-
509
557
  def target
510
558
  @target ||= preloaded_columns.find(&:is_target)&.name
511
559
  end
512
560
 
513
561
  def date_column
514
- @date_column ||= columns.date_column.first
562
+ @date_column ||= preloaded_columns.find(&:is_date_column?)
515
563
  end
516
564
 
517
565
  def drop_cols
518
- @drop_cols ||= preloaded_columns.select(&:hidden).flat_map(&:columns)
566
+ @drop_cols ||= preloaded_columns.select(&:hidden).flat_map(&:aliases)
519
567
  end
520
568
 
521
569
  def drop_if_null
@@ -552,10 +600,14 @@ module EasyML
552
600
  df[column_mask(df, inference: inference)]
553
601
  end
554
602
 
555
- def apply_missing_features(df, inference: false)
603
+ def apply_missing_features(df, inference: false, include_one_hots: false)
556
604
  return df unless inference
557
605
 
558
606
  missing_features = (col_order(inference: inference) - df.columns).compact
607
+ unless include_one_hots
608
+ missing_features -= columns.one_hots.flat_map(&:virtual_columns) unless include_one_hots
609
+ missing_features += columns.one_hots.map(&:name) - df.columns
610
+ end
559
611
  df.with_columns(missing_features.map { |f| Polars.lit(nil).alias(f) })
560
612
  end
561
613
 
@@ -576,7 +628,7 @@ module EasyML
576
628
  end
577
629
 
578
630
  def upload_remote_files
579
- return unless processed?
631
+ return if !needs_refresh?
580
632
 
581
633
  processed.upload.tap do
582
634
  features.each(&:upload_remote_files)
@@ -648,27 +700,42 @@ module EasyML
648
700
 
649
701
  def refresh_datasource
650
702
  datasource.reload.refresh
651
- refresh_datatypes
652
- initialize_splits
703
+ after_refresh_datasource
653
704
  end
654
705
 
655
706
  def refresh_datasource!
656
707
  datasource.reload.refresh!
657
- refresh_datatypes
708
+ after_refresh_datasource
709
+ end
710
+
711
+ def after_refresh_datasource
712
+ update(last_datasource_sha: datasource.sha)
658
713
  initialize_splits
659
714
  end
660
715
 
661
716
  def normalize_all
662
717
  processed.cleanup
663
718
 
664
- SPLIT_ORDER.each_with_index do |segment, idx|
719
+ SPLIT_ORDER.each do |segment|
665
720
  df = raw.read(segment)
666
- processed_df = normalize(df, all_columns: true, idx: idx)
721
+ learn_computed_columns(df) if segment == :train
722
+ processed_df = normalize(df, all_columns: true)
667
723
  processed.save(segment, processed_df)
668
724
  end
669
725
  @normalized = true
670
726
  end
671
727
 
728
+ def learn_computed_columns(df)
729
+ return unless features.ready_to_apply.any?
730
+
731
+ df = df.clone
732
+ df = apply_features(df)
733
+ processed.save(:train, df)
734
+ learn(delete: false)
735
+ learn_statistics(type: :processed, computed: true)
736
+ processed.cleanup
737
+ end
738
+
672
739
  def drop_nulls(df)
673
740
  return df if drop_if_null.nil? || drop_if_null.empty?
674
741
 
@@ -678,8 +745,12 @@ module EasyML
678
745
  df.drop_nulls(subset: drop)
679
746
  end
680
747
 
748
+ # Pass refresh: false for frontend views so we don't query S3 during web requests
681
749
  def load_data(segment, **kwargs, &block)
682
- if processed?
750
+ needs_refresh = kwargs.key?(:refresh) ? kwargs[:refresh] : needs_refresh?
751
+ kwargs.delete(:refresh)
752
+
753
+ if !needs_refresh
683
754
  processed.load_data(segment, **kwargs, &block)
684
755
  else
685
756
  raw.load_data(segment, **kwargs, &block)
@@ -687,8 +758,7 @@ module EasyML
687
758
  end
688
759
 
689
760
  def fit
690
- preprocessor.fit(raw.train(all_columns: true))
691
- self.preprocessor_statistics = preprocessor.statistics
761
+ learn_statistics(type: :raw)
692
762
  end
693
763
 
694
764
  # log_method :fit, "Learning statistics", verbose: true
@@ -698,10 +768,9 @@ module EasyML
698
768
  end
699
769
 
700
770
  def split_data(force: false)
701
- return unless force || should_split?
771
+ return unless force || needs_refresh?
702
772
 
703
773
  cleanup
704
- features = self.features.ordered.load
705
774
  splitter.split(datasource) do |train_df, valid_df, test_df|
706
775
  [:train, :valid, :test].zip([train_df, valid_df, test_df]).each do |segment, df|
707
776
  raw.save(segment, df)
@@ -709,10 +778,6 @@ module EasyML
709
778
  end
710
779
  end
711
780
 
712
- def should_split?
713
- needs_refresh?
714
- end
715
-
716
781
  def filter_duplicate_features
717
782
  return unless attributes["features_attributes"].present?
718
783
 
@@ -733,6 +798,7 @@ module EasyML
733
798
  end
734
799
 
735
800
  def apply_features(df, features = self.features)
801
+ features = features.ready_to_apply
736
802
  if features.nil? || features.empty?
737
803
  df
738
804
  else
@@ -754,10 +820,6 @@ module EasyML
754
820
 
755
821
  result = feature.transform_batch(acc_df)
756
822
 
757
- unless result.is_a?(Polars::DataFrame)
758
- raise "Feature '#{feature.name}' must return a Polars::DataFrame, got #{result.class}"
759
- end
760
-
761
823
  result
762
824
  end
763
825
  end
@@ -769,16 +831,6 @@ module EasyML
769
831
  end).to_h.compact.reject { |_k, v| v["method"] == "none" }
770
832
  end
771
833
 
772
- def initialize_preprocessor
773
- EasyML::Data::Preprocessor.new(
774
- directory: Pathname.new(root_dir).append("preprocessor"),
775
- preprocessing_steps: preprocessing_steps,
776
- dataset: self,
777
- ).tap do |preprocessor|
778
- preprocessor.statistics = preprocessor_statistics
779
- end
780
- end
781
-
782
834
  def fully_reload
783
835
  return unless persisted?
784
836
 
@@ -2,28 +2,28 @@
2
2
  #
3
3
  # Table name: easy_ml_dataset_histories
4
4
  #
5
- # id :bigint not null, primary key
6
- # dataset_id :integer not null
7
- # name :string not null
8
- # description :string
9
- # dataset_type :string
10
- # status :string
11
- # version :string
12
- # datasource_id :integer
13
- # root_dir :string
14
- # configuration :json
15
- # num_rows :integer
16
- # workflow_status :string
17
- # statistics :json
18
- # preprocessor_statistics :json
19
- # schema :json
20
- # refreshed_at :datetime
21
- # created_at :datetime not null
22
- # updated_at :datetime not null
23
- # history_started_at :datetime not null
24
- # history_ended_at :datetime
25
- # history_user_id :integer
26
- # snapshot_id :string
5
+ # id :bigint not null, primary key
6
+ # dataset_id :integer not null
7
+ # name :string not null
8
+ # description :string
9
+ # dataset_type :string
10
+ # status :string
11
+ # version :string
12
+ # datasource_id :integer
13
+ # root_dir :string
14
+ # configuration :json
15
+ # num_rows :integer
16
+ # workflow_status :string
17
+ # statistics :json
18
+ # schema :json
19
+ # refreshed_at :datetime
20
+ # created_at :datetime not null
21
+ # updated_at :datetime not null
22
+ # history_started_at :datetime not null
23
+ # history_ended_at :datetime
24
+ # history_user_id :integer
25
+ # snapshot_id :string
26
+ # last_datasource_sha :string
27
27
  #
28
28
  module EasyML
29
29
  class DatasetHistory < ActiveRecord::Base
@@ -44,7 +44,7 @@ module EasyML
44
44
  true
45
45
  end
46
46
 
47
- def should_split?
47
+ def needs_refresh?
48
48
  false
49
49
  end
50
50
  end
@@ -10,6 +10,7 @@
10
10
  # refreshed_at :datetime
11
11
  # created_at :datetime not null
12
12
  # updated_at :datetime not null
13
+ # sha :string
13
14
  #
14
15
  module EasyML
15
16
  class Datasource < ActiveRecord::Base
@@ -55,6 +56,7 @@ module EasyML
55
56
 
56
57
  has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy
57
58
  attr_accessor :schema, :columns, :num_rows, :is_syncing
59
+ belongs_to :dataset, class_name: "EasyML::Dataset", optional: true, dependent: :destroy
58
60
 
59
61
  add_configuration_attributes :schema, :columns, :num_rows, :polars_args, :verbose, :is_syncing
60
62
  DATASOURCE_CONSTANTS.flat_map(&:configuration_attributes).each do |attribute|
@@ -118,11 +120,13 @@ module EasyML
118
120
  self.num_rows = data.shape[0]
119
121
  self.is_syncing = false
120
122
  self.refreshed_at = Time.now
123
+ self.sha = adapter.sha
121
124
  save
122
125
  end
123
126
 
124
127
  def refresh
125
128
  unless adapter.needs_refresh?
129
+ update(sha: adapter.sha) if sha.nil?
126
130
  update!(is_syncing: false)
127
131
  return
128
132
  end
@@ -15,6 +15,7 @@
15
15
  # history_ended_at :datetime
16
16
  # history_user_id :integer
17
17
  # snapshot_id :string
18
+ # sha :string
18
19
  #
19
20
  module EasyML
20
21
  class DatasourceHistory < ActiveRecord::Base
@@ -1,7 +1,7 @@
1
1
  module EasyML
2
2
  module Datasources
3
3
  class FileDatasource < BaseDatasource
4
- delegate :query, :convert_to_parquet, to: :reader
4
+ delegate :query, :convert_to_parquet, :sha, to: :reader
5
5
 
6
6
  def after_sync
7
7
  reader.normalize
@@ -6,18 +6,8 @@ module EasyML
6
6
  validates :df, presence: true
7
7
  add_configuration_attributes :df
8
8
 
9
- def query(drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false)
10
- return if df.nil?
11
-
12
- df = self.df.clone
13
- df = df.filter(filter) if filter
14
- df = df.select(select) if select.present?
15
- df = df.unique if unique
16
- drop_cols &= df.columns
17
- df = df.drop(drop_cols) unless drop_cols.empty?
18
- df = df.sort(sort, reverse: descending) if sort
19
- df = df.limit(limit) if limit
20
- df
9
+ def query(**kwargs)
10
+ EasyML::Data::PolarsInMemory.query(df, **kwargs)
21
11
  end
22
12
 
23
13
  def in_batches(of: 10_000)
@@ -40,6 +30,10 @@ module EasyML
40
30
  datasource.updated_at
41
31
  end
42
32
 
33
+ def sha
34
+ nil
35
+ end
36
+
43
37
  def data
44
38
  df
45
39
  end
@@ -17,7 +17,7 @@ module EasyML
17
17
  add_configuration_attributes :s3_bucket, :s3_prefix, :s3_region, :cache_for
18
18
 
19
19
  delegate :query, :data, :s3_access_key_id, :s3_secret_access_key, :before_sync, :after_sync, :clean,
20
- to: :synced_directory
20
+ :sha, to: :synced_directory
21
21
 
22
22
  def in_batches(&block)
23
23
  synced_directory.in_batches(&block)