easy_ml 0.2.0.pre.rc58 → 0.2.0.pre.rc61

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/application_controller.rb +4 -0
  3. data/app/controllers/easy_ml/datasets_controller.rb +32 -1
  4. data/app/frontend/components/DatasetPreview.tsx +50 -19
  5. data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
  6. data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
  7. data/app/frontend/components/dataset/ColumnList.tsx +14 -2
  8. data/app/frontend/components/dataset/PreprocessingConfig.tsx +81 -20
  9. data/app/frontend/types/dataset.ts +3 -0
  10. data/app/jobs/easy_ml/compute_feature_job.rb +0 -3
  11. data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
  12. data/app/models/easy_ml/column/imputers/base.rb +89 -0
  13. data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
  14. data/app/models/easy_ml/column/imputers/clip.rb +30 -0
  15. data/app/models/easy_ml/column/imputers/constant.rb +27 -0
  16. data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
  17. data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
  18. data/app/models/easy_ml/column/imputers/mean.rb +27 -0
  19. data/app/models/easy_ml/column/imputers/median.rb +27 -0
  20. data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
  21. data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
  22. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
  23. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
  24. data/app/models/easy_ml/column/imputers/today.rb +20 -0
  25. data/app/models/easy_ml/column/imputers.rb +126 -0
  26. data/app/models/easy_ml/column/learner.rb +18 -0
  27. data/app/models/easy_ml/column/learners/base.rb +103 -0
  28. data/app/models/easy_ml/column/learners/boolean.rb +11 -0
  29. data/app/models/easy_ml/column/learners/categorical.rb +51 -0
  30. data/app/models/easy_ml/column/learners/datetime.rb +19 -0
  31. data/app/models/easy_ml/column/learners/null.rb +22 -0
  32. data/app/models/easy_ml/column/learners/numeric.rb +33 -0
  33. data/app/models/easy_ml/column/learners/string.rb +15 -0
  34. data/app/models/easy_ml/column/lineage/base.rb +22 -0
  35. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
  36. data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
  37. data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
  38. data/app/models/easy_ml/column/lineage.rb +28 -0
  39. data/app/models/easy_ml/column/selector.rb +96 -0
  40. data/app/models/easy_ml/column.rb +319 -52
  41. data/app/models/easy_ml/column_history.rb +29 -22
  42. data/app/models/easy_ml/column_list.rb +63 -78
  43. data/app/models/easy_ml/dataset.rb +128 -96
  44. data/app/models/easy_ml/dataset_history.rb +23 -23
  45. data/app/models/easy_ml/datasource.rb +3 -0
  46. data/app/models/easy_ml/datasource_history.rb +1 -0
  47. data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
  48. data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
  49. data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
  50. data/app/models/easy_ml/feature.rb +19 -7
  51. data/app/models/easy_ml/feature_history.rb +12 -0
  52. data/app/models/easy_ml/feature_list.rb +15 -0
  53. data/app/serializers/easy_ml/column_serializer.rb +11 -1
  54. data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
  55. data/config/initializers/enumerable.rb +17 -0
  56. data/lib/easy_ml/data/date_converter.rb +137 -30
  57. data/lib/easy_ml/data/polars_column.rb +17 -0
  58. data/lib/easy_ml/data/polars_in_memory.rb +30 -0
  59. data/lib/easy_ml/data/polars_reader.rb +20 -1
  60. data/lib/easy_ml/data/splits/in_memory_split.rb +3 -5
  61. data/lib/easy_ml/data/splits/split.rb +2 -1
  62. data/lib/easy_ml/data/synced_directory.rb +1 -1
  63. data/lib/easy_ml/data.rb +1 -2
  64. data/lib/easy_ml/engine.rb +1 -0
  65. data/lib/easy_ml/feature_store.rb +33 -22
  66. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +4 -0
  67. data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +4 -0
  68. data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
  69. data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
  70. data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
  71. data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
  72. data/lib/easy_ml/version.rb +1 -1
  73. data/lib/tasks/profile.rake +40 -0
  74. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  75. data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
  76. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
  77. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
  78. metadata +41 -10
  79. data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
  80. data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
  81. data/lib/easy_ml/data/preprocessor.rb +0 -340
  82. data/lib/easy_ml/data/simple_imputer.rb +0 -255
  83. data/lib/easy_ml/data/statistics_learner.rb +0 -193
  84. data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
  85. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js +0 -474
  86. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map +0 -1
@@ -1,24 +1,24 @@
1
- # == Schetuma Information
1
+ # == Schema Information
2
2
  #
3
3
  # Table name: easy_ml_datasets
4
4
  #
5
- # id :bigint not null, primary key
6
- # name :string not null
7
- # description :string
8
- # dataset_type :string
9
- # status :string
10
- # version :string
11
- # datasource_id :bigint
12
- # root_dir :string
13
- # configuration :json
14
- # num_rows :bigint
15
- # workflow_status :string
16
- # statistics :json
17
- # preprocessor_statistics :json
18
- # schema :json
19
- # refreshed_at :datetime
20
- # created_at :datetime not null
21
- # updated_at :datetime not null
5
+ # id :bigint not null, primary key
6
+ # name :string not null
7
+ # description :string
8
+ # dataset_type :string
9
+ # status :string
10
+ # version :string
11
+ # datasource_id :bigint
12
+ # root_dir :string
13
+ # configuration :json
14
+ # num_rows :bigint
15
+ # workflow_status :string
16
+ # statistics :json
17
+ # schema :json
18
+ # refreshed_at :datetime
19
+ # created_at :datetime not null
20
+ # updated_at :datetime not null
21
+ # last_datasource_sha :string
22
22
  #
23
23
  module EasyML
24
24
  class Dataset < ActiveRecord::Base
@@ -45,7 +45,7 @@ module EasyML
45
45
  has_many :columns, class_name: "EasyML::Column", dependent: :destroy, extend: EasyML::ColumnList
46
46
  accepts_nested_attributes_for :columns, allow_destroy: true, update_only: true
47
47
 
48
- has_many :features, dependent: :destroy, class_name: "EasyML::Feature"
48
+ has_many :features, dependent: :destroy, class_name: "EasyML::Feature", extend: EasyML::FeatureList
49
49
  accepts_nested_attributes_for :features, allow_destroy: true
50
50
 
51
51
  has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy
@@ -80,7 +80,7 @@ module EasyML
80
80
  column_types: EasyML::Data::PolarsColumn::TYPE_MAP.keys.map do |type|
81
81
  { value: type.to_s, label: type.to_s.titleize }
82
82
  end,
83
- preprocessing_strategies: EasyML::Data::Preprocessor.constants[:preprocessing_strategies],
83
+ preprocessing_strategies: EasyML::Column::Imputers.constants[:preprocessing_strategies],
84
84
  feature_options: EasyML::Features::Registry.list_flat,
85
85
  splitter_constants: EasyML::Splitter.constants,
86
86
  }
@@ -119,13 +119,6 @@ module EasyML
119
119
  processed.data(limit: 1)&.schema || raw.data(limit: 1)&.schema
120
120
  end
121
121
 
122
- def refresh_datatypes
123
- return unless columns_need_refresh?
124
-
125
- cleanup
126
- datasource.reread(columns)
127
- end
128
-
129
122
  def num_rows
130
123
  if datasource&.num_rows.nil?
131
124
  datasource.after_sync
@@ -142,7 +135,7 @@ module EasyML
142
135
 
143
136
  def best_segment
144
137
  [processed, raw].detect do |segment|
145
- segment.send(:train, all_columns: true, limit: 1)&.columns
138
+ segment.send(:data, all_columns: true, limit: 1)&.columns
146
139
  end
147
140
  end
148
141
 
@@ -168,15 +161,27 @@ module EasyML
168
161
  save
169
162
  end
170
163
 
164
+ def refreshed_datasource?
165
+ last_datasource_sha_changed?
166
+ end
167
+
168
+ def prepare_features
169
+ features.update_all(workflow_status: "ready")
170
+ end
171
+
171
172
  def prepare!
173
+ prepare_features
172
174
  cleanup
173
175
  refresh_datasource!
174
176
  split_data
177
+ process_data
175
178
  end
176
179
 
177
180
  def prepare
181
+ prepare_features
178
182
  refresh_datasource
179
183
  split_data
184
+ process_data
180
185
  end
181
186
 
182
187
  def actually_refresh
@@ -184,7 +189,8 @@ module EasyML
184
189
  learn(delete: false) # After syncing datasource, learn new statistics + sync columns
185
190
  process_data
186
191
  fully_reload
187
- learn # After processing data, we may have new columns from newly applied features
192
+ learn
193
+ learn_statistics(type: :processed) # After processing data, we learn any new statistics
188
194
  now = UTC.now
189
195
  update(workflow_status: "ready", refreshed_at: now, updated_at: now)
190
196
  fully_reload
@@ -252,19 +258,57 @@ module EasyML
252
258
  features_need_fit.any?
253
259
  end
254
260
 
255
- def refresh_reasons
261
+ # Some of these are expensive to calculate, so we only want to include
262
+ # them in the refresh reasons if they are actually needed.
263
+ #
264
+ # During dataset_serializer for instance, we don't want to check s3,
265
+ # we only do that during background jobs.
266
+ #
267
+ # So yes this is an annoying way to structure a method, but it's helpful for performance
268
+ #
269
+ def refresh_reasons(exclude: [])
256
270
  {
257
- "Not split" => not_split?,
258
- "Refreshed at is nil" => refreshed_at.nil?,
259
- "Columns need refresh" => columns_need_refresh?,
260
- "Features need refresh" => features_need_fit?,
261
- "Datasource needs refresh" => datasource_needs_refresh?,
262
- "Datasource was refreshed" => datasource_was_refreshed?,
263
- }.select { |k, v| v }.map { |k, v| k }
271
+ not_split: {
272
+ name: "Not split",
273
+ check: -> { not_split? },
274
+ },
275
+ refreshed_at_is_nil: {
276
+ name: "Refreshed at is nil",
277
+ check: -> { refreshed_at.nil? },
278
+ },
279
+ columns_need_refresh: {
280
+ name: "Columns need refresh",
281
+ check: -> { columns_need_refresh? },
282
+ },
283
+ features_need_fit: {
284
+ name: "Features need refresh",
285
+ check: -> { features_need_fit? },
286
+ },
287
+ datasource_needs_refresh: {
288
+ name: "Datasource needs refresh",
289
+ check: -> { datasource_needs_refresh? },
290
+ },
291
+ refreshed_datasource: {
292
+ name: "Refreshed datasource",
293
+ check: -> { refreshed_datasource? },
294
+ },
295
+ datasource_was_refreshed: {
296
+ name: "Datasource was refreshed",
297
+ check: -> { datasource_was_refreshed? },
298
+ },
299
+ }.except(*exclude).select do |k, config|
300
+ config[:check].call
301
+ end.map do |k, config|
302
+ config[:name]
303
+ end
304
+ end
305
+
306
+ def needs_refresh?(exclude: [])
307
+ refresh_reasons(exclude: exclude).any?
264
308
  end
265
309
 
266
- def needs_refresh?
267
- refresh_reasons.any?
310
+ def processed?
311
+ !needs_refresh?
268
312
  end
269
313
 
270
314
  def not_split?
@@ -281,7 +325,6 @@ module EasyML
281
325
 
282
326
  def learn(delete: true)
283
327
  learn_schema
284
- learn_statistics
285
328
  columns.sync(delete: delete)
286
329
  end
287
330
 
@@ -333,6 +376,8 @@ module EasyML
333
376
 
334
377
  def learn_schema
335
378
  data = processed.data(limit: 1).to_a.any? ? processed.data : raw.data
379
+ return nil if data.nil?
380
+
336
381
  schema = data.schema.reduce({}) do |h, (k, v)|
337
382
  h.tap do
338
383
  h[k] = EasyML::Data::PolarsColumn.polars_to_sym(v)
@@ -341,19 +386,15 @@ module EasyML
341
386
  write_attribute(:schema, schema)
342
387
  end
343
388
 
344
- def learn_statistics
345
- stats = {
346
- raw: EasyML::Data::StatisticsLearner.learn(raw, self, :raw),
347
- }
348
- stats.merge!(processed: EasyML::Data::StatisticsLearner.learn(processed, self, :processed)) if processed.data.present?
349
-
350
- columns.select(&:is_computed).each do |col|
351
- if stats.dig(:processed, col.name)
352
- stats[:raw][col.name] = stats[:processed][col.name]
353
- end
354
- end
389
+ def learn_statistics(type: :raw, computed: false)
390
+ columns.learn(type: type, computed: computed)
391
+ update(
392
+ statistics: columns.reload.statistics,
393
+ )
394
+ end
355
395
 
356
- update(statistics: stats)
396
+ def statistics
397
+ (read_attribute(:statistics) || {}).with_indifferent_access
357
398
  end
358
399
 
359
400
  def process_data
@@ -410,10 +451,9 @@ module EasyML
410
451
  def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
411
452
  df = apply_missing_features(df, inference: inference)
412
453
  df = drop_nulls(df)
413
- df = preprocessor.postprocess(df, inference: inference)
454
+ df = columns.transform(df, inference: inference)
414
455
  df = apply_features(df, features)
415
- learn unless inference # After applying features, we need to learn new statistics
416
- df = preprocessor.postprocess(df, inference: inference, computed: true)
456
+ df = columns.transform(df, inference: inference, computed: true)
417
457
  df = apply_column_mask(df, inference: inference) unless all_columns
418
458
  df, = processed.split_features_targets(df, true, target) if split_ys
419
459
  df
@@ -494,16 +534,15 @@ module EasyML
494
534
  result.empty? ? nil : result
495
535
  end
496
536
 
497
- def processed?
498
- !should_split?
499
- end
500
-
501
537
  def decode_labels(ys, col: nil)
502
- preprocessor.decode_labels(ys, col: col.nil? ? target : col)
538
+ if col.nil?
539
+ col = target
540
+ end
541
+ preloaded_columns.find_by(name: col).decode_labels(ys)
503
542
  end
504
543
 
505
544
  def preprocessing_steps
506
- return {} if columns.nil? || (columns.respond_to?(:empty?) && columns.empty?)
545
+ return {} if preloaded_columns.nil? || (preloaded_columns.respond_to?(:empty?) && preloaded_columns.empty?)
507
546
  return @preprocessing_steps if @preprocessing_steps.present?
508
547
 
509
548
  training = standardize_preprocessing_steps(:training)
@@ -515,19 +554,12 @@ module EasyML
515
554
  }.compact.deep_symbolize_keys
516
555
  end
517
556
 
518
- def preprocessor
519
- @preprocessor ||= initialize_preprocessor
520
- return @preprocessor if @preprocessor.preprocessing_steps == preprocessing_steps
521
-
522
- @preprocessor = initialize_preprocessor
523
- end
524
-
525
557
  def target
526
558
  @target ||= preloaded_columns.find(&:is_target)&.name
527
559
  end
528
560
 
529
561
  def date_column
530
- @date_column ||= columns.date_column.first
562
+ @date_column ||= preloaded_columns.find(&:is_date_column?)
531
563
  end
532
564
 
533
565
  def drop_cols
@@ -596,7 +628,7 @@ module EasyML
596
628
  end
597
629
 
598
630
  def upload_remote_files
599
- return unless processed?
631
+ return if !needs_refresh?
600
632
 
601
633
  processed.upload.tap do
602
634
  features.each(&:upload_remote_files)
@@ -668,13 +700,16 @@ module EasyML
668
700
 
669
701
  def refresh_datasource
670
702
  datasource.reload.refresh
671
- refresh_datatypes
672
- initialize_splits
703
+ after_refresh_datasource
673
704
  end
674
705
 
675
706
  def refresh_datasource!
676
707
  datasource.reload.refresh!
677
- refresh_datatypes
708
+ after_refresh_datasource
709
+ end
710
+
711
+ def after_refresh_datasource
712
+ update(last_datasource_sha: datasource.sha)
678
713
  initialize_splits
679
714
  end
680
715
 
@@ -683,12 +718,24 @@ module EasyML
683
718
 
684
719
  SPLIT_ORDER.each do |segment|
685
720
  df = raw.read(segment)
721
+ learn_computed_columns(df) if segment == :train
686
722
  processed_df = normalize(df, all_columns: true)
687
723
  processed.save(segment, processed_df)
688
724
  end
689
725
  @normalized = true
690
726
  end
691
727
 
728
+ def learn_computed_columns(df)
729
+ return unless features.ready_to_apply.any?
730
+
731
+ df = df.clone
732
+ df = apply_features(df)
733
+ processed.save(:train, df)
734
+ learn(delete: false)
735
+ learn_statistics(type: :processed, computed: true)
736
+ processed.cleanup
737
+ end
738
+
692
739
  def drop_nulls(df)
693
740
  return df if drop_if_null.nil? || drop_if_null.empty?
694
741
 
@@ -698,8 +745,12 @@ module EasyML
698
745
  df.drop_nulls(subset: drop)
699
746
  end
700
747
 
748
+ # Pass refresh: false for frontend views so we don't query S3 during web requests
701
749
  def load_data(segment, **kwargs, &block)
702
- if processed?
750
+ needs_refresh = kwargs.key?(:refresh) ? kwargs[:refresh] : needs_refresh?
751
+ kwargs.delete(:refresh)
752
+
753
+ if !needs_refresh
703
754
  processed.load_data(segment, **kwargs, &block)
704
755
  else
705
756
  raw.load_data(segment, **kwargs, &block)
@@ -707,9 +758,7 @@ module EasyML
707
758
  end
708
759
 
709
760
  def fit
710
- computed_statistics = columns.where(is_computed: true).reduce({}) { |h, c| h.tap { h[c.name] = c.statistics.dig("processed") } }
711
- preprocessor.fit(raw.train(all_columns: true), computed_statistics)
712
- update(preprocessor_statistics: preprocessor.statistics)
761
+ learn_statistics(type: :raw)
713
762
  end
714
763
 
715
764
  # log_method :fit, "Learning statistics", verbose: true
@@ -719,7 +768,7 @@ module EasyML
719
768
  end
720
769
 
721
770
  def split_data(force: false)
722
- return unless force || should_split?
771
+ return unless force || needs_refresh?
723
772
 
724
773
  cleanup
725
774
  splitter.split(datasource) do |train_df, valid_df, test_df|
@@ -729,10 +778,6 @@ module EasyML
729
778
  end
730
779
  end
731
780
 
732
- def should_split?
733
- needs_refresh?
734
- end
735
-
736
781
  def filter_duplicate_features
737
782
  return unless attributes["features_attributes"].present?
738
783
 
@@ -753,6 +798,7 @@ module EasyML
753
798
  end
754
799
 
755
800
  def apply_features(df, features = self.features)
801
+ features = features.ready_to_apply
756
802
  if features.nil? || features.empty?
757
803
  df
758
804
  else
@@ -774,10 +820,6 @@ module EasyML
774
820
 
775
821
  result = feature.transform_batch(acc_df)
776
822
 
777
- unless result.is_a?(Polars::DataFrame)
778
- raise "Feature '#{feature.name}' must return a Polars::DataFrame, got #{result.class}"
779
- end
780
-
781
823
  result
782
824
  end
783
825
  end
@@ -789,16 +831,6 @@ module EasyML
789
831
  end).to_h.compact.reject { |_k, v| v["method"] == "none" }
790
832
  end
791
833
 
792
- def initialize_preprocessor
793
- EasyML::Data::Preprocessor.new(
794
- directory: Pathname.new(root_dir).append("preprocessor"),
795
- preprocessing_steps: preprocessing_steps,
796
- dataset: self,
797
- ).tap do |preprocessor|
798
- preprocessor.statistics = preprocessor_statistics
799
- end
800
- end
801
-
802
834
  def fully_reload
803
835
  return unless persisted?
804
836
 
@@ -2,28 +2,28 @@
2
2
  #
3
3
  # Table name: easy_ml_dataset_histories
4
4
  #
5
- # id :bigint not null, primary key
6
- # dataset_id :integer not null
7
- # name :string not null
8
- # description :string
9
- # dataset_type :string
10
- # status :string
11
- # version :string
12
- # datasource_id :integer
13
- # root_dir :string
14
- # configuration :json
15
- # num_rows :integer
16
- # workflow_status :string
17
- # statistics :json
18
- # preprocessor_statistics :json
19
- # schema :json
20
- # refreshed_at :datetime
21
- # created_at :datetime not null
22
- # updated_at :datetime not null
23
- # history_started_at :datetime not null
24
- # history_ended_at :datetime
25
- # history_user_id :integer
26
- # snapshot_id :string
5
+ # id :bigint not null, primary key
6
+ # dataset_id :integer not null
7
+ # name :string not null
8
+ # description :string
9
+ # dataset_type :string
10
+ # status :string
11
+ # version :string
12
+ # datasource_id :integer
13
+ # root_dir :string
14
+ # configuration :json
15
+ # num_rows :integer
16
+ # workflow_status :string
17
+ # statistics :json
18
+ # schema :json
19
+ # refreshed_at :datetime
20
+ # created_at :datetime not null
21
+ # updated_at :datetime not null
22
+ # history_started_at :datetime not null
23
+ # history_ended_at :datetime
24
+ # history_user_id :integer
25
+ # snapshot_id :string
26
+ # last_datasource_sha :string
27
27
  #
28
28
  module EasyML
29
29
  class DatasetHistory < ActiveRecord::Base
@@ -44,7 +44,7 @@ module EasyML
44
44
  true
45
45
  end
46
46
 
47
- def should_split?
47
+ def needs_refresh?
48
48
  false
49
49
  end
50
50
  end
@@ -10,6 +10,7 @@
10
10
  # refreshed_at :datetime
11
11
  # created_at :datetime not null
12
12
  # updated_at :datetime not null
13
+ # sha :string
13
14
  #
14
15
  module EasyML
15
16
  class Datasource < ActiveRecord::Base
@@ -119,11 +120,13 @@ module EasyML
119
120
  self.num_rows = data.shape[0]
120
121
  self.is_syncing = false
121
122
  self.refreshed_at = Time.now
123
+ self.sha = adapter.sha
122
124
  save
123
125
  end
124
126
 
125
127
  def refresh
126
128
  unless adapter.needs_refresh?
129
+ update(sha: adapter.sha) if sha.nil?
127
130
  update!(is_syncing: false)
128
131
  return
129
132
  end
@@ -15,6 +15,7 @@
15
15
  # history_ended_at :datetime
16
16
  # history_user_id :integer
17
17
  # snapshot_id :string
18
+ # sha :string
18
19
  #
19
20
  module EasyML
20
21
  class DatasourceHistory < ActiveRecord::Base
@@ -1,7 +1,7 @@
1
1
  module EasyML
2
2
  module Datasources
3
3
  class FileDatasource < BaseDatasource
4
- delegate :query, :convert_to_parquet, to: :reader
4
+ delegate :query, :convert_to_parquet, :sha, to: :reader
5
5
 
6
6
  def after_sync
7
7
  reader.normalize
@@ -6,18 +6,8 @@ module EasyML
6
6
  validates :df, presence: true
7
7
  add_configuration_attributes :df
8
8
 
9
- def query(drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false)
10
- return if df.nil?
11
-
12
- df = self.df.clone
13
- df = df.filter(filter) if filter
14
- df = df.select(select) if select.present?
15
- df = df.unique if unique
16
- drop_cols &= df.columns
17
- df = df.drop(drop_cols) unless drop_cols.empty?
18
- df = df.sort(sort, reverse: descending) if sort
19
- df = df.limit(limit) if limit
20
- df
9
+ def query(**kwargs)
10
+ EasyML::Data::PolarsInMemory.query(df, **kwargs)
21
11
  end
22
12
 
23
13
  def in_batches(of: 10_000)
@@ -40,6 +30,10 @@ module EasyML
40
30
  datasource.updated_at
41
31
  end
42
32
 
33
+ def sha
34
+ nil
35
+ end
36
+
43
37
  def data
44
38
  df
45
39
  end
@@ -17,7 +17,7 @@ module EasyML
17
17
  add_configuration_attributes :s3_bucket, :s3_prefix, :s3_region, :cache_for
18
18
 
19
19
  delegate :query, :data, :s3_access_key_id, :s3_secret_access_key, :before_sync, :after_sync, :clean,
20
- to: :synced_directory
20
+ :sha, to: :synced_directory
21
21
 
22
22
  def in_batches(&block)
23
23
  synced_directory.in_batches(&block)
@@ -55,6 +55,7 @@ module EasyML
55
55
  end
56
56
 
57
57
  belongs_to :dataset, class_name: "EasyML::Dataset"
58
+ has_many :columns, class_name: "EasyML::Column", dependent: :destroy
58
59
 
59
60
  validates :feature_class, presence: true
60
61
  validates :feature_position, presence: true, numericality: { only_integer: true, greater_than_or_equal_to: 0 }
@@ -72,7 +73,7 @@ module EasyML
72
73
  end
73
74
 
74
75
  # Combine all conditions with OR
75
- where(id: where(needs_fit: true).or(where(conditions.join(" OR "))).select { |f| f.adapter.respond_to?(:fit) }.map(&:id))
76
+ where(id: where(needs_fit: true).or(where(conditions.join(" OR "))).map(&:id))
76
77
  }
77
78
  scope :never_applied, -> { where(applied_at: nil) }
78
79
  scope :never_fit, -> do
@@ -81,6 +82,7 @@ module EasyML
81
82
  where(id: fittable.map(&:id))
82
83
  end
83
84
  scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
85
+ scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
84
86
 
85
87
  before_save :apply_defaults, if: :new_record?
86
88
  before_save :update_sha
@@ -223,8 +225,11 @@ module EasyML
223
225
  def fit(features: [self], async: false)
224
226
  ordered_features = features.sort_by(&:feature_position)
225
227
  jobs = ordered_features.map(&:build_batches)
228
+ job_count = jobs.dup.flatten.size
226
229
 
227
- if async
230
+ # This is very important! For whatever reason, Resque BatchJob does not properly
231
+ # handle batch finished callbacks for batch size = 1
232
+ if async && job_count > 1
228
233
  EasyML::ComputeFeatureJob.enqueue_ordered_batches(jobs)
229
234
  else
230
235
  jobs.flatten.each do |job|
@@ -240,7 +245,8 @@ module EasyML
240
245
  if batch_args.key?(:batch_start)
241
246
  actually_fit_batch(batch_args)
242
247
  else
243
- actually_fit_batch(get_batch_args(**batch_args))
248
+ batch_args = get_batch_args(**batch_args)
249
+ actually_fit_batch(batch_args)
244
250
  end
245
251
  end
246
252
 
@@ -288,12 +294,14 @@ module EasyML
288
294
  batch_args.symbolize_keys!
289
295
 
290
296
  if adapter.respond_to?(:batch)
291
- batch_df = adapter.fit(dataset.raw, self, batch_args)
297
+ df = dataset.raw
292
298
  else
293
299
  df = build_batch(batch_args)
294
- batch_df = adapter.fit(df, self, batch_args)
295
300
  end
296
301
  end
302
+ return if df.blank?
303
+
304
+ batch_df = adapter.fit(df, self, batch_args)
297
305
  if batch_df.present?
298
306
  store(batch_df)
299
307
  else
@@ -306,7 +314,11 @@ module EasyML
306
314
  return nil unless df.is_a?(Polars::DataFrame)
307
315
  return df if !adapter.respond_to?(:transform) && feature_store.empty?
308
316
 
317
+ df_len_was = df.shape[0]
309
318
  result = adapter.transform(df, self)
319
+ raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
320
+ df_len_now = result.shape[0]
321
+ raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if df_len_now != df_len_was
310
322
  update!(applied_at: Time.current)
311
323
  result
312
324
  end
@@ -384,8 +396,8 @@ module EasyML
384
396
  feature_store.list_partitions
385
397
  end
386
398
 
387
- def query(filter: nil)
388
- feature_store.query(filter: filter)
399
+ def query(**kwargs)
400
+ feature_store.query(**kwargs)
389
401
  end
390
402
 
391
403
  def store(df)
@@ -31,6 +31,18 @@ module EasyML
31
31
 
32
32
  after_find :download_remote_files
33
33
  scope :ordered, -> { order(feature_position: :asc) }
34
+ scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
35
+ scope :has_changes, lambda {
36
+ none
37
+ }
38
+ scope :never_applied, -> { where(applied_at: nil) }
39
+ scope :never_fit, -> do
40
+ fittable = where(fit_at: nil)
41
+ fittable = fittable.select { |f| f.adapter.respond_to?(:fit) }
42
+ where(id: fittable.map(&:id))
43
+ end
44
+ scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
45
+ scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
34
46
 
35
47
  def download_remote_files
36
48
  feature_store&.download
@@ -0,0 +1,15 @@
1
+ module EasyML
2
+ module FeatureList
3
+ def feature_list
4
+ self
5
+ end
6
+
7
+ def dataset
8
+ proxy_association.owner
9
+ end
10
+
11
+ def computed_column_names
12
+ flat_map(&:computes_columns).uniq
13
+ end
14
+ end
15
+ end