easy_ml 0.2.0.pre.rc89 → 0.2.0.pre.rc91

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/predictions_controller.rb +9 -4
  3. data/app/jobs/easy_ml/training_job.rb +2 -2
  4. data/app/models/easy_ml/column/imputers/base.rb +1 -1
  5. data/app/models/easy_ml/column/imputers/categorical.rb +1 -1
  6. data/app/models/easy_ml/column/imputers/embedding_encoder.rb +2 -0
  7. data/app/models/easy_ml/column/imputers/imputer.rb +4 -0
  8. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +1 -0
  9. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -0
  10. data/app/models/easy_ml/column.rb +25 -1
  11. data/app/models/easy_ml/column_list/imputer.rb +4 -0
  12. data/app/models/easy_ml/column_list.rb +23 -8
  13. data/app/models/easy_ml/dataset/learner.rb +0 -10
  14. data/app/models/easy_ml/dataset.rb +28 -43
  15. data/app/models/easy_ml/deploy.rb +28 -19
  16. data/app/models/easy_ml/feature.rb +10 -8
  17. data/app/models/easy_ml/feature_history.rb +9 -0
  18. data/app/models/easy_ml/model.rb +12 -7
  19. data/app/models/easy_ml/models/xgboost/evals_callback.rb +2 -2
  20. data/app/models/easy_ml/prediction.rb +2 -2
  21. data/app/serializers/easy_ml/prediction_serializer.rb +2 -0
  22. data/lib/easy_ml/core/model_evaluator.rb +2 -0
  23. data/lib/easy_ml/core/tuner.rb +1 -1
  24. data/lib/easy_ml/data/dataset_manager/writer/base.rb +24 -5
  25. data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +20 -7
  26. data/lib/easy_ml/data/dataset_manager/writer.rb +4 -0
  27. data/lib/easy_ml/data/dataset_manager.rb +4 -0
  28. data/lib/easy_ml/data/polars_column.rb +0 -6
  29. data/lib/easy_ml/feature_store.rb +9 -13
  30. data/lib/easy_ml/predict.rb +5 -4
  31. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +1 -0
  32. data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_dataset_names.rb.tt +13 -0
  33. data/lib/easy_ml/timing.rb +3 -1
  34. data/lib/easy_ml/version.rb +1 -1
  35. metadata +3 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a0eb5ce84bdd93da3ea53e97f1b1ceab81a529a9bb076596f4edf7e49349eadf
4
- data.tar.gz: 5262f39ff5a1236729d28a8fa6715d1b12e6dd5b4319225a6df512493872bba0
3
+ metadata.gz: 3a12058c269a91c130f9158e1507c58dc94ad33517aabe568a2f0bc9f78b88eb
4
+ data.tar.gz: 6a37a568b6a8d8c100c21dea96487cb85494ef11ba3b30ac51aad8cab45654d7
5
5
  SHA512:
6
- metadata.gz: 8e8094ed3309b80e0ee70543667e7af982e0805ba6fe81a62ad2f46297eae4487440a4e6ffdca75f32ef045ef8b25440a8396032cf00f9fd6c191d63fc8c0386
7
- data.tar.gz: 97d057eb4ffa2acdb319a52de427cc3f3e7c42db8de349d063b36cd91479c68eca10ec150689d11070f0d20f24b620c0ce7a77f3e335420dd8c73612d706d1a0
6
+ metadata.gz: 4f344c00a9e2b557079943f7f6f2c4d7923dbb5b425423b81d58dbc6d63ac15d5e978d6f7e1ab2c02233fe8cc33b55d168ed1ffb0d1f2e1cfcc59670b812285d
7
+ data.tar.gz: 4831eac6b35b452b300408b37695d5116b6840404cf045e85d98780d3a120ae2d267b07852bc80b6918379b70f9c04a52a712e599751e469af72a0be6e1889c4
@@ -4,22 +4,27 @@ module EasyML
4
4
 
5
5
  def create
6
6
  slug = params[:model]
7
- unless EasyML::Model.find_by(slug: slug).inference_version.present?
7
+ model = EasyML::Model.find_by(slug: slug)
8
+ unless model.present?
9
+ return render json: { error: "Model not found" }, status: :not_found
10
+ end
11
+
12
+ unless model.inference_version.present?
8
13
  return render json: { error: "Model not found" }, status: :not_found
9
14
  end
10
15
 
11
16
  unless params.key?(:input)
12
- return render json: { error: "Must provide key: input" }, status: :not_found
17
+ return render json: { error: "Must provide key: input" }, status: :unprocessable_entity
13
18
  end
14
19
  input = params[:input].permit!.to_h
15
20
 
16
21
  unless input.is_a?(Hash)
17
- return render json: { error: "Input must be a hash" }, status: :not_found
22
+ return render json: { error: "Input must be a hash" }, status: :unprocessable_entity
18
23
  end
19
24
 
20
25
  valid, fields = EasyML::Predict.validate_input(slug, input)
21
26
  unless valid
22
- return render json: { error: "Missing required fields: #{fields}" }, status: :not_found
27
+ return render json: { error: "Missing required fields: #{fields}" }, status: :unprocessable_entity
23
28
  end
24
29
 
25
30
  type = (params[:type] || :predict).to_sym
@@ -10,13 +10,13 @@ module EasyML
10
10
 
11
11
  @last_activity = Time.current
12
12
  setup_signal_traps
13
- @monitor_thread = start_monitor_thread
13
+ # @monitor_thread = start_monitor_thread
14
14
 
15
15
  @model.actually_train do |iteration_info|
16
16
  @last_activity = Time.current
17
17
  end
18
18
  ensure
19
- @monitor_thread&.exit
19
+ # @monitor_thread&.exit
20
20
  @model.unlock!
21
21
  end
22
22
 
@@ -26,7 +26,7 @@ module EasyML
26
26
  end
27
27
  end
28
28
 
29
- attr_accessor :column, :preprocessing_step
29
+ attr_accessor :column, :preprocessing_step, :encode
30
30
 
31
31
  def initialize(column, preprocessing_step)
32
32
  @column = column
@@ -12,7 +12,7 @@ module EasyML
12
12
  def transform(df)
13
13
  return df unless allowed_categories.present?
14
14
 
15
- case column.datatype
15
+ case column.datatype.to_sym
16
16
  when :categorical
17
17
  df = df.with_column(
18
18
  Polars.when(Polars.col(column.name).is_in(allowed_categories))
@@ -9,6 +9,8 @@ module EasyML
9
9
  end
10
10
 
11
11
  def transform(df)
12
+ return df unless encode
13
+
12
14
  df = column.embed(df)
13
15
  df
14
16
  end
@@ -43,6 +43,10 @@ module EasyML
43
43
  @adapters ||= ordered_adapters.map { |klass| klass.new(column, preprocessing_step) }.select { |adapter| allowed?(adapter) && adapter.applies? }
44
44
  end
45
45
 
46
+ def encode=(value)
47
+ adapters.each { |adapter| adapter.encode = value }
48
+ end
49
+
46
50
  def description
47
51
  adapters.map(&:description).compact.join(", ")
48
52
  end
@@ -9,6 +9,7 @@ module EasyML
9
9
  end
10
10
 
11
11
  def transform(df)
12
+ return df unless encode
12
13
  return df unless allowed_categories.present?
13
14
 
14
15
  allowed_categories.each do |value|
@@ -9,6 +9,7 @@ module EasyML
9
9
  end
10
10
 
11
11
  def transform(df)
12
+ return df unless encode
12
13
  return df unless label_encoder.present?
13
14
 
14
15
  case column.datatype
@@ -184,9 +184,10 @@ module EasyML
184
184
  end
185
185
  end
186
186
 
187
- def transform(df, inference: false, computed: false)
187
+ def transform(df, inference: false, encode: true)
188
188
  imputer = inference && imputers.inference.anything? ? imputers.inference : imputers.training
189
189
 
190
+ imputer.encode = encode
190
191
  df = imputer.transform(df)
191
192
  df
192
193
  end
@@ -513,6 +514,29 @@ module EasyML
513
514
  EasyML::Import::Column.from_config(config, dataset, action: action)
514
515
  end
515
516
 
517
+ def cast_statement(df, df_col, expected_dtype)
518
+ expected_dtype = expected_dtype.is_a?(Polars::DataType) ? expected_dtype : expected_dtype.class
519
+ actual_type = df[df_col].dtype
520
+
521
+ cast_statement = case expected_dtype
522
+ when Polars::Boolean
523
+ case actual_type
524
+ when Polars::Boolean
525
+ Polars.col(df_col).cast(expected_dtype)
526
+ when Polars::String, Polars::Categorical
527
+ Polars.col(df_col).eq("true").cast(expected_dtype)
528
+ when Polars::Null
529
+ Polars.col(df_col)
530
+ else
531
+ raise "Unexpected dtype: #{actual_type} for column: #{df_col}"
532
+ end
533
+ else
534
+ Polars.col(df_col).cast(expected_dtype)
535
+ end
536
+
537
+ cast_statement.alias(df_col)
538
+ end
539
+
516
540
  def cast(value)
517
541
  return value if value.nil?
518
542
 
@@ -15,6 +15,10 @@ module EasyML
15
15
  @imputers ||= columns.map { |column| inference ? column.imputers(@_imputers).inference : column.imputers(@_imputers).training }
16
16
  end
17
17
 
18
+ def encode=(encode)
19
+ imputers.each { |imputer| imputer.encode = encode }
20
+ end
21
+
18
22
  def exprs
19
23
  imputers.flat_map(&:exprs).compact
20
24
  end
@@ -22,7 +22,7 @@ module EasyML
22
22
  end
23
23
  end
24
24
 
25
- def transform(df, inference: false, computed: false)
25
+ def transform(df, inference: false, computed: false, encode: true)
26
26
  return df if df.nil?
27
27
 
28
28
  if computed
@@ -33,14 +33,12 @@ module EasyML
33
33
 
34
34
  by_name = cols.index_by(&:name)
35
35
  cols.each do |column|
36
- df = column.transform(df, inference: inference, computed: computed) if column
36
+ df = column.transform(df, inference: inference, encode: encode) if column
37
37
  end
38
38
 
39
39
  df
40
40
  end
41
41
 
42
- measure_method_timing :transform
43
-
44
42
  def apply_clip(df)
45
43
  clip_cols = has_clip.raw
46
44
  return df unless clip_cols.any?
@@ -60,8 +58,6 @@ module EasyML
60
58
  reload
61
59
  end
62
60
 
63
- measure_method_timing :learn
64
-
65
61
  def statistics
66
62
  stats = { raw: {}, processed: {} }
67
63
  select(&:persisted?).inject(stats) do |h, col|
@@ -94,6 +90,27 @@ module EasyML
94
90
  end.sort.map { |arr| arr[1] }.uniq
95
91
  end
96
92
 
93
+ def apply_cast(df)
94
+ schema = dataset.schema
95
+ column_index = reduce({}) do |h, col|
96
+ h.tap do
97
+ col.aliases.each do |alias_name|
98
+ h[alias_name] = col
99
+ end
100
+ end
101
+ end
102
+ cast_statements = (df.columns & schema.keys.map(&:to_s)).map do |df_col|
103
+ db_col = column_index[df_col]
104
+ expected_dtype = schema[df_col.to_sym]
105
+ db_col.cast_statement(df, df_col, expected_dtype)
106
+ end
107
+ begin
108
+ df = df.with_columns(cast_statements)
109
+ rescue => e
110
+ binding.pry
111
+ end
112
+ end
113
+
97
114
  def cast(processed_or_raw)
98
115
  columns = where(is_computed: false)
99
116
  is_processed = processed_or_raw == :processed
@@ -154,8 +171,6 @@ module EasyML
154
171
  EasyML::Lineage.import(lineage, on_duplicate_key_update: { columns: %i[ column_id key occurred_at description ] })
155
172
  end
156
173
 
157
- measure_method_timing :set_feature_lineage
158
-
159
174
  private
160
175
 
161
176
  def import_new(new_columns, existing_columns)
@@ -57,8 +57,6 @@ module EasyML
57
57
  dataset.columns.set_feature_lineage(columns)
58
58
  end
59
59
 
60
- measure_method_timing :save_statistics
61
-
62
60
  def learn_statistics
63
61
  return @statistics if @statistics
64
62
 
@@ -78,8 +76,6 @@ module EasyML
78
76
  end
79
77
  end
80
78
 
81
- measure_method_timing :learn_statistics
82
-
83
79
  def prepare
84
80
  @schema = EasyML::Data::PolarsSchema.simplify(@dataset.raw_schema).symbolize_keys
85
81
  @raw_columns = @schema.keys.sort.map(&:to_s)
@@ -93,19 +89,13 @@ module EasyML
93
89
  EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[in_raw_dataset datatype] })
94
90
  end
95
91
 
96
- measure_method_timing :prepare
97
-
98
92
  def lazy_statistics
99
93
  Lazy.new(dataset, columns, type: type).learn
100
94
  end
101
95
 
102
- measure_method_timing :lazy_statistics
103
-
104
96
  def eager_statistics
105
97
  Eager.new(dataset, columns, type: type).learn
106
98
  end
107
-
108
- measure_method_timing :eager_statistics
109
99
  end
110
100
  end
111
101
  end
@@ -215,9 +215,10 @@ module EasyML
215
215
 
216
216
  @raw = raw.cp(version)
217
217
  @processed = processed.cp(version)
218
- features.each(&:bump_version)
219
-
220
- save
218
+ save.tap do
219
+ features.each(&:bump_version)
220
+ EasyML::Feature.import(features.to_a, on_duplicate_key_update: [:version])
221
+ end
221
222
  end
222
223
 
223
224
  def refreshed_datasource?
@@ -257,9 +258,6 @@ module EasyML
257
258
  end
258
259
  end
259
260
 
260
- include EasyML::Timing
261
- measure_method_timing :actually_refresh
262
-
263
261
  def refresh!(async: false)
264
262
  refreshing do
265
263
  prepare!
@@ -276,29 +274,22 @@ module EasyML
276
274
  end
277
275
  end
278
276
 
279
- measure_method_timing :refresh
280
-
281
277
  def fit_features!(async: false, features: self.features)
282
278
  fit_features(async: async, features: features, force: true)
283
279
  end
284
280
 
285
281
  def fit_features(async: false, features: self.features, force: false)
286
282
  features_to_compute = force ? features : features.needs_fit
287
- puts "Features to compute.... #{features_to_compute}"
288
283
  return after_fit_features if features_to_compute.empty?
289
284
 
290
285
  features.first.fit(features: features_to_compute, async: async)
291
286
  end
292
287
 
293
- measure_method_timing :fit_features
294
-
295
288
  def after_fit_features
296
- puts "After fit features"
297
289
  unlock!
298
290
  reload
299
291
  return if failed?
300
292
 
301
- puts "Actually refresh..."
302
293
  actually_refresh
303
294
  end
304
295
 
@@ -476,15 +467,24 @@ module EasyML
476
467
 
477
468
  def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
478
469
  df = apply_missing_columns(df, inference: inference)
479
- df = columns.transform(df, inference: inference)
480
- df = apply_features(df, features)
481
- df = columns.transform(df, inference: inference)
470
+ df = transform_columns(df, inference: inference, encode: false)
471
+ df = apply_features(df, features, inference: inference)
472
+ df = apply_cast(df) if inference
473
+ df = transform_columns(df, inference: inference)
482
474
  df = apply_column_mask(df, inference: inference) unless all_columns
483
475
  df = drop_nulls(df) unless inference
484
476
  df, = processed.split_features_targets(df, true, target) if split_ys
485
477
  df
486
478
  end
487
479
 
480
+ def transform_columns(df, inference: false, encode: true)
481
+ columns.transform(df, inference: inference, encode: encode)
482
+ end
483
+
484
+ def apply_cast(df)
485
+ columns.apply_cast(df)
486
+ end
487
+
488
488
  # Massage out one-hot cats to their canonical name
489
489
  #
490
490
  # Takes: ["Sex_male", "Sex_female", "Embarked_c", "PassengerId"]
@@ -503,8 +503,6 @@ module EasyML
503
503
  end.uniq.sort
504
504
  end
505
505
 
506
- measure_method_timing :normalize
507
-
508
506
  def missing_required_fields(df)
509
507
  desc_df = df.describe
510
508
 
@@ -633,22 +631,19 @@ module EasyML
633
631
  df[column_mask(df, inference: inference)]
634
632
  end
635
633
 
636
- measure_method_timing :apply_column_mask
637
-
638
- def apply_missing_columns(df, inference: false, include_one_hots: false)
634
+ def apply_missing_columns(df, inference: false)
639
635
  return df unless inference
640
636
 
641
- missing_columns = (col_order(inference: inference) - df.columns).compact
642
- unless include_one_hots
643
- columns.one_hots.each do |one_hot|
644
- virtual_columns = one_hot.virtual_columns
645
- if virtual_columns.all? { |vc| df.columns.include?(vc) }
646
- missing_columns -= columns.one_hots.flat_map(&:virtual_columns)
647
- else
648
- missing_columns += columns.one_hots.map(&:name) - df.columns
649
- end
637
+ required_cols = col_order(inference: inference).compact.uniq
638
+ columns.one_hots.each do |one_hot|
639
+ virtual_columns = one_hot.virtual_columns
640
+ if virtual_columns.all? { |vc| df.columns.include?(vc) }
641
+ required_cols -= virtual_columns
642
+ else
643
+ required_cols += [one_hot.name]
650
644
  end
651
645
  end
646
+ missing_columns = required_cols - df.columns
652
647
  df.with_columns(missing_columns.map { |f| Polars.lit(nil).alias(f) })
653
648
  end
654
649
 
@@ -771,8 +766,6 @@ module EasyML
771
766
  after_refresh_datasource
772
767
  end
773
768
 
774
- measure_method_timing :refresh_datasource
775
-
776
769
  def refresh_datasource!
777
770
  datasource.reload.refresh!
778
771
  after_refresh_datasource
@@ -798,8 +791,6 @@ module EasyML
798
791
  @normalized = true
799
792
  end
800
793
 
801
- measure_method_timing :normalize_all
802
-
803
794
  def learn_computed_columns(df)
804
795
  return unless features.ready_to_apply.any?
805
796
 
@@ -811,8 +802,6 @@ module EasyML
811
802
  processed.cleanup
812
803
  end
813
804
 
814
- measure_method_timing :learn_computed_columns
815
-
816
805
  def drop_nulls(df)
817
806
  return df if drop_if_null.nil? || drop_if_null.empty?
818
807
 
@@ -822,8 +811,6 @@ module EasyML
822
811
  df.drop_nulls(subset: drop)
823
812
  end
824
813
 
825
- measure_method_timing :drop_nulls
826
-
827
814
  # Pass refresh: false for frontend views so we don't query S3 during web requests
828
815
  def load_data(segment, **kwargs, &block)
829
816
  needs_refresh = kwargs.key?(:refresh) ? kwargs[:refresh] : needs_refresh?
@@ -876,8 +863,8 @@ module EasyML
876
863
  columns.find_by(name: column_name).update(is_date_column: true)
877
864
  end
878
865
 
879
- def apply_features(df, features = self.features)
880
- features = features.ready_to_apply
866
+ def apply_features(df, features = self.features, inference: false)
867
+ features = inference ? preloaded_features : features.ready_to_apply
881
868
  if features.nil? || features.empty?
882
869
  df
883
870
  else
@@ -897,15 +884,13 @@ module EasyML
897
884
  # Set SHA without querying
898
885
  feature.instance_variable_set(:@current_sha, shas[feature.feature_class])
899
886
 
900
- result = feature.transform_batch(acc_df)
887
+ result = feature.transform_batch(acc_df, inference: inference)
901
888
 
902
889
  result
903
890
  end
904
891
  end
905
892
  end
906
893
 
907
- measure_method_timing :apply_features
908
-
909
894
  def standardize_preprocessing_steps(type)
910
895
  columns.map(&:name).zip(columns.map do |col|
911
896
  col.preprocessing_steps&.dig(type)
@@ -48,28 +48,37 @@ module EasyML
48
48
 
49
49
  def actually_deploy
50
50
  lock_deploy do
51
- update(status: "running")
52
- EasyML::Event.create_event(self, "started")
53
-
54
- if identical_deploy.present?
55
- self.model_file = identical_deploy.model_file
56
- self.model_version = identical_deploy.model_version
57
- else
58
- if model_file.present?
59
- model.model_file = model_file
51
+ begin
52
+ update(status: "running")
53
+ EasyML::Event.create_event(self, "started")
54
+
55
+ if identical_deploy.present?
56
+ self.model_file = identical_deploy.model_file
57
+ self.model_version = identical_deploy.model_version
58
+ else
59
+ if model_file.present?
60
+ model.model_file = model_file
61
+ end
62
+ # model.load_model
63
+ self.model_version = model.actually_deploy
60
64
  end
61
- model.load_model
62
- self.model_version = model.actually_deploy
63
- end
64
65
 
65
- EasyML::Deploy.transaction do
66
- update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, status: :success)
67
- model.retraining_runs.where(status: :deployed).update_all(status: :success)
68
- retraining_run.update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, deploy_id: id, status: :deployed, is_deploying: false)
69
- end
66
+ EasyML::Deploy.transaction do
67
+ update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, status: :success)
68
+ model.retraining_runs.where(status: :deployed).update_all(status: :success)
69
+ retraining_run.update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, deploy_id: id, status: :deployed,)
70
+ end
70
71
 
71
- model_version.tap do
72
- EasyML::Event.create_event(self, "success")
72
+ model_version.tap do
73
+ EasyML::Event.create_event(self, "success")
74
+ end
75
+ rescue => e
76
+ update(status: "failed")
77
+ retraining_run.update(is_deploying: false)
78
+ EasyML::Event.create_event(self, "failed")
79
+ raise e
80
+ ensure
81
+ unlock!
73
82
  end
74
83
  end
75
84
  end
@@ -82,7 +82,7 @@ module EasyML
82
82
  fittable = fittable.select(&:fittable?)
83
83
  where(id: fittable.map(&:id))
84
84
  end
85
- scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit).or(datasource_was_refreshed) }
85
+ scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit).or(datasource_was_refreshed).or(where(needs_fit: true)) }
86
86
  scope :datasource_was_refreshed, -> do
87
87
  where(id: all.select(&:datasource_was_refreshed?).map(&:id))
88
88
  end
@@ -310,9 +310,9 @@ module EasyML
310
310
  end
311
311
 
312
312
  # Transform a single batch, used for testing the user's feature implementation
313
- def transform_batch(df = nil, batch_args = {})
313
+ def transform_batch(df = nil, batch_args = {}, inference: false)
314
314
  if df.is_a?(Polars::DataFrame)
315
- actually_transform_batch(df)
315
+ actually_transform_batch(df, inference: inference)
316
316
  else
317
317
  actually_transform_batch(build_batch(get_batch_args(**batch_args)))
318
318
  end
@@ -374,11 +374,12 @@ module EasyML
374
374
  batch_df
375
375
  end
376
376
 
377
- def actually_transform_batch(df)
377
+ def actually_transform_batch(df, inference: false)
378
378
  return nil unless df.is_a?(Polars::DataFrame)
379
379
  return df if !adapter.respond_to?(:transform) && feature_store.empty?
380
380
 
381
381
  df_len_was = df.shape[0]
382
+ orig_df = df.clone
382
383
  begin
383
384
  result = adapter.transform(df, self)
384
385
  rescue => e
@@ -386,8 +387,10 @@ module EasyML
386
387
  end
387
388
  raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
388
389
  df_len_now = result.shape[0]
389
- raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if df_len_now != df_len_was
390
- update!(applied_at: Time.current)
390
+ missing_columns = orig_df.columns - result.columns
391
+ raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if (df_len_now != df_len_was)
392
+ raise "Feature #{feature_class} removed #{missing_columns} columns" if missing_columns.any?
393
+ update!(applied_at: Time.current) unless inference
391
394
  result
392
395
  end
393
396
 
@@ -432,9 +435,8 @@ module EasyML
432
435
  end
433
436
 
434
437
  def bump_version
435
- old_version = version
438
+ feature_store.bump_version(version)
436
439
  write_attribute(:version, version + 1)
437
- feature_store.cp(old_version, version)
438
440
  self
439
441
  end
440
442
 
@@ -44,7 +44,16 @@ module EasyML
44
44
  scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
45
45
  scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
46
46
 
47
+ def wipe
48
+ false
49
+ end
50
+
47
51
  def download_remote_files
52
+ return unless snapshot_id # if not finished saving, skip
53
+ return if feature_store.synced?
54
+ return if @downloaded
55
+
56
+ @downloaded = true
48
57
  feature_store&.download
49
58
  end
50
59
  end
@@ -182,6 +182,7 @@ module EasyML
182
182
  lock_model do
183
183
  run = pending_run
184
184
  run.wrap_training do
185
+ dataset.refresh if dataset.needs_refresh?
185
186
  raise untrainable_error unless trainable?
186
187
 
187
188
  best_params = nil
@@ -210,6 +211,10 @@ module EasyML
210
211
  end
211
212
  end
212
213
 
214
+ def locked?
215
+ Support::Lockable.locked?(lock_key)
216
+ end
217
+
213
218
  def with_lock
214
219
  EasyML::Support::Lockable.with_lock(lock_key, stale_timeout: 60, resources: 1) do |client|
215
220
  yield client
@@ -273,7 +278,7 @@ module EasyML
273
278
  end
274
279
 
275
280
  def inference_version
276
- latest_deploy&.model_version
281
+ deploys.where(status: :success).order(id: :desc).limit(1).last&.model_version
277
282
  end
278
283
 
279
284
  alias_method :current_version, :inference_version
@@ -296,21 +301,21 @@ module EasyML
296
301
  )
297
302
  end
298
303
 
299
- def prepare_predict(xs)
304
+ def prepare_predict(xs, normalized: false)
300
305
  load_model!
301
- unless xs.is_a?(XGBoost::DMatrix)
306
+ if !normalized
302
307
  xs = dataset.normalize(xs, inference: true)
303
308
  end
304
309
  xs
305
310
  end
306
311
 
307
- def predict(xs)
308
- xs = prepare_predict(xs)
312
+ def predict(xs, normalized: false)
313
+ xs = prepare_predict(xs, normalized: normalized)
309
314
  adapter.predict(xs)
310
315
  end
311
316
 
312
- def predict_proba(xs)
313
- xs = prepare_predict(xs)
317
+ def predict_proba(xs, normalized: false)
318
+ xs = prepare_predict(xs, normalized: normalized)
314
319
  adapter.predict_proba(xs)
315
320
  end
316
321
 
@@ -49,7 +49,7 @@ module EasyML
49
49
  x_valid, y_valid = valid_dataset
50
50
  x_valid = x_valid.select(model.dataset.col_order(inference: true))
51
51
  @preprocessed ||= model.preprocess(x_valid, y_valid)
52
- y_pred = model.predict(@preprocessed)
52
+ y_pred = model.predict(@preprocessed, normalized: true)
53
53
  dataset = model.dataset.processed.valid(all_columns: true)
54
54
 
55
55
  metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
@@ -78,7 +78,7 @@ module EasyML
78
78
 
79
79
  track_feature_importance(booster)
80
80
  if tuner.nil?
81
- track_cumulative_feature_importance(false)
81
+ track_cumulative_feature_importance
82
82
  end
83
83
 
84
84
  booster
@@ -17,8 +17,8 @@ module EasyML
17
17
  class Prediction < ActiveRecord::Base
18
18
  self.table_name = "easy_ml_predictions"
19
19
 
20
- belongs_to :model
21
- belongs_to :model_history, optional: true
20
+ belongs_to :model, class_name: "EasyML::Model"
21
+ belongs_to :model_history, class_name: "EasyML::ModelHistory", optional: true
22
22
 
23
23
  validates :model_id, presence: true
24
24
  validates :prediction_type, presence: true, inclusion: { in: %w[regression classification] }
@@ -10,6 +10,8 @@ module EasyML
10
10
  object.prediction_value.symbolize_keys.dig(:value)
11
11
  when Numeric
12
12
  object.prediction_value
13
+ when Array
14
+ object.prediction_value
13
15
  end
14
16
  end
15
17
 
@@ -153,6 +153,8 @@ module EasyML
153
153
 
154
154
  def normalize_input(input)
155
155
  case input
156
+ when Polars::LazyFrame
157
+ normalize_input(input.collect)
156
158
  when Array
157
159
  if input.first.class == TrueClass || input.first.class == FalseClass
158
160
  input = input.map { |value| value ? 1.0 : 0.0 }
@@ -147,7 +147,7 @@ module EasyML
147
147
  end
148
148
  end
149
149
 
150
- y_pred = model.predict(x_normalized)
150
+ y_pred = model.predict(x_normalized, normalized: true)
151
151
  model.metrics = metrics
152
152
  metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
153
153
  metric = metrics.symbolize_keys.dig(model.evaluator[:metric].to_sym)
@@ -24,18 +24,31 @@ module EasyML
24
24
 
25
25
  def compact
26
26
  files = self.files
27
+ rows = query(lazy: true).collect
28
+ return unless rows.shape[0] > 0
29
+
30
+ FileUtils.rm(files)
27
31
 
28
32
  clear_unique_id
29
33
  File.join(root_dir, "compacted.parquet").tap do |target_file|
30
- safe_write(
31
- query(lazy: true),
32
- target_file
33
- )
34
- FileUtils.rm(files)
34
+ safe_write(rows, target_file)
35
35
  end
36
36
  clear_unique_id
37
37
  end
38
38
 
39
+ def cp(from,to)
40
+ return if from.nil? || !Dir.exist?(from)
41
+
42
+ FileUtils.mkdir_p(to)
43
+ files_to_cp = Dir.glob(Pathname.new(from).join("**/*")).select { |f| File.file?(f) }
44
+
45
+ files_to_cp.each do |file|
46
+ target_file = file.gsub(from, to)
47
+ FileUtils.mkdir_p(File.dirname(target_file))
48
+ FileUtils.cp(file, target_file)
49
+ end
50
+ end
51
+
39
52
  def unlock!
40
53
  clear_all_keys
41
54
  end
@@ -65,6 +78,8 @@ module EasyML
65
78
  end
66
79
 
67
80
  def safe_write(df, path)
81
+ raise "df must be a Polars::DataFrame or Polars::LazyFrame" unless df.is_a?(Polars::DataFrame) || df.is_a?(Polars::LazyFrame)
82
+
68
83
  FileUtils.mkdir_p(File.dirname(path))
69
84
  if df.is_a?(Polars::LazyFrame)
70
85
  # Depending on the query plan, sometimes sink_parquet will throw an error...
@@ -81,6 +96,10 @@ module EasyML
81
96
  df.write_parquet(path)
82
97
  end
83
98
  path
99
+ ensure
100
+ if Polars.scan_parquet(path).limit(1).schema.keys.empty?
101
+ raise "Failed to store to #{path}"
102
+ end
84
103
  end
85
104
 
86
105
  def clear_all_keys
@@ -17,9 +17,7 @@ module EasyML
17
17
  end
18
18
 
19
19
  def wipe
20
- partitions.each do |partition|
21
- FileUtils.rm_rf(File.join(root_dir, partition))
22
- end
20
+ super
23
21
  clear_all_keys
24
22
  end
25
23
 
@@ -33,22 +31,37 @@ module EasyML
33
31
  end
34
32
 
35
33
  def compact
36
- files = self.files
34
+ return if compacted?
35
+
37
36
  @df = query(lazy: true)
38
37
 
39
38
  clear_unique_id(subdir: "compacted")
40
39
  compact_each_partition.tap do
41
- FileUtils.rm(files)
42
40
  clear_unique_id
43
41
  end
42
+ uncompacted_folders.each do |folder|
43
+ FileUtils.rm_rf(File.join(root_dir, folder))
44
+ end
44
45
  end
45
46
 
46
47
  private
47
48
 
48
- def partitions
49
- Dir.glob(File.join(root_dir, "**/*")).map { |f| f.split("/").last }
49
+ def compacted?
50
+ uncompacted_folders.empty?
50
51
  end
51
52
 
53
+ def uncompacted_folders
54
+ folders - ["compacted"]
55
+ end
56
+
57
+ def folders
58
+ Dir.glob(File.join(root_dir, "**/*")).select { |f| File.directory?(f) }.map { |f| f.split("/").last }
59
+ end
60
+
61
+ # def partitions
62
+ # Dir.glob(File.join(root_dir, "**/*")).map { |f| f.split("/").last }
63
+ # end
64
+
52
65
  def compact_each_partition
53
66
  with_each_partition do |partition_df, _|
54
67
  safe_write(
@@ -31,6 +31,10 @@ module EasyML
31
31
  adapter_class.new(options).unlock!
32
32
  end
33
33
 
34
+ def cp(from, to)
35
+ adapter_class.new(options).cp(from, to)
36
+ end
37
+
34
38
  def store(df, *args)
35
39
  return df if df.is_a?(Polars::LazyFrame) ? df.schema.empty? : df.empty?
36
40
 
@@ -51,6 +51,10 @@ module EasyML
51
51
  def num_rows
52
52
  Reader.num_rows
53
53
  end
54
+
55
+ def cp(from, to)
56
+ Writer.cp(from, to)
57
+ end
54
58
  end
55
59
 
56
60
  def list_nulls(input = nil, **kwargs, &block)
@@ -121,8 +121,6 @@ module EasyML
121
121
  polars_type ? sym_to_polars(type_name) : type_name
122
122
  end
123
123
 
124
- measure_method_timing :determine_type
125
-
126
124
  # Determines if a string field is a date, text, or categorical
127
125
  # @param series [Polars::Series] The string series to analyze
128
126
  # @return [Symbol] One of :datetime, :text, or :categorical
@@ -149,8 +147,6 @@ module EasyML
149
147
  end
150
148
  end
151
149
 
152
- measure_method_timing :determine_string_type
153
-
154
150
  # Determines if a string field is categorical or free text
155
151
  # @param series [Polars::Series] The string series to analyze
156
152
  # @return [Symbol] Either :categorical or :text
@@ -178,8 +174,6 @@ module EasyML
178
174
  avg_percentage < 1.0 ? :text : :categorical
179
175
  end
180
176
 
181
- measure_method_timing :categorical_or_text?
182
-
183
177
  # Returns whether the field type is numeric
184
178
  # @param field_type [Symbol] The field type to check
185
179
  # @return [Boolean]
@@ -23,20 +23,16 @@ module EasyML
23
23
  end
24
24
  end
25
25
 
26
- def cp(old_version, new_version)
27
- old_dir = feature_dir_for_version(old_version)
28
- new_dir = feature_dir_for_version(new_version)
29
-
30
- return if old_dir.nil? || !Dir.exist?(old_dir)
31
-
32
- FileUtils.mkdir_p(new_dir)
33
- files_to_cp = Dir.glob(Pathname.new(old_dir).join("**/*")).select { |f| File.file?(f) }
26
+ def synced?
27
+ files.any?
28
+ end
34
29
 
35
- files_to_cp.each do |file|
36
- target_file = file.gsub(old_dir, new_dir)
37
- FileUtils.mkdir_p(File.dirname(target_file))
38
- FileUtils.cp(file, target_file)
39
- end
30
+ def bump_version(version)
31
+ compact
32
+ cp(
33
+ feature_dir_for_version(version),
34
+ feature_dir_for_version(version + 1),
35
+ )
40
36
  end
41
37
 
42
38
  private
@@ -3,6 +3,7 @@ require "singleton"
3
3
  module EasyML
4
4
  class Predict
5
5
  include Singleton
6
+ include EasyML::Timing
6
7
 
7
8
  attr_reader :models
8
9
 
@@ -20,7 +21,7 @@ module EasyML
20
21
  def self.predict(model_name, df, serialize: false)
21
22
  df = normalize_input(df)
22
23
  output = make_predictions(model_name, df) do |model, normalized_df|
23
- model.predict(normalized_df)
24
+ model.predict(normalized_df, normalized: true)
24
25
  end
25
26
 
26
27
  if serialize
@@ -33,7 +34,7 @@ module EasyML
33
34
  def self.predict_proba(model_name, df, serialize: false)
34
35
  df = normalize_input(df)
35
36
  output = make_predictions(model_name, df) do |model, normalized_df|
36
- probas = model.predict_proba(normalized_df)
37
+ probas = model.predict_proba(normalized_df, normalized: true)
37
38
  probas.map { |proba_array| proba_array.map { |p| p.round(4) } }
38
39
  end
39
40
 
@@ -91,8 +92,8 @@ module EasyML
91
92
 
92
93
  output = predictions.zip(raw_input, normalized_input).map do |pred, raw, norm|
93
94
  EasyML::Prediction.create!(
94
- model: current_version.model,
95
- model_history: current_version,
95
+ model_id: current_version.model.id,
96
+ model_history_id: current_version.id,
96
97
  prediction_type: current_version.model.task,
97
98
  prediction_value: pred,
98
99
  raw_input: raw,
@@ -59,6 +59,7 @@ module EasyML
59
59
  add_pca_model_id_to_easy_ml_columns
60
60
  add_workflow_status_to_easy_ml_dataset_histories
61
61
  add_metadata_to_easy_ml_predictions
62
+ add_unique_constraint_to_dataset_names
62
63
  ].freeze
63
64
 
64
65
  # Specify the next migration number
@@ -0,0 +1,13 @@
1
+ class AddUniqueConstraintToDatasetNames < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
2
+ def change
3
+ if index_exists?(:easy_ml_datasets, :name)
4
+ remove_index :easy_ml_datasets, :name
5
+ end
6
+ add_index :easy_ml_datasets, :name, unique: true
7
+
8
+ if index_exists?(:easy_ml_dataset_histories, :name)
9
+ remove_index :easy_ml_dataset_histories, :name
10
+ end
11
+ add_index :easy_ml_dataset_histories, :name, unique: true
12
+ end
13
+ end
@@ -19,7 +19,9 @@ module EasyML
19
19
  result = send(method_alias, *args, **kwargs, &block)
20
20
  ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
21
21
  elapsed = ending - starting
22
- puts "#{method_name} took #{elapsed.round(2)} seconds"
22
+ 10.times do
23
+ puts "#{method_name} took #{elapsed.round(2)} seconds"
24
+ end
23
25
  # StatsD.measure("#{Rails.env}.#{prefix.present? ? "#{prefix}." : ""}#{method_name}.timing", elapsed)
24
26
  result
25
27
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EasyML
4
- VERSION = "0.2.0-rc89"
4
+ VERSION = "0.2.0-rc91"
5
5
 
6
6
  module Version
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: easy_ml
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0.pre.rc89
4
+ version: 0.2.0.pre.rc91
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brett Shollenberger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-03-03 00:00:00.000000000 Z
11
+ date: 2025-03-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activerecord
@@ -803,6 +803,7 @@ files:
803
803
  - lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt
804
804
  - lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt
805
805
  - lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt
806
+ - lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_dataset_names.rb.tt
806
807
  - lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt
807
808
  - lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_dataset_histories.rb.tt
808
809
  - lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_features.rb.tt