easy_ml 0.2.0.pre.rc89 → 0.2.0.pre.rc90

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/predictions_controller.rb +9 -4
  3. data/app/jobs/easy_ml/training_job.rb +2 -2
  4. data/app/models/easy_ml/column/imputers/base.rb +1 -1
  5. data/app/models/easy_ml/column/imputers/categorical.rb +1 -1
  6. data/app/models/easy_ml/column/imputers/embedding_encoder.rb +2 -0
  7. data/app/models/easy_ml/column/imputers/imputer.rb +4 -0
  8. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +1 -0
  9. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -0
  10. data/app/models/easy_ml/column.rb +25 -1
  11. data/app/models/easy_ml/column_list/imputer.rb +4 -0
  12. data/app/models/easy_ml/column_list.rb +19 -8
  13. data/app/models/easy_ml/dataset/learner.rb +0 -10
  14. data/app/models/easy_ml/dataset.rb +23 -44
  15. data/app/models/easy_ml/deploy.rb +28 -19
  16. data/app/models/easy_ml/feature.rb +10 -8
  17. data/app/models/easy_ml/feature_history.rb +9 -0
  18. data/app/models/easy_ml/model.rb +12 -7
  19. data/app/models/easy_ml/models/xgboost/evals_callback.rb +2 -2
  20. data/app/models/easy_ml/prediction.rb +2 -2
  21. data/app/serializers/easy_ml/prediction_serializer.rb +2 -0
  22. data/lib/easy_ml/core/model_evaluator.rb +2 -0
  23. data/lib/easy_ml/core/tuner.rb +1 -1
  24. data/lib/easy_ml/data/dataset_manager/writer/base.rb +24 -5
  25. data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +21 -6
  26. data/lib/easy_ml/data/dataset_manager/writer.rb +4 -0
  27. data/lib/easy_ml/data/dataset_manager.rb +4 -0
  28. data/lib/easy_ml/data/polars_column.rb +0 -6
  29. data/lib/easy_ml/feature_store.rb +9 -13
  30. data/lib/easy_ml/predict.rb +5 -4
  31. data/lib/easy_ml/timing.rb +3 -1
  32. data/lib/easy_ml/version.rb +1 -1
  33. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: a0eb5ce84bdd93da3ea53e97f1b1ceab81a529a9bb076596f4edf7e49349eadf
- data.tar.gz: 5262f39ff5a1236729d28a8fa6715d1b12e6dd5b4319225a6df512493872bba0
+ metadata.gz: '08be0b67dba395b4aa3493a0a0fa6e5cde31246f299a14460590c8ced298d557'
+ data.tar.gz: d72734f3d5045e1f3554eecadb641d141fbb89f02b3c6f0cb265e5588c6d2866
  SHA512:
- metadata.gz: 8e8094ed3309b80e0ee70543667e7af982e0805ba6fe81a62ad2f46297eae4487440a4e6ffdca75f32ef045ef8b25440a8396032cf00f9fd6c191d63fc8c0386
- data.tar.gz: 97d057eb4ffa2acdb319a52de427cc3f3e7c42db8de349d063b36cd91479c68eca10ec150689d11070f0d20f24b620c0ce7a77f3e335420dd8c73612d706d1a0
+ metadata.gz: a873a1cf9b00fd84dc0912392f4de8140eff18377c97e3d9464c7ce2d73a3a82c16f0ae34d09b095be290ca6cb287bc28bfa76864d1e003db99d308bc44413da
+ data.tar.gz: 47bc5ff93e92e8d51b62dad043917ce3b110a4141aa9e404456f2934dbe4b5d198506af81af7ddda78b511b96a564abc6df62834e376b02a4d7a6859dafdecfc
data/app/controllers/easy_ml/predictions_controller.rb CHANGED
@@ -4,22 +4,27 @@ module EasyML
 
  def create
  slug = params[:model]
- unless EasyML::Model.find_by(slug: slug).inference_version.present?
+ model = EasyML::Model.find_by(slug: slug)
+ unless model.present?
+ return render json: { error: "Model not found" }, status: :not_found
+ end
+
+ unless model.inference_version.present?
  return render json: { error: "Model not found" }, status: :not_found
  end
 
  unless params.key?(:input)
- return render json: { error: "Must provide key: input" }, status: :not_found
+ return render json: { error: "Must provide key: input" }, status: :unprocessable_entity
  end
  input = params[:input].permit!.to_h
 
  unless input.is_a?(Hash)
- return render json: { error: "Input must be a hash" }, status: :not_found
+ return render json: { error: "Input must be a hash" }, status: :unprocessable_entity
  end
 
  valid, fields = EasyML::Predict.validate_input(slug, input)
  unless valid
- return render json: { error: "Missing required fields: #{fields}" }, status: :not_found
+ return render json: { error: "Missing required fields: #{fields}" }, status: :unprocessable_entity
  end
 
  type = (params[:type] || :predict).to_sym
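The create action now returns 404 only when the model itself is missing and 422 for malformed requests. A hedged request-spec sketch of the new behavior; the mount path "/easy_ml/predictions" and the slugs are assumptions for illustration, not taken from the gem's routes:

RSpec.describe "EasyML::PredictionsController", type: :request do
  it "separates missing models from malformed input" do
    # Unknown slug: still a 404.
    post "/easy_ml/predictions", params: { model: "no-such-model", input: { x: 1 } }
    expect(response).to have_http_status(:not_found)

    # Known model but no :input key: now a 422 instead of a 404.
    post "/easy_ml/predictions", params: { model: "known-model" }
    expect(response).to have_http_status(:unprocessable_entity)
  end
end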
data/app/jobs/easy_ml/training_job.rb CHANGED
@@ -10,13 +10,13 @@ module EasyML
 
  @last_activity = Time.current
  setup_signal_traps
- @monitor_thread = start_monitor_thread
+ # @monitor_thread = start_monitor_thread
 
  @model.actually_train do |iteration_info|
  @last_activity = Time.current
  end
  ensure
- @monitor_thread&.exit
+ # @monitor_thread&.exit
  @model.unlock!
  end
 
data/app/models/easy_ml/column/imputers/base.rb CHANGED
@@ -26,7 +26,7 @@ module EasyML
  end
  end
 
- attr_accessor :column, :preprocessing_step
+ attr_accessor :column, :preprocessing_step, :encode
 
  def initialize(column, preprocessing_step)
  @column = column
data/app/models/easy_ml/column/imputers/categorical.rb CHANGED
@@ -12,7 +12,7 @@ module EasyML
  def transform(df)
  return df unless allowed_categories.present?
 
- case column.datatype
+ case column.datatype.to_sym
  when :categorical
  df = df.with_column(
  Polars.when(Polars.col(column.name).is_in(allowed_categories))
data/app/models/easy_ml/column/imputers/embedding_encoder.rb CHANGED
@@ -9,6 +9,8 @@ module EasyML
  end
 
  def transform(df)
+ return df unless encode
+
  df = column.embed(df)
  df
  end
data/app/models/easy_ml/column/imputers/imputer.rb CHANGED
@@ -43,6 +43,10 @@ module EasyML
  @adapters ||= ordered_adapters.map { |klass| klass.new(column, preprocessing_step) }.select { |adapter| allowed?(adapter) && adapter.applies? }
  end
 
+ def encode=(value)
+ adapters.each { |adapter| adapter.encode = value }
+ end
+
  def description
  adapters.map(&:description).compact.join(", ")
  end
data/app/models/easy_ml/column/imputers/one_hot_encoder.rb CHANGED
@@ -9,6 +9,7 @@ module EasyML
  end
 
  def transform(df)
+ return df unless encode
  return df unless allowed_categories.present?
 
  allowed_categories.each do |value|
data/app/models/easy_ml/column/imputers/ordinal_encoder.rb CHANGED
@@ -9,6 +9,7 @@ module EasyML
  end
 
  def transform(df)
+ return df unless encode
  return df unless label_encoder.present?
 
  case column.datatype
data/app/models/easy_ml/column.rb CHANGED
@@ -184,9 +184,10 @@ module EasyML
  end
  end
 
- def transform(df, inference: false, computed: false)
+ def transform(df, inference: false, encode: true)
  imputer = inference && imputers.inference.anything? ? imputers.inference : imputers.training
 
+ imputer.encode = encode
  df = imputer.transform(df)
  df
  end
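The new encode flag lets a column run its imputers without applying the ordinal, one-hot, or embedding encoders; Dataset#normalize (further down in this diff) uses this to impute before features are computed and encode afterwards. A minimal usage sketch, assuming a persisted column with a categorical preprocessing step (the names are hypothetical):

# Hypothetical column on an existing dataset.
column = dataset.columns.find_by(name: "Embarked")

df = Polars::DataFrame.new({ "Embarked" => ["S", "C", nil] })

# encode: false still fills missing values but leaves the raw category
# labels in place, so downstream features see human-readable values.
imputed_only = column.transform(df, encode: false)

# The default (encode: true) also applies the configured encoder.
encoded = column.transform(df)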
@@ -513,6 +514,29 @@ module EasyML
  EasyML::Import::Column.from_config(config, dataset, action: action)
  end
 
+ def cast_statement(df, df_col, expected_dtype)
+ expected_dtype = expected_dtype.class
+ actual_type = df[df_col].dtype
+
+ cast_statement = case expected_dtype
+ when Polars::Boolean
+ case actual_type
+ when Polars::Boolean
+ Polars.col(df_col).cast(expected_dtype)
+ when Polars::String, Polars::Categorical
+ Polars.col(df_col).eq("true").cast(expected_dtype)
+ when Polars::Null
+ Polars.col(df_col)
+ else
+ raise "Unexpected dtype: #{actual_type} for column: #{df_col}"
+ end
+ else
+ Polars.col(df_col).cast(expected_dtype)
+ end
+
+ cast_statement.alias(df_col)
+ end
+
  def cast(value)
  return value if value.nil?
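cast_statement builds a Polars expression that coerces an incoming column to the schema's dtype, with special handling for booleans that arrive as strings. A rough standalone sketch of the expression it would produce for a Polars::Boolean schema entry when the payload dtype is String (column name and values are made up; polars-df is assumed to be loaded, as it is inside EasyML):

df = Polars::DataFrame.new({ "is_active" => ["true", "false", "true"] })

# Equivalent expression for the String -> Boolean branch above.
expr = Polars.col("is_active").eq("true").cast(Polars::Boolean).alias("is_active")

df.with_columns([expr])
# => is_active becomes [true, false, true]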
 
data/app/models/easy_ml/column_list/imputer.rb CHANGED
@@ -15,6 +15,10 @@ module EasyML
  @imputers ||= columns.map { |column| inference ? column.imputers(@_imputers).inference : column.imputers(@_imputers).training }
  end
 
+ def encode=(encode)
+ imputers.each { |imputer| imputer.encode = encode }
+ end
+
  def exprs
  imputers.flat_map(&:exprs).compact
  end
data/app/models/easy_ml/column_list.rb CHANGED
@@ -22,7 +22,7 @@ module EasyML
  end
  end
 
- def transform(df, inference: false, computed: false)
+ def transform(df, inference: false, computed: false, encode: true)
  return df if df.nil?
 
  if computed
@@ -33,14 +33,12 @@
 
  by_name = cols.index_by(&:name)
  cols.each do |column|
- df = column.transform(df, inference: inference, computed: computed) if column
+ df = column.transform(df, inference: inference, encode: encode) if column
  end
 
  df
  end
 
- measure_method_timing :transform
-
  def apply_clip(df)
  clip_cols = has_clip.raw
  return df unless clip_cols.any?
@@ -60,8 +58,6 @@ module EasyML
  reload
  end
 
- measure_method_timing :learn
-
  def statistics
  stats = { raw: {}, processed: {} }
  select(&:persisted?).inject(stats) do |h, col|
@@ -94,6 +90,23 @@ module EasyML
  end.sort.map { |arr| arr[1] }.uniq
  end
 
+ def apply_cast(df)
+ schema = dataset.schema
+ column_index = reduce({}) do |h, col|
+ h.tap do
+ col.aliases.each do |alias_name|
+ h[alias_name] = col
+ end
+ end
+ end
+ cast_statements = (df.columns & schema.keys.map(&:to_s)).map do |df_col|
+ db_col = column_index[df_col]
+ expected_dtype = schema[df_col.to_sym]
+ db_col.cast_statement(df, df_col, expected_dtype)
+ end
+ df = df.with_columns(cast_statements)
+ end
+
  def cast(processed_or_raw)
  columns = where(is_computed: false)
  is_processed = processed_or_raw == :processed
@@ -154,8 +167,6 @@ module EasyML
  EasyML::Lineage.import(lineage, on_duplicate_key_update: { columns: %i[ column_id key occurred_at description ] })
  end
 
- measure_method_timing :set_feature_lineage
-
  private
 
  def import_new(new_columns, existing_columns)
data/app/models/easy_ml/dataset/learner.rb CHANGED
@@ -57,8 +57,6 @@ module EasyML
  dataset.columns.set_feature_lineage(columns)
  end
 
- measure_method_timing :save_statistics
-
  def learn_statistics
  return @statistics if @statistics
 
@@ -78,8 +76,6 @@ module EasyML
  end
  end
 
- measure_method_timing :learn_statistics
-
  def prepare
  @schema = EasyML::Data::PolarsSchema.simplify(@dataset.raw_schema).symbolize_keys
  @raw_columns = @schema.keys.sort.map(&:to_s)
@@ -93,19 +89,13 @@ module EasyML
  EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[in_raw_dataset datatype] })
  end
 
- measure_method_timing :prepare
-
  def lazy_statistics
  Lazy.new(dataset, columns, type: type).learn
  end
 
- measure_method_timing :lazy_statistics
-
  def eager_statistics
  Eager.new(dataset, columns, type: type).learn
  end
-
- measure_method_timing :eager_statistics
  end
  end
  end
data/app/models/easy_ml/dataset.rb CHANGED
@@ -215,9 +215,9 @@ module EasyML
 
  @raw = raw.cp(version)
  @processed = processed.cp(version)
- features.each(&:bump_version)
-
- save
+ save.tap do
+ features.each(&:bump_version)
+ end
  end
 
  def refreshed_datasource?
@@ -257,9 +257,6 @@ module EasyML
  end
  end
 
- include EasyML::Timing
- measure_method_timing :actually_refresh
-
  def refresh!(async: false)
  refreshing do
  prepare!
@@ -276,29 +273,22 @@ module EasyML
  end
  end
 
- measure_method_timing :refresh
-
  def fit_features!(async: false, features: self.features)
  fit_features(async: async, features: features, force: true)
  end
 
  def fit_features(async: false, features: self.features, force: false)
  features_to_compute = force ? features : features.needs_fit
- puts "Features to compute.... #{features_to_compute}"
  return after_fit_features if features_to_compute.empty?
 
  features.first.fit(features: features_to_compute, async: async)
  end
 
- measure_method_timing :fit_features
-
  def after_fit_features
- puts "After fit features"
  unlock!
  reload
  return if failed?
 
- puts "Actually refresh..."
  actually_refresh
  end
 
@@ -476,15 +466,24 @@
 
  def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
  df = apply_missing_columns(df, inference: inference)
- df = columns.transform(df, inference: inference)
- df = apply_features(df, features)
- df = columns.transform(df, inference: inference)
+ df = transform_columns(df, inference: inference, encode: false)
+ df = apply_features(df, features, inference: inference)
+ df = apply_cast(df) if inference
+ df = transform_columns(df, inference: inference)
  df = apply_column_mask(df, inference: inference) unless all_columns
  df = drop_nulls(df) unless inference
  df, = processed.split_features_targets(df, true, target) if split_ys
  df
  end
 
+ def transform_columns(df, inference: false, encode: true)
+ columns.transform(df, inference: inference, encode: encode)
+ end
+
+ def apply_cast(df)
+ columns.apply_cast(df)
+ end
+
  # Massage out one-hot cats to their canonical name
  #
  # Takes: ["Sex_male", "Sex_female", "Embarked_c", "PassengerId"]
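For inference, normalize now runs in two passes: impute without encoding, apply features against the still-readable values, cast the payload back to the stored schema, then encode. A hedged sketch of what a single-row inference call now roughly does (the column names are illustrative only):

# Hypothetical inference payload.
df = Polars::DataFrame.new({ "Age" => [nil], "Sex" => ["male"] })

# Internally this now roughly does:
#   apply_missing_columns(df, inference: true)             # add absent columns as nulls
#   transform_columns(df, inference: true, encode: false)  # impute, keep raw categories
#   apply_features(df, features, inference: true)          # features see unencoded values
#   apply_cast(df)                                         # coerce payload dtypes to the schema
#   transform_columns(df, inference: true)                 # finally apply encoders
normalized = dataset.normalize(df, inference: true)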
@@ -503,8 +502,6 @@ module EasyML
  end.uniq.sort
  end
 
- measure_method_timing :normalize
-
  def missing_required_fields(df)
  desc_df = df.describe
 
@@ -633,21 +630,13 @@ module EasyML
  df[column_mask(df, inference: inference)]
  end
 
- measure_method_timing :apply_column_mask
-
- def apply_missing_columns(df, inference: false, include_one_hots: false)
+ def apply_missing_columns(df, inference: false)
  return df unless inference
 
- missing_columns = (col_order(inference: inference) - df.columns).compact
- unless include_one_hots
- columns.one_hots.each do |one_hot|
- virtual_columns = one_hot.virtual_columns
- if virtual_columns.all? { |vc| df.columns.include?(vc) }
- missing_columns -= columns.one_hots.flat_map(&:virtual_columns)
- else
- missing_columns += columns.one_hots.map(&:name) - df.columns
- end
- end
+ missing_columns = (col_order(inference: inference) - df.columns).compact.uniq
+ columns.one_hots.each do |one_hot|
+ missing_columns -= one_hot.virtual_columns
+ missing_columns += [one_hot.name]
  end
  df.with_columns(missing_columns.map { |f| Polars.lit(nil).alias(f) })
  end
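apply_missing_columns now always pads in the original one-hot source column (e.g. Sex) rather than its expanded virtual columns (Sex_male, Sex_female), leaving the expansion to the encoders. A small sketch of the expected effect, using column names borrowed from the one-hot comment above (the dataset shape is hypothetical, and send is used in case the helper is private):

# Hypothetical: the dataset has a one-hot "Sex" column plus a plain "Age" column.
df = Polars::DataFrame.new({ "Age" => [29.0] })

df = dataset.send(:apply_missing_columns, df, inference: true)
df.columns
# => roughly ["Age", "Sex"] for this hypothetical dataset; the Sex_* virtual
#    columns are produced later when transform_columns applies the encoder.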
@@ -771,8 +760,6 @@ module EasyML
  after_refresh_datasource
  end
 
- measure_method_timing :refresh_datasource
-
  def refresh_datasource!
  datasource.reload.refresh!
  after_refresh_datasource
@@ -798,8 +785,6 @@ module EasyML
  @normalized = true
  end
 
- measure_method_timing :normalize_all
-
  def learn_computed_columns(df)
  return unless features.ready_to_apply.any?
 
@@ -811,8 +796,6 @@ module EasyML
  processed.cleanup
  end
 
- measure_method_timing :learn_computed_columns
-
  def drop_nulls(df)
  return df if drop_if_null.nil? || drop_if_null.empty?
 
@@ -822,8 +805,6 @@ module EasyML
  df.drop_nulls(subset: drop)
  end
 
- measure_method_timing :drop_nulls
-
  # Pass refresh: false for frontend views so we don't query S3 during web requests
  def load_data(segment, **kwargs, &block)
  needs_refresh = kwargs.key?(:refresh) ? kwargs[:refresh] : needs_refresh?
@@ -876,8 +857,8 @@ module EasyML
  columns.find_by(name: column_name).update(is_date_column: true)
  end
 
- def apply_features(df, features = self.features)
- features = features.ready_to_apply
+ def apply_features(df, features = self.features, inference: false)
+ features = inference ? preloaded_features : features.ready_to_apply
  if features.nil? || features.empty?
  df
  else
@@ -897,15 +878,13 @@ module EasyML
  # Set SHA without querying
  feature.instance_variable_set(:@current_sha, shas[feature.feature_class])
 
- result = feature.transform_batch(acc_df)
+ result = feature.transform_batch(acc_df, inference: inference)
 
  result
  end
  end
  end
 
- measure_method_timing :apply_features
-
  def standardize_preprocessing_steps(type)
  columns.map(&:name).zip(columns.map do |col|
  col.preprocessing_steps&.dig(type)
data/app/models/easy_ml/deploy.rb CHANGED
@@ -48,28 +48,37 @@ module EasyML
 
  def actually_deploy
  lock_deploy do
- update(status: "running")
- EasyML::Event.create_event(self, "started")
-
- if identical_deploy.present?
- self.model_file = identical_deploy.model_file
- self.model_version = identical_deploy.model_version
- else
- if model_file.present?
- model.model_file = model_file
+ begin
+ update(status: "running")
+ EasyML::Event.create_event(self, "started")
+
+ if identical_deploy.present?
+ self.model_file = identical_deploy.model_file
+ self.model_version = identical_deploy.model_version
+ else
+ if model_file.present?
+ model.model_file = model_file
+ end
+ # model.load_model
+ self.model_version = model.actually_deploy
  end
- model.load_model
- self.model_version = model.actually_deploy
- end
 
- EasyML::Deploy.transaction do
- update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, status: :success)
- model.retraining_runs.where(status: :deployed).update_all(status: :success)
- retraining_run.update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, deploy_id: id, status: :deployed, is_deploying: false)
- end
+ EasyML::Deploy.transaction do
+ update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, status: :success)
+ model.retraining_runs.where(status: :deployed).update_all(status: :success)
+ retraining_run.update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, deploy_id: id, status: :deployed,)
+ end
 
- model_version.tap do
- EasyML::Event.create_event(self, "success")
+ model_version.tap do
+ EasyML::Event.create_event(self, "success")
+ end
+ rescue => e
+ update(status: "failed")
+ retraining_run.update(is_deploying: false)
+ EasyML::Event.create_event(self, "failed")
+ raise e
+ ensure
+ unlock!
  end
  end
  end
data/app/models/easy_ml/feature.rb CHANGED
@@ -82,7 +82,7 @@ module EasyML
  fittable = fittable.select(&:fittable?)
  where(id: fittable.map(&:id))
  end
- scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit).or(datasource_was_refreshed) }
+ scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit).or(datasource_was_refreshed).or(where(needs_fit: true)) }
  scope :datasource_was_refreshed, -> do
  where(id: all.select(&:datasource_was_refreshed?).map(&:id))
  end
@@ -310,9 +310,9 @@ module EasyML
  end
 
  # Transform a single batch, used for testing the user's feature implementation
- def transform_batch(df = nil, batch_args = {})
+ def transform_batch(df = nil, batch_args = {}, inference: false)
  if df.is_a?(Polars::DataFrame)
- actually_transform_batch(df)
+ actually_transform_batch(df, inference: inference)
  else
  actually_transform_batch(build_batch(get_batch_args(**batch_args)))
  end
@@ -374,11 +374,12 @@ module EasyML
  batch_df
  end
 
- def actually_transform_batch(df)
+ def actually_transform_batch(df, inference: false)
  return nil unless df.is_a?(Polars::DataFrame)
  return df if !adapter.respond_to?(:transform) && feature_store.empty?
 
  df_len_was = df.shape[0]
+ orig_df = df.clone
  begin
  result = adapter.transform(df, self)
  rescue => e
@@ -386,8 +387,10 @@ module EasyML
  end
  raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
  df_len_now = result.shape[0]
- raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if df_len_now != df_len_was
- update!(applied_at: Time.current)
+ missing_columns = orig_df.columns - result.columns
+ raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if (df_len_now != df_len_was)
+ raise "Feature #{feature_class} removed #{missing_columns} columns" if missing_columns.any?
+ update!(applied_at: Time.current) unless inference
  result
  end
 
@@ -432,9 +435,8 @@ module EasyML
  end
 
  def bump_version
- old_version = version
+ feature_store.bump_version(version)
  write_attribute(:version, version + 1)
- feature_store.cp(old_version, version)
  self
  end
 
data/app/models/easy_ml/feature_history.rb CHANGED
@@ -44,7 +44,16 @@ module EasyML
  scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
  scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
 
+ def wipe
+ false
+ end
+
  def download_remote_files
+ return unless snapshot_id # if not finished saving, skip
+ return if feature_store.synced?
+ return if @downloaded
+
+ @downloaded = true
  feature_store&.download
  end
  end
data/app/models/easy_ml/model.rb CHANGED
@@ -182,6 +182,7 @@ module EasyML
  lock_model do
  run = pending_run
  run.wrap_training do
+ dataset.refresh if dataset.needs_refresh?
  raise untrainable_error unless trainable?
 
  best_params = nil
@@ -210,6 +211,10 @@ module EasyML
  end
  end
 
+ def locked?
+ Support::Lockable.locked?(lock_key)
+ end
+
  def with_lock
  EasyML::Support::Lockable.with_lock(lock_key, stale_timeout: 60, resources: 1) do |client|
  yield client
@@ -273,7 +278,7 @@ module EasyML
  end
 
  def inference_version
- latest_deploy&.model_version
+ deploys.where(status: :success).order(id: :desc).limit(1).last&.model_version
  end
 
  alias_method :current_version, :inference_version
@@ -296,21 +301,21 @@ module EasyML
  )
  end
 
- def prepare_predict(xs)
+ def prepare_predict(xs, normalized: false)
  load_model!
- unless xs.is_a?(XGBoost::DMatrix)
+ if !normalized
  xs = dataset.normalize(xs, inference: true)
  end
  xs
  end
 
- def predict(xs)
- xs = prepare_predict(xs)
+ def predict(xs, normalized: false)
+ xs = prepare_predict(xs, normalized: normalized)
  adapter.predict(xs)
  end
 
- def predict_proba(xs)
- xs = prepare_predict(xs)
+ def predict_proba(xs, normalized: false)
+ xs = prepare_predict(xs, normalized: normalized)
  adapter.predict_proba(xs)
  end
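predict and predict_proba now take an explicit normalized: flag instead of sniffing for XGBoost::DMatrix, so callers that have already run Dataset#normalize (as EasyML::Predict, the tuner, and the evals callback now do) can skip a second normalization pass. A hedged usage sketch; the slug and input are hypothetical:

model = EasyML::Model.find_by(slug: "my-model") # hypothetical slug
raw = Polars::DataFrame.new({ "Age" => [22.0], "Sex" => ["male"] })

# Default: the model normalizes the input itself.
model.predict(raw)

# If the caller already normalized, pass normalized: true to avoid re-running the pipeline.
normalized = model.dataset.normalize(raw, inference: true)
model.predict(normalized, normalized: true)
model.predict_proba(normalized, normalized: true)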
 
data/app/models/easy_ml/models/xgboost/evals_callback.rb CHANGED
@@ -49,7 +49,7 @@ module EasyML
  x_valid, y_valid = valid_dataset
  x_valid = x_valid.select(model.dataset.col_order(inference: true))
  @preprocessed ||= model.preprocess(x_valid, y_valid)
- y_pred = model.predict(@preprocessed)
+ y_pred = model.predict(@preprocessed, normalized: true)
  dataset = model.dataset.processed.valid(all_columns: true)
 
  metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
@@ -78,7 +78,7 @@ module EasyML
 
  track_feature_importance(booster)
  if tuner.nil?
- track_cumulative_feature_importance(false)
+ track_cumulative_feature_importance
  end
 
  booster
data/app/models/easy_ml/prediction.rb CHANGED
@@ -17,8 +17,8 @@ module EasyML
  class Prediction < ActiveRecord::Base
  self.table_name = "easy_ml_predictions"
 
- belongs_to :model
- belongs_to :model_history, optional: true
+ belongs_to :model, class_name: "EasyML::Model"
+ belongs_to :model_history, class_name: "EasyML::ModelHistory", optional: true
 
  validates :model_id, presence: true
  validates :prediction_type, presence: true, inclusion: { in: %w[regression classification] }
data/app/serializers/easy_ml/prediction_serializer.rb CHANGED
@@ -10,6 +10,8 @@ module EasyML
  object.prediction_value.symbolize_keys.dig(:value)
  when Numeric
  object.prediction_value
+ when Array
+ object.prediction_value
  end
  end
 
data/lib/easy_ml/core/model_evaluator.rb CHANGED
@@ -153,6 +153,8 @@ module EasyML
 
  def normalize_input(input)
  case input
+ when Polars::LazyFrame
+ normalize_input(input.collect)
  when Array
  if input.first.class == TrueClass || input.first.class == FalseClass
  input = input.map { |value| value ? 1.0 : 0.0 }
data/lib/easy_ml/core/tuner.rb CHANGED
@@ -147,7 +147,7 @@ module EasyML
  end
  end
 
- y_pred = model.predict(x_normalized)
+ y_pred = model.predict(x_normalized, normalized: true)
  model.metrics = metrics
  metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
  metric = metrics.symbolize_keys.dig(model.evaluator[:metric].to_sym)
data/lib/easy_ml/data/dataset_manager/writer/base.rb CHANGED
@@ -24,18 +24,31 @@ module EasyML
 
  def compact
  files = self.files
+ rows = query(lazy: true).collect
+ return unless rows.shape[0] > 0
+
+ FileUtils.rm(files)
 
  clear_unique_id
  File.join(root_dir, "compacted.parquet").tap do |target_file|
- safe_write(
- query(lazy: true),
- target_file
- )
- FileUtils.rm(files)
+ safe_write(rows, target_file)
  end
  clear_unique_id
  end
 
+ def cp(from,to)
+ return if from.nil? || !Dir.exist?(from)
+
+ FileUtils.mkdir_p(to)
+ files_to_cp = Dir.glob(Pathname.new(from).join("**/*")).select { |f| File.file?(f) }
+
+ files_to_cp.each do |file|
+ target_file = file.gsub(from, to)
+ FileUtils.mkdir_p(File.dirname(target_file))
+ FileUtils.cp(file, target_file)
+ end
+ end
+
  def unlock!
  clear_all_keys
  end
@@ -65,6 +78,8 @@ module EasyML
  end
 
  def safe_write(df, path)
+ raise "df must be a Polars::DataFrame or Polars::LazyFrame" unless df.is_a?(Polars::DataFrame) || df.is_a?(Polars::LazyFrame)
+
  FileUtils.mkdir_p(File.dirname(path))
  if df.is_a?(Polars::LazyFrame)
  # Depending on the query plan, sometimes sink_parquet will throw an error...
@@ -81,6 +96,10 @@
  df.write_parquet(path)
  end
  path
+ ensure
+ if Polars.scan_parquet(path).limit(1).schema.keys.empty?
+ raise "Failed to store to #{path}"
+ end
  end
 
  def clear_all_keys
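safe_write now verifies that the written parquet file can actually be scanned before returning; an empty schema is treated as a failed write. A minimal standalone sketch of the same check, using the calls from the diff (the path and data are hypothetical):

path = "/tmp/easy_ml_example.parquet" # hypothetical path
df = Polars::DataFrame.new({ "id" => [1, 2, 3] })
df.write_parquet(path)

# The verification safe_write performs in its ensure block:
# scanning only the first row keeps the check cheap.
raise "Failed to store to #{path}" if Polars.scan_parquet(path).limit(1).schema.keys.empty?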
data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb CHANGED
@@ -17,8 +17,8 @@ module EasyML
  end
 
  def wipe
- partitions.each do |partition|
- FileUtils.rm_rf(File.join(root_dir, partition))
+ folders.each do |folder|
+ FileUtils.rm_rf(File.join(root_dir, folder))
  end
  clear_all_keys
  end
@@ -33,22 +33,37 @@ module EasyML
  end
 
  def compact
- files = self.files
+ return if compacted?
+
  @df = query(lazy: true)
 
  clear_unique_id(subdir: "compacted")
  compact_each_partition.tap do
- FileUtils.rm(files)
  clear_unique_id
  end
+ uncompacted_folders.each do |folder|
+ FileUtils.rm_rf(File.join(root_dir, folder))
+ end
  end
 
  private
 
- def partitions
- Dir.glob(File.join(root_dir, "**/*")).map { |f| f.split("/").last }
+ def compacted?
+ uncompacted_folders.empty?
+ end
+
+ def uncompacted_folders
+ folders - ["compacted"]
  end
 
+ def folders
+ Dir.glob(File.join(root_dir, "**/*")).select { |f| File.directory?(f) }.map { |f| f.split("/").last }
+ end
+
+ # def partitions
+ # Dir.glob(File.join(root_dir, "**/*")).map { |f| f.split("/").last }
+ # end
+
  def compact_each_partition
  with_each_partition do |partition_df, _|
  safe_write(
data/lib/easy_ml/data/dataset_manager/writer.rb CHANGED
@@ -31,6 +31,10 @@ module EasyML
  adapter_class.new(options).unlock!
  end
 
+ def cp(from, to)
+ adapter_class.new(options).cp(from, to)
+ end
+
  def store(df, *args)
  return df if df.is_a?(Polars::LazyFrame) ? df.schema.empty? : df.empty?
 
data/lib/easy_ml/data/dataset_manager.rb CHANGED
@@ -51,6 +51,10 @@ module EasyML
  def num_rows
  Reader.num_rows
  end
+
+ def cp(from, to)
+ Writer.cp(from, to)
+ end
  end
 
  def list_nulls(input = nil, **kwargs, &block)
data/lib/easy_ml/data/polars_column.rb CHANGED
@@ -121,8 +121,6 @@ module EasyML
  polars_type ? sym_to_polars(type_name) : type_name
  end
 
- measure_method_timing :determine_type
-
  # Determines if a string field is a date, text, or categorical
  # @param series [Polars::Series] The string series to analyze
  # @return [Symbol] One of :datetime, :text, or :categorical
@@ -149,8 +147,6 @@ module EasyML
  end
  end
 
- measure_method_timing :determine_string_type
-
  # Determines if a string field is categorical or free text
  # @param series [Polars::Series] The string series to analyze
  # @return [Symbol] Either :categorical or :text
@@ -178,8 +174,6 @@ module EasyML
  avg_percentage < 1.0 ? :text : :categorical
  end
 
- measure_method_timing :categorical_or_text?
-
  # Returns whether the field type is numeric
  # @param field_type [Symbol] The field type to check
  # @return [Boolean]
data/lib/easy_ml/feature_store.rb CHANGED
@@ -23,20 +23,16 @@ module EasyML
  end
  end
 
- def cp(old_version, new_version)
- old_dir = feature_dir_for_version(old_version)
- new_dir = feature_dir_for_version(new_version)
-
- return if old_dir.nil? || !Dir.exist?(old_dir)
-
- FileUtils.mkdir_p(new_dir)
- files_to_cp = Dir.glob(Pathname.new(old_dir).join("**/*")).select { |f| File.file?(f) }
+ def synced?
+ files.any?
+ end
 
- files_to_cp.each do |file|
- target_file = file.gsub(old_dir, new_dir)
- FileUtils.mkdir_p(File.dirname(target_file))
- FileUtils.cp(file, target_file)
- end
+ def bump_version(version)
+ compact
+ cp(
+ feature_dir_for_version(version),
+ feature_dir_for_version(version + 1),
+ )
  end
 
  private
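Version bumps now go through the feature store itself: Feature#bump_version asks the store to compact and copy the current version's directory forward before the record's version is incremented. A rough sketch of the flow, assuming a persisted feature currently at version 1 (the lookup is hypothetical):

feature = dataset.features.first # hypothetical feature at version 1

feature.bump_version
# Internally this now does roughly:
#   feature_store.bump_version(1)   # compact, then cp(v1 dir -> v2 dir)
#   write_attribute(:version, 2)    # in memory; the record still needs saving
feature.version # => 2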
data/lib/easy_ml/predict.rb CHANGED
@@ -3,6 +3,7 @@ require "singleton"
  module EasyML
  class Predict
  include Singleton
+ include EasyML::Timing
 
  attr_reader :models
 
@@ -20,7 +21,7 @@ module EasyML
  def self.predict(model_name, df, serialize: false)
  df = normalize_input(df)
  output = make_predictions(model_name, df) do |model, normalized_df|
- model.predict(normalized_df)
+ model.predict(normalized_df, normalized: true)
  end
 
  if serialize
@@ -33,7 +34,7 @@ module EasyML
  def self.predict_proba(model_name, df, serialize: false)
  df = normalize_input(df)
  output = make_predictions(model_name, df) do |model, normalized_df|
- probas = model.predict_proba(normalized_df)
+ probas = model.predict_proba(normalized_df, normalized: true)
  probas.map { |proba_array| proba_array.map { |p| p.round(4) } }
  end
 
@@ -91,8 +92,8 @@
 
  output = predictions.zip(raw_input, normalized_input).map do |pred, raw, norm|
  EasyML::Prediction.create!(
- model: current_version.model,
- model_history: current_version,
+ model_id: current_version.model.id,
+ model_history_id: current_version.id,
  prediction_type: current_version.model.task,
  prediction_value: pred,
  raw_input: raw,
data/lib/easy_ml/timing.rb CHANGED
@@ -19,7 +19,9 @@ module EasyML
  result = send(method_alias, *args, **kwargs, &block)
  ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  elapsed = ending - starting
- puts "#{method_name} took #{elapsed.round(2)} seconds"
+ 10.times do
+ puts "#{method_name} took #{elapsed.round(2)} seconds"
+ end
  # StatsD.measure("#{Rails.env}.#{prefix.present? ? "#{prefix}." : ""}#{method_name}.timing", elapsed)
  result
  end
data/lib/easy_ml/version.rb CHANGED
@@ -1,7 +1,7 @@
  # frozen_string_literal: true
 
  module EasyML
- VERSION = "0.2.0-rc89"
+ VERSION = "0.2.0-rc90"
 
  module Version
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: easy_ml
  version: !ruby/object:Gem::Version
- version: 0.2.0.pre.rc89
+ version: 0.2.0.pre.rc90
  platform: ruby
  authors:
  - Brett Shollenberger
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2025-03-03 00:00:00.000000000 Z
+ date: 2025-03-04 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: activerecord