easy_ml 0.2.0.pre.rc89 → 0.2.0.pre.rc90
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/predictions_controller.rb +9 -4
- data/app/jobs/easy_ml/training_job.rb +2 -2
- data/app/models/easy_ml/column/imputers/base.rb +1 -1
- data/app/models/easy_ml/column/imputers/categorical.rb +1 -1
- data/app/models/easy_ml/column/imputers/embedding_encoder.rb +2 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +4 -0
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +1 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -0
- data/app/models/easy_ml/column.rb +25 -1
- data/app/models/easy_ml/column_list/imputer.rb +4 -0
- data/app/models/easy_ml/column_list.rb +19 -8
- data/app/models/easy_ml/dataset/learner.rb +0 -10
- data/app/models/easy_ml/dataset.rb +23 -44
- data/app/models/easy_ml/deploy.rb +28 -19
- data/app/models/easy_ml/feature.rb +10 -8
- data/app/models/easy_ml/feature_history.rb +9 -0
- data/app/models/easy_ml/model.rb +12 -7
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +2 -2
- data/app/models/easy_ml/prediction.rb +2 -2
- data/app/serializers/easy_ml/prediction_serializer.rb +2 -0
- data/lib/easy_ml/core/model_evaluator.rb +2 -0
- data/lib/easy_ml/core/tuner.rb +1 -1
- data/lib/easy_ml/data/dataset_manager/writer/base.rb +24 -5
- data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +21 -6
- data/lib/easy_ml/data/dataset_manager/writer.rb +4 -0
- data/lib/easy_ml/data/dataset_manager.rb +4 -0
- data/lib/easy_ml/data/polars_column.rb +0 -6
- data/lib/easy_ml/feature_store.rb +9 -13
- data/lib/easy_ml/predict.rb +5 -4
- data/lib/easy_ml/timing.rb +3 -1
- data/lib/easy_ml/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '08be0b67dba395b4aa3493a0a0fa6e5cde31246f299a14460590c8ced298d557'
|
4
|
+
data.tar.gz: d72734f3d5045e1f3554eecadb641d141fbb89f02b3c6f0cb265e5588c6d2866
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a873a1cf9b00fd84dc0912392f4de8140eff18377c97e3d9464c7ce2d73a3a82c16f0ae34d09b095be290ca6cb287bc28bfa76864d1e003db99d308bc44413da
|
7
|
+
data.tar.gz: 47bc5ff93e92e8d51b62dad043917ce3b110a4141aa9e404456f2934dbe4b5d198506af81af7ddda78b511b96a564abc6df62834e376b02a4d7a6859dafdecfc
|
@@ -4,22 +4,27 @@ module EasyML
|
|
4
4
|
|
5
5
|
def create
|
6
6
|
slug = params[:model]
|
7
|
-
|
7
|
+
model = EasyML::Model.find_by(slug: slug)
|
8
|
+
unless model.present?
|
9
|
+
return render json: { error: "Model not found" }, status: :not_found
|
10
|
+
end
|
11
|
+
|
12
|
+
unless model.inference_version.present?
|
8
13
|
return render json: { error: "Model not found" }, status: :not_found
|
9
14
|
end
|
10
15
|
|
11
16
|
unless params.key?(:input)
|
12
|
-
return render json: { error: "Must provide key: input" }, status: :
|
17
|
+
return render json: { error: "Must provide key: input" }, status: :unprocessable_entity
|
13
18
|
end
|
14
19
|
input = params[:input].permit!.to_h
|
15
20
|
|
16
21
|
unless input.is_a?(Hash)
|
17
|
-
return render json: { error: "Input must be a hash" }, status: :
|
22
|
+
return render json: { error: "Input must be a hash" }, status: :unprocessable_entity
|
18
23
|
end
|
19
24
|
|
20
25
|
valid, fields = EasyML::Predict.validate_input(slug, input)
|
21
26
|
unless valid
|
22
|
-
return render json: { error: "Missing required fields: #{fields}" }, status: :
|
27
|
+
return render json: { error: "Missing required fields: #{fields}" }, status: :unprocessable_entity
|
23
28
|
end
|
24
29
|
|
25
30
|
type = (params[:type] || :predict).to_sym
|
@@ -10,13 +10,13 @@ module EasyML
|
|
10
10
|
|
11
11
|
@last_activity = Time.current
|
12
12
|
setup_signal_traps
|
13
|
-
@monitor_thread = start_monitor_thread
|
13
|
+
# @monitor_thread = start_monitor_thread
|
14
14
|
|
15
15
|
@model.actually_train do |iteration_info|
|
16
16
|
@last_activity = Time.current
|
17
17
|
end
|
18
18
|
ensure
|
19
|
-
@monitor_thread&.exit
|
19
|
+
# @monitor_thread&.exit
|
20
20
|
@model.unlock!
|
21
21
|
end
|
22
22
|
|
@@ -43,6 +43,10 @@ module EasyML
|
|
43
43
|
@adapters ||= ordered_adapters.map { |klass| klass.new(column, preprocessing_step) }.select { |adapter| allowed?(adapter) && adapter.applies? }
|
44
44
|
end
|
45
45
|
|
46
|
+
def encode=(value)
|
47
|
+
adapters.each { |adapter| adapter.encode = value }
|
48
|
+
end
|
49
|
+
|
46
50
|
def description
|
47
51
|
adapters.map(&:description).compact.join(", ")
|
48
52
|
end
|
@@ -184,9 +184,10 @@ module EasyML
|
|
184
184
|
end
|
185
185
|
end
|
186
186
|
|
187
|
-
def transform(df, inference: false,
|
187
|
+
def transform(df, inference: false, encode: true)
|
188
188
|
imputer = inference && imputers.inference.anything? ? imputers.inference : imputers.training
|
189
189
|
|
190
|
+
imputer.encode = encode
|
190
191
|
df = imputer.transform(df)
|
191
192
|
df
|
192
193
|
end
|
@@ -513,6 +514,29 @@ module EasyML
|
|
513
514
|
EasyML::Import::Column.from_config(config, dataset, action: action)
|
514
515
|
end
|
515
516
|
|
517
|
+
def cast_statement(df, df_col, expected_dtype)
|
518
|
+
expected_dtype = expected_dtype.class
|
519
|
+
actual_type = df[df_col].dtype
|
520
|
+
|
521
|
+
cast_statement = case expected_dtype
|
522
|
+
when Polars::Boolean
|
523
|
+
case actual_type
|
524
|
+
when Polars::Boolean
|
525
|
+
Polars.col(df_col).cast(expected_dtype)
|
526
|
+
when Polars::String, Polars::Categorical
|
527
|
+
Polars.col(df_col).eq("true").cast(expected_dtype)
|
528
|
+
when Polars::Null
|
529
|
+
Polars.col(df_col)
|
530
|
+
else
|
531
|
+
raise "Unexpected dtype: #{actual_type} for column: #{df_col}"
|
532
|
+
end
|
533
|
+
else
|
534
|
+
Polars.col(df_col).cast(expected_dtype)
|
535
|
+
end
|
536
|
+
|
537
|
+
cast_statement.alias(df_col)
|
538
|
+
end
|
539
|
+
|
516
540
|
def cast(value)
|
517
541
|
return value if value.nil?
|
518
542
|
|
@@ -15,6 +15,10 @@ module EasyML
|
|
15
15
|
@imputers ||= columns.map { |column| inference ? column.imputers(@_imputers).inference : column.imputers(@_imputers).training }
|
16
16
|
end
|
17
17
|
|
18
|
+
def encode=(encode)
|
19
|
+
imputers.each { |imputer| imputer.encode = encode }
|
20
|
+
end
|
21
|
+
|
18
22
|
def exprs
|
19
23
|
imputers.flat_map(&:exprs).compact
|
20
24
|
end
|
@@ -22,7 +22,7 @@ module EasyML
|
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
25
|
-
def transform(df, inference: false, computed: false)
|
25
|
+
def transform(df, inference: false, computed: false, encode: true)
|
26
26
|
return df if df.nil?
|
27
27
|
|
28
28
|
if computed
|
@@ -33,14 +33,12 @@ module EasyML
|
|
33
33
|
|
34
34
|
by_name = cols.index_by(&:name)
|
35
35
|
cols.each do |column|
|
36
|
-
df = column.transform(df, inference: inference,
|
36
|
+
df = column.transform(df, inference: inference, encode: encode) if column
|
37
37
|
end
|
38
38
|
|
39
39
|
df
|
40
40
|
end
|
41
41
|
|
42
|
-
measure_method_timing :transform
|
43
|
-
|
44
42
|
def apply_clip(df)
|
45
43
|
clip_cols = has_clip.raw
|
46
44
|
return df unless clip_cols.any?
|
@@ -60,8 +58,6 @@ module EasyML
|
|
60
58
|
reload
|
61
59
|
end
|
62
60
|
|
63
|
-
measure_method_timing :learn
|
64
|
-
|
65
61
|
def statistics
|
66
62
|
stats = { raw: {}, processed: {} }
|
67
63
|
select(&:persisted?).inject(stats) do |h, col|
|
@@ -94,6 +90,23 @@ module EasyML
|
|
94
90
|
end.sort.map { |arr| arr[1] }.uniq
|
95
91
|
end
|
96
92
|
|
93
|
+
def apply_cast(df)
|
94
|
+
schema = dataset.schema
|
95
|
+
column_index = reduce({}) do |h, col|
|
96
|
+
h.tap do
|
97
|
+
col.aliases.each do |alias_name|
|
98
|
+
h[alias_name] = col
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
cast_statements = (df.columns & schema.keys.map(&:to_s)).map do |df_col|
|
103
|
+
db_col = column_index[df_col]
|
104
|
+
expected_dtype = schema[df_col.to_sym]
|
105
|
+
db_col.cast_statement(df, df_col, expected_dtype)
|
106
|
+
end
|
107
|
+
df = df.with_columns(cast_statements)
|
108
|
+
end
|
109
|
+
|
97
110
|
def cast(processed_or_raw)
|
98
111
|
columns = where(is_computed: false)
|
99
112
|
is_processed = processed_or_raw == :processed
|
@@ -154,8 +167,6 @@ module EasyML
|
|
154
167
|
EasyML::Lineage.import(lineage, on_duplicate_key_update: { columns: %i[ column_id key occurred_at description ] })
|
155
168
|
end
|
156
169
|
|
157
|
-
measure_method_timing :set_feature_lineage
|
158
|
-
|
159
170
|
private
|
160
171
|
|
161
172
|
def import_new(new_columns, existing_columns)
|
@@ -57,8 +57,6 @@ module EasyML
|
|
57
57
|
dataset.columns.set_feature_lineage(columns)
|
58
58
|
end
|
59
59
|
|
60
|
-
measure_method_timing :save_statistics
|
61
|
-
|
62
60
|
def learn_statistics
|
63
61
|
return @statistics if @statistics
|
64
62
|
|
@@ -78,8 +76,6 @@ module EasyML
|
|
78
76
|
end
|
79
77
|
end
|
80
78
|
|
81
|
-
measure_method_timing :learn_statistics
|
82
|
-
|
83
79
|
def prepare
|
84
80
|
@schema = EasyML::Data::PolarsSchema.simplify(@dataset.raw_schema).symbolize_keys
|
85
81
|
@raw_columns = @schema.keys.sort.map(&:to_s)
|
@@ -93,19 +89,13 @@ module EasyML
|
|
93
89
|
EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[in_raw_dataset datatype] })
|
94
90
|
end
|
95
91
|
|
96
|
-
measure_method_timing :prepare
|
97
|
-
|
98
92
|
def lazy_statistics
|
99
93
|
Lazy.new(dataset, columns, type: type).learn
|
100
94
|
end
|
101
95
|
|
102
|
-
measure_method_timing :lazy_statistics
|
103
|
-
|
104
96
|
def eager_statistics
|
105
97
|
Eager.new(dataset, columns, type: type).learn
|
106
98
|
end
|
107
|
-
|
108
|
-
measure_method_timing :eager_statistics
|
109
99
|
end
|
110
100
|
end
|
111
101
|
end
|
@@ -215,9 +215,9 @@ module EasyML
|
|
215
215
|
|
216
216
|
@raw = raw.cp(version)
|
217
217
|
@processed = processed.cp(version)
|
218
|
-
|
219
|
-
|
220
|
-
|
218
|
+
save.tap do
|
219
|
+
features.each(&:bump_version)
|
220
|
+
end
|
221
221
|
end
|
222
222
|
|
223
223
|
def refreshed_datasource?
|
@@ -257,9 +257,6 @@ module EasyML
|
|
257
257
|
end
|
258
258
|
end
|
259
259
|
|
260
|
-
include EasyML::Timing
|
261
|
-
measure_method_timing :actually_refresh
|
262
|
-
|
263
260
|
def refresh!(async: false)
|
264
261
|
refreshing do
|
265
262
|
prepare!
|
@@ -276,29 +273,22 @@ module EasyML
|
|
276
273
|
end
|
277
274
|
end
|
278
275
|
|
279
|
-
measure_method_timing :refresh
|
280
|
-
|
281
276
|
def fit_features!(async: false, features: self.features)
|
282
277
|
fit_features(async: async, features: features, force: true)
|
283
278
|
end
|
284
279
|
|
285
280
|
def fit_features(async: false, features: self.features, force: false)
|
286
281
|
features_to_compute = force ? features : features.needs_fit
|
287
|
-
puts "Features to compute.... #{features_to_compute}"
|
288
282
|
return after_fit_features if features_to_compute.empty?
|
289
283
|
|
290
284
|
features.first.fit(features: features_to_compute, async: async)
|
291
285
|
end
|
292
286
|
|
293
|
-
measure_method_timing :fit_features
|
294
|
-
|
295
287
|
def after_fit_features
|
296
|
-
puts "After fit features"
|
297
288
|
unlock!
|
298
289
|
reload
|
299
290
|
return if failed?
|
300
291
|
|
301
|
-
puts "Actually refresh..."
|
302
292
|
actually_refresh
|
303
293
|
end
|
304
294
|
|
@@ -476,15 +466,24 @@ module EasyML
|
|
476
466
|
|
477
467
|
def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
|
478
468
|
df = apply_missing_columns(df, inference: inference)
|
479
|
-
df =
|
480
|
-
df = apply_features(df, features)
|
481
|
-
df =
|
469
|
+
df = transform_columns(df, inference: inference, encode: false)
|
470
|
+
df = apply_features(df, features, inference: inference)
|
471
|
+
df = apply_cast(df) if inference
|
472
|
+
df = transform_columns(df, inference: inference)
|
482
473
|
df = apply_column_mask(df, inference: inference) unless all_columns
|
483
474
|
df = drop_nulls(df) unless inference
|
484
475
|
df, = processed.split_features_targets(df, true, target) if split_ys
|
485
476
|
df
|
486
477
|
end
|
487
478
|
|
479
|
+
def transform_columns(df, inference: false, encode: true)
|
480
|
+
columns.transform(df, inference: inference, encode: encode)
|
481
|
+
end
|
482
|
+
|
483
|
+
def apply_cast(df)
|
484
|
+
columns.apply_cast(df)
|
485
|
+
end
|
486
|
+
|
488
487
|
# Massage out one-hot cats to their canonical name
|
489
488
|
#
|
490
489
|
# Takes: ["Sex_male", "Sex_female", "Embarked_c", "PassengerId"]
|
@@ -503,8 +502,6 @@ module EasyML
|
|
503
502
|
end.uniq.sort
|
504
503
|
end
|
505
504
|
|
506
|
-
measure_method_timing :normalize
|
507
|
-
|
508
505
|
def missing_required_fields(df)
|
509
506
|
desc_df = df.describe
|
510
507
|
|
@@ -633,21 +630,13 @@ module EasyML
|
|
633
630
|
df[column_mask(df, inference: inference)]
|
634
631
|
end
|
635
632
|
|
636
|
-
|
637
|
-
|
638
|
-
def apply_missing_columns(df, inference: false, include_one_hots: false)
|
633
|
+
def apply_missing_columns(df, inference: false)
|
639
634
|
return df unless inference
|
640
635
|
|
641
|
-
missing_columns = (col_order(inference: inference) - df.columns).compact
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
if virtual_columns.all? { |vc| df.columns.include?(vc) }
|
646
|
-
missing_columns -= columns.one_hots.flat_map(&:virtual_columns)
|
647
|
-
else
|
648
|
-
missing_columns += columns.one_hots.map(&:name) - df.columns
|
649
|
-
end
|
650
|
-
end
|
636
|
+
missing_columns = (col_order(inference: inference) - df.columns).compact.uniq
|
637
|
+
columns.one_hots.each do |one_hot|
|
638
|
+
missing_columns -= one_hot.virtual_columns
|
639
|
+
missing_columns += [one_hot.name]
|
651
640
|
end
|
652
641
|
df.with_columns(missing_columns.map { |f| Polars.lit(nil).alias(f) })
|
653
642
|
end
|
@@ -771,8 +760,6 @@ module EasyML
|
|
771
760
|
after_refresh_datasource
|
772
761
|
end
|
773
762
|
|
774
|
-
measure_method_timing :refresh_datasource
|
775
|
-
|
776
763
|
def refresh_datasource!
|
777
764
|
datasource.reload.refresh!
|
778
765
|
after_refresh_datasource
|
@@ -798,8 +785,6 @@ module EasyML
|
|
798
785
|
@normalized = true
|
799
786
|
end
|
800
787
|
|
801
|
-
measure_method_timing :normalize_all
|
802
|
-
|
803
788
|
def learn_computed_columns(df)
|
804
789
|
return unless features.ready_to_apply.any?
|
805
790
|
|
@@ -811,8 +796,6 @@ module EasyML
|
|
811
796
|
processed.cleanup
|
812
797
|
end
|
813
798
|
|
814
|
-
measure_method_timing :learn_computed_columns
|
815
|
-
|
816
799
|
def drop_nulls(df)
|
817
800
|
return df if drop_if_null.nil? || drop_if_null.empty?
|
818
801
|
|
@@ -822,8 +805,6 @@ module EasyML
|
|
822
805
|
df.drop_nulls(subset: drop)
|
823
806
|
end
|
824
807
|
|
825
|
-
measure_method_timing :drop_nulls
|
826
|
-
|
827
808
|
# Pass refresh: false for frontend views so we don't query S3 during web requests
|
828
809
|
def load_data(segment, **kwargs, &block)
|
829
810
|
needs_refresh = kwargs.key?(:refresh) ? kwargs[:refresh] : needs_refresh?
|
@@ -876,8 +857,8 @@ module EasyML
|
|
876
857
|
columns.find_by(name: column_name).update(is_date_column: true)
|
877
858
|
end
|
878
859
|
|
879
|
-
def apply_features(df, features = self.features)
|
880
|
-
features = features.ready_to_apply
|
860
|
+
def apply_features(df, features = self.features, inference: false)
|
861
|
+
features = inference ? preloaded_features : features.ready_to_apply
|
881
862
|
if features.nil? || features.empty?
|
882
863
|
df
|
883
864
|
else
|
@@ -897,15 +878,13 @@ module EasyML
|
|
897
878
|
# Set SHA without querying
|
898
879
|
feature.instance_variable_set(:@current_sha, shas[feature.feature_class])
|
899
880
|
|
900
|
-
result = feature.transform_batch(acc_df)
|
881
|
+
result = feature.transform_batch(acc_df, inference: inference)
|
901
882
|
|
902
883
|
result
|
903
884
|
end
|
904
885
|
end
|
905
886
|
end
|
906
887
|
|
907
|
-
measure_method_timing :apply_features
|
908
|
-
|
909
888
|
def standardize_preprocessing_steps(type)
|
910
889
|
columns.map(&:name).zip(columns.map do |col|
|
911
890
|
col.preprocessing_steps&.dig(type)
|
@@ -48,28 +48,37 @@ module EasyML
|
|
48
48
|
|
49
49
|
def actually_deploy
|
50
50
|
lock_deploy do
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
51
|
+
begin
|
52
|
+
update(status: "running")
|
53
|
+
EasyML::Event.create_event(self, "started")
|
54
|
+
|
55
|
+
if identical_deploy.present?
|
56
|
+
self.model_file = identical_deploy.model_file
|
57
|
+
self.model_version = identical_deploy.model_version
|
58
|
+
else
|
59
|
+
if model_file.present?
|
60
|
+
model.model_file = model_file
|
61
|
+
end
|
62
|
+
# model.load_model
|
63
|
+
self.model_version = model.actually_deploy
|
60
64
|
end
|
61
|
-
model.load_model
|
62
|
-
self.model_version = model.actually_deploy
|
63
|
-
end
|
64
65
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
66
|
+
EasyML::Deploy.transaction do
|
67
|
+
update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, status: :success)
|
68
|
+
model.retraining_runs.where(status: :deployed).update_all(status: :success)
|
69
|
+
retraining_run.update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, deploy_id: id, status: :deployed,)
|
70
|
+
end
|
70
71
|
|
71
|
-
|
72
|
-
|
72
|
+
model_version.tap do
|
73
|
+
EasyML::Event.create_event(self, "success")
|
74
|
+
end
|
75
|
+
rescue => e
|
76
|
+
update(status: "failed")
|
77
|
+
retraining_run.update(is_deploying: false)
|
78
|
+
EasyML::Event.create_event(self, "failed")
|
79
|
+
raise e
|
80
|
+
ensure
|
81
|
+
unlock!
|
73
82
|
end
|
74
83
|
end
|
75
84
|
end
|
@@ -82,7 +82,7 @@ module EasyML
|
|
82
82
|
fittable = fittable.select(&:fittable?)
|
83
83
|
where(id: fittable.map(&:id))
|
84
84
|
end
|
85
|
-
scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit).or(datasource_was_refreshed) }
|
85
|
+
scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit).or(datasource_was_refreshed).or(where(needs_fit: true)) }
|
86
86
|
scope :datasource_was_refreshed, -> do
|
87
87
|
where(id: all.select(&:datasource_was_refreshed?).map(&:id))
|
88
88
|
end
|
@@ -310,9 +310,9 @@ module EasyML
|
|
310
310
|
end
|
311
311
|
|
312
312
|
# Transform a single batch, used for testing the user's feature implementation
|
313
|
-
def transform_batch(df = nil, batch_args = {})
|
313
|
+
def transform_batch(df = nil, batch_args = {}, inference: false)
|
314
314
|
if df.is_a?(Polars::DataFrame)
|
315
|
-
actually_transform_batch(df)
|
315
|
+
actually_transform_batch(df, inference: inference)
|
316
316
|
else
|
317
317
|
actually_transform_batch(build_batch(get_batch_args(**batch_args)))
|
318
318
|
end
|
@@ -374,11 +374,12 @@ module EasyML
|
|
374
374
|
batch_df
|
375
375
|
end
|
376
376
|
|
377
|
-
def actually_transform_batch(df)
|
377
|
+
def actually_transform_batch(df, inference: false)
|
378
378
|
return nil unless df.is_a?(Polars::DataFrame)
|
379
379
|
return df if !adapter.respond_to?(:transform) && feature_store.empty?
|
380
380
|
|
381
381
|
df_len_was = df.shape[0]
|
382
|
+
orig_df = df.clone
|
382
383
|
begin
|
383
384
|
result = adapter.transform(df, self)
|
384
385
|
rescue => e
|
@@ -386,8 +387,10 @@ module EasyML
|
|
386
387
|
end
|
387
388
|
raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
|
388
389
|
df_len_now = result.shape[0]
|
389
|
-
|
390
|
-
|
390
|
+
missing_columns = orig_df.columns - result.columns
|
391
|
+
raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if (df_len_now != df_len_was)
|
392
|
+
raise "Feature #{feature_class} removed #{missing_columns} columns" if missing_columns.any?
|
393
|
+
update!(applied_at: Time.current) unless inference
|
391
394
|
result
|
392
395
|
end
|
393
396
|
|
@@ -432,9 +435,8 @@ module EasyML
|
|
432
435
|
end
|
433
436
|
|
434
437
|
def bump_version
|
435
|
-
|
438
|
+
feature_store.bump_version(version)
|
436
439
|
write_attribute(:version, version + 1)
|
437
|
-
feature_store.cp(old_version, version)
|
438
440
|
self
|
439
441
|
end
|
440
442
|
|
@@ -44,7 +44,16 @@ module EasyML
|
|
44
44
|
scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
|
45
45
|
scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
|
46
46
|
|
47
|
+
def wipe
|
48
|
+
false
|
49
|
+
end
|
50
|
+
|
47
51
|
def download_remote_files
|
52
|
+
return unless snapshot_id # if not finished saving, skip
|
53
|
+
return if feature_store.synced?
|
54
|
+
return if @downloaded
|
55
|
+
|
56
|
+
@downloaded = true
|
48
57
|
feature_store&.download
|
49
58
|
end
|
50
59
|
end
|
data/app/models/easy_ml/model.rb
CHANGED
@@ -182,6 +182,7 @@ module EasyML
|
|
182
182
|
lock_model do
|
183
183
|
run = pending_run
|
184
184
|
run.wrap_training do
|
185
|
+
dataset.refresh if dataset.needs_refresh?
|
185
186
|
raise untrainable_error unless trainable?
|
186
187
|
|
187
188
|
best_params = nil
|
@@ -210,6 +211,10 @@ module EasyML
|
|
210
211
|
end
|
211
212
|
end
|
212
213
|
|
214
|
+
def locked?
|
215
|
+
Support::Lockable.locked?(lock_key)
|
216
|
+
end
|
217
|
+
|
213
218
|
def with_lock
|
214
219
|
EasyML::Support::Lockable.with_lock(lock_key, stale_timeout: 60, resources: 1) do |client|
|
215
220
|
yield client
|
@@ -273,7 +278,7 @@ module EasyML
|
|
273
278
|
end
|
274
279
|
|
275
280
|
def inference_version
|
276
|
-
|
281
|
+
deploys.where(status: :success).order(id: :desc).limit(1).last&.model_version
|
277
282
|
end
|
278
283
|
|
279
284
|
alias_method :current_version, :inference_version
|
@@ -296,21 +301,21 @@ module EasyML
|
|
296
301
|
)
|
297
302
|
end
|
298
303
|
|
299
|
-
def prepare_predict(xs)
|
304
|
+
def prepare_predict(xs, normalized: false)
|
300
305
|
load_model!
|
301
|
-
|
306
|
+
if !normalized
|
302
307
|
xs = dataset.normalize(xs, inference: true)
|
303
308
|
end
|
304
309
|
xs
|
305
310
|
end
|
306
311
|
|
307
|
-
def predict(xs)
|
308
|
-
xs = prepare_predict(xs)
|
312
|
+
def predict(xs, normalized: false)
|
313
|
+
xs = prepare_predict(xs, normalized: normalized)
|
309
314
|
adapter.predict(xs)
|
310
315
|
end
|
311
316
|
|
312
|
-
def predict_proba(xs)
|
313
|
-
xs = prepare_predict(xs)
|
317
|
+
def predict_proba(xs, normalized: false)
|
318
|
+
xs = prepare_predict(xs, normalized: normalized)
|
314
319
|
adapter.predict_proba(xs)
|
315
320
|
end
|
316
321
|
|
@@ -49,7 +49,7 @@ module EasyML
|
|
49
49
|
x_valid, y_valid = valid_dataset
|
50
50
|
x_valid = x_valid.select(model.dataset.col_order(inference: true))
|
51
51
|
@preprocessed ||= model.preprocess(x_valid, y_valid)
|
52
|
-
y_pred = model.predict(@preprocessed)
|
52
|
+
y_pred = model.predict(@preprocessed, normalized: true)
|
53
53
|
dataset = model.dataset.processed.valid(all_columns: true)
|
54
54
|
|
55
55
|
metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
|
@@ -78,7 +78,7 @@ module EasyML
|
|
78
78
|
|
79
79
|
track_feature_importance(booster)
|
80
80
|
if tuner.nil?
|
81
|
-
track_cumulative_feature_importance
|
81
|
+
track_cumulative_feature_importance
|
82
82
|
end
|
83
83
|
|
84
84
|
booster
|
@@ -17,8 +17,8 @@ module EasyML
|
|
17
17
|
class Prediction < ActiveRecord::Base
|
18
18
|
self.table_name = "easy_ml_predictions"
|
19
19
|
|
20
|
-
belongs_to :model
|
21
|
-
belongs_to :model_history, optional: true
|
20
|
+
belongs_to :model, class_name: "EasyML::Model"
|
21
|
+
belongs_to :model_history, class_name: "EasyML::ModelHistory", optional: true
|
22
22
|
|
23
23
|
validates :model_id, presence: true
|
24
24
|
validates :prediction_type, presence: true, inclusion: { in: %w[regression classification] }
|
@@ -153,6 +153,8 @@ module EasyML
|
|
153
153
|
|
154
154
|
def normalize_input(input)
|
155
155
|
case input
|
156
|
+
when Polars::LazyFrame
|
157
|
+
normalize_input(input.collect)
|
156
158
|
when Array
|
157
159
|
if input.first.class == TrueClass || input.first.class == FalseClass
|
158
160
|
input = input.map { |value| value ? 1.0 : 0.0 }
|
data/lib/easy_ml/core/tuner.rb
CHANGED
@@ -147,7 +147,7 @@ module EasyML
|
|
147
147
|
end
|
148
148
|
end
|
149
149
|
|
150
|
-
y_pred = model.predict(x_normalized)
|
150
|
+
y_pred = model.predict(x_normalized, normalized: true)
|
151
151
|
model.metrics = metrics
|
152
152
|
metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
|
153
153
|
metric = metrics.symbolize_keys.dig(model.evaluator[:metric].to_sym)
|
@@ -24,18 +24,31 @@ module EasyML
|
|
24
24
|
|
25
25
|
def compact
|
26
26
|
files = self.files
|
27
|
+
rows = query(lazy: true).collect
|
28
|
+
return unless rows.shape[0] > 0
|
29
|
+
|
30
|
+
FileUtils.rm(files)
|
27
31
|
|
28
32
|
clear_unique_id
|
29
33
|
File.join(root_dir, "compacted.parquet").tap do |target_file|
|
30
|
-
safe_write(
|
31
|
-
query(lazy: true),
|
32
|
-
target_file
|
33
|
-
)
|
34
|
-
FileUtils.rm(files)
|
34
|
+
safe_write(rows, target_file)
|
35
35
|
end
|
36
36
|
clear_unique_id
|
37
37
|
end
|
38
38
|
|
39
|
+
def cp(from,to)
|
40
|
+
return if from.nil? || !Dir.exist?(from)
|
41
|
+
|
42
|
+
FileUtils.mkdir_p(to)
|
43
|
+
files_to_cp = Dir.glob(Pathname.new(from).join("**/*")).select { |f| File.file?(f) }
|
44
|
+
|
45
|
+
files_to_cp.each do |file|
|
46
|
+
target_file = file.gsub(from, to)
|
47
|
+
FileUtils.mkdir_p(File.dirname(target_file))
|
48
|
+
FileUtils.cp(file, target_file)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
39
52
|
def unlock!
|
40
53
|
clear_all_keys
|
41
54
|
end
|
@@ -65,6 +78,8 @@ module EasyML
|
|
65
78
|
end
|
66
79
|
|
67
80
|
def safe_write(df, path)
|
81
|
+
raise "df must be a Polars::DataFrame or Polars::LazyFrame" unless df.is_a?(Polars::DataFrame) || df.is_a?(Polars::LazyFrame)
|
82
|
+
|
68
83
|
FileUtils.mkdir_p(File.dirname(path))
|
69
84
|
if df.is_a?(Polars::LazyFrame)
|
70
85
|
# Depending on the query plan, sometimes sink_parquet will throw an error...
|
@@ -81,6 +96,10 @@ module EasyML
|
|
81
96
|
df.write_parquet(path)
|
82
97
|
end
|
83
98
|
path
|
99
|
+
ensure
|
100
|
+
if Polars.scan_parquet(path).limit(1).schema.keys.empty?
|
101
|
+
raise "Failed to store to #{path}"
|
102
|
+
end
|
84
103
|
end
|
85
104
|
|
86
105
|
def clear_all_keys
|
@@ -17,8 +17,8 @@ module EasyML
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def wipe
|
20
|
-
|
21
|
-
FileUtils.rm_rf(File.join(root_dir,
|
20
|
+
folders.each do |folder|
|
21
|
+
FileUtils.rm_rf(File.join(root_dir, folder))
|
22
22
|
end
|
23
23
|
clear_all_keys
|
24
24
|
end
|
@@ -33,22 +33,37 @@ module EasyML
|
|
33
33
|
end
|
34
34
|
|
35
35
|
def compact
|
36
|
-
|
36
|
+
return if compacted?
|
37
|
+
|
37
38
|
@df = query(lazy: true)
|
38
39
|
|
39
40
|
clear_unique_id(subdir: "compacted")
|
40
41
|
compact_each_partition.tap do
|
41
|
-
FileUtils.rm(files)
|
42
42
|
clear_unique_id
|
43
43
|
end
|
44
|
+
uncompacted_folders.each do |folder|
|
45
|
+
FileUtils.rm_rf(File.join(root_dir, folder))
|
46
|
+
end
|
44
47
|
end
|
45
48
|
|
46
49
|
private
|
47
50
|
|
48
|
-
def
|
49
|
-
|
51
|
+
def compacted?
|
52
|
+
uncompacted_folders.empty?
|
53
|
+
end
|
54
|
+
|
55
|
+
def uncompacted_folders
|
56
|
+
folders - ["compacted"]
|
50
57
|
end
|
51
58
|
|
59
|
+
def folders
|
60
|
+
Dir.glob(File.join(root_dir, "**/*")).select { |f| File.directory?(f) }.map { |f| f.split("/").last }
|
61
|
+
end
|
62
|
+
|
63
|
+
# def partitions
|
64
|
+
# Dir.glob(File.join(root_dir, "**/*")).map { |f| f.split("/").last }
|
65
|
+
# end
|
66
|
+
|
52
67
|
def compact_each_partition
|
53
68
|
with_each_partition do |partition_df, _|
|
54
69
|
safe_write(
|
@@ -121,8 +121,6 @@ module EasyML
|
|
121
121
|
polars_type ? sym_to_polars(type_name) : type_name
|
122
122
|
end
|
123
123
|
|
124
|
-
measure_method_timing :determine_type
|
125
|
-
|
126
124
|
# Determines if a string field is a date, text, or categorical
|
127
125
|
# @param series [Polars::Series] The string series to analyze
|
128
126
|
# @return [Symbol] One of :datetime, :text, or :categorical
|
@@ -149,8 +147,6 @@ module EasyML
|
|
149
147
|
end
|
150
148
|
end
|
151
149
|
|
152
|
-
measure_method_timing :determine_string_type
|
153
|
-
|
154
150
|
# Determines if a string field is categorical or free text
|
155
151
|
# @param series [Polars::Series] The string series to analyze
|
156
152
|
# @return [Symbol] Either :categorical or :text
|
@@ -178,8 +174,6 @@ module EasyML
|
|
178
174
|
avg_percentage < 1.0 ? :text : :categorical
|
179
175
|
end
|
180
176
|
|
181
|
-
measure_method_timing :categorical_or_text?
|
182
|
-
|
183
177
|
# Returns whether the field type is numeric
|
184
178
|
# @param field_type [Symbol] The field type to check
|
185
179
|
# @return [Boolean]
|
@@ -23,20 +23,16 @@ module EasyML
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
-
def
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
return if old_dir.nil? || !Dir.exist?(old_dir)
|
31
|
-
|
32
|
-
FileUtils.mkdir_p(new_dir)
|
33
|
-
files_to_cp = Dir.glob(Pathname.new(old_dir).join("**/*")).select { |f| File.file?(f) }
|
26
|
+
def synced?
|
27
|
+
files.any?
|
28
|
+
end
|
34
29
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
30
|
+
def bump_version(version)
|
31
|
+
compact
|
32
|
+
cp(
|
33
|
+
feature_dir_for_version(version),
|
34
|
+
feature_dir_for_version(version + 1),
|
35
|
+
)
|
40
36
|
end
|
41
37
|
|
42
38
|
private
|
data/lib/easy_ml/predict.rb
CHANGED
@@ -3,6 +3,7 @@ require "singleton"
|
|
3
3
|
module EasyML
|
4
4
|
class Predict
|
5
5
|
include Singleton
|
6
|
+
include EasyML::Timing
|
6
7
|
|
7
8
|
attr_reader :models
|
8
9
|
|
@@ -20,7 +21,7 @@ module EasyML
|
|
20
21
|
def self.predict(model_name, df, serialize: false)
|
21
22
|
df = normalize_input(df)
|
22
23
|
output = make_predictions(model_name, df) do |model, normalized_df|
|
23
|
-
model.predict(normalized_df)
|
24
|
+
model.predict(normalized_df, normalized: true)
|
24
25
|
end
|
25
26
|
|
26
27
|
if serialize
|
@@ -33,7 +34,7 @@ module EasyML
|
|
33
34
|
def self.predict_proba(model_name, df, serialize: false)
|
34
35
|
df = normalize_input(df)
|
35
36
|
output = make_predictions(model_name, df) do |model, normalized_df|
|
36
|
-
probas = model.predict_proba(normalized_df)
|
37
|
+
probas = model.predict_proba(normalized_df, normalized: true)
|
37
38
|
probas.map { |proba_array| proba_array.map { |p| p.round(4) } }
|
38
39
|
end
|
39
40
|
|
@@ -91,8 +92,8 @@ module EasyML
|
|
91
92
|
|
92
93
|
output = predictions.zip(raw_input, normalized_input).map do |pred, raw, norm|
|
93
94
|
EasyML::Prediction.create!(
|
94
|
-
|
95
|
-
|
95
|
+
model_id: current_version.model.id,
|
96
|
+
model_history_id: current_version.id,
|
96
97
|
prediction_type: current_version.model.task,
|
97
98
|
prediction_value: pred,
|
98
99
|
raw_input: raw,
|
data/lib/easy_ml/timing.rb
CHANGED
@@ -19,7 +19,9 @@ module EasyML
|
|
19
19
|
result = send(method_alias, *args, **kwargs, &block)
|
20
20
|
ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
21
21
|
elapsed = ending - starting
|
22
|
-
|
22
|
+
10.times do
|
23
|
+
puts "#{method_name} took #{elapsed.round(2)} seconds"
|
24
|
+
end
|
23
25
|
# StatsD.measure("#{Rails.env}.#{prefix.present? ? "#{prefix}." : ""}#{method_name}.timing", elapsed)
|
24
26
|
result
|
25
27
|
end
|
data/lib/easy_ml/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: easy_ml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0.pre.rc89
|
4
|
+
version: 0.2.0.pre.rc90
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brett Shollenberger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-03-
|
11
|
+
date: 2025-03-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|