easy_ml 0.2.0.pre.rc89 → 0.2.0.pre.rc91
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/predictions_controller.rb +9 -4
- data/app/jobs/easy_ml/training_job.rb +2 -2
- data/app/models/easy_ml/column/imputers/base.rb +1 -1
- data/app/models/easy_ml/column/imputers/categorical.rb +1 -1
- data/app/models/easy_ml/column/imputers/embedding_encoder.rb +2 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +4 -0
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +1 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -0
- data/app/models/easy_ml/column.rb +25 -1
- data/app/models/easy_ml/column_list/imputer.rb +4 -0
- data/app/models/easy_ml/column_list.rb +23 -8
- data/app/models/easy_ml/dataset/learner.rb +0 -10
- data/app/models/easy_ml/dataset.rb +28 -43
- data/app/models/easy_ml/deploy.rb +28 -19
- data/app/models/easy_ml/feature.rb +10 -8
- data/app/models/easy_ml/feature_history.rb +9 -0
- data/app/models/easy_ml/model.rb +12 -7
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +2 -2
- data/app/models/easy_ml/prediction.rb +2 -2
- data/app/serializers/easy_ml/prediction_serializer.rb +2 -0
- data/lib/easy_ml/core/model_evaluator.rb +2 -0
- data/lib/easy_ml/core/tuner.rb +1 -1
- data/lib/easy_ml/data/dataset_manager/writer/base.rb +24 -5
- data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +20 -7
- data/lib/easy_ml/data/dataset_manager/writer.rb +4 -0
- data/lib/easy_ml/data/dataset_manager.rb +4 -0
- data/lib/easy_ml/data/polars_column.rb +0 -6
- data/lib/easy_ml/feature_store.rb +9 -13
- data/lib/easy_ml/predict.rb +5 -4
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +1 -0
- data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_dataset_names.rb.tt +13 -0
- data/lib/easy_ml/timing.rb +3 -1
- data/lib/easy_ml/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3a12058c269a91c130f9158e1507c58dc94ad33517aabe568a2f0bc9f78b88eb
|
4
|
+
data.tar.gz: 6a37a568b6a8d8c100c21dea96487cb85494ef11ba3b30ac51aad8cab45654d7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4f344c00a9e2b557079943f7f6f2c4d7923dbb5b425423b81d58dbc6d63ac15d5e978d6f7e1ab2c02233fe8cc33b55d168ed1ffb0d1f2e1cfcc59670b812285d
|
7
|
+
data.tar.gz: 4831eac6b35b452b300408b37695d5116b6840404cf045e85d98780d3a120ae2d267b07852bc80b6918379b70f9c04a52a712e599751e469af72a0be6e1889c4
|
@@ -4,22 +4,27 @@ module EasyML
|
|
4
4
|
|
5
5
|
def create
|
6
6
|
slug = params[:model]
|
7
|
-
|
7
|
+
model = EasyML::Model.find_by(slug: slug)
|
8
|
+
unless model.present?
|
9
|
+
return render json: { error: "Model not found" }, status: :not_found
|
10
|
+
end
|
11
|
+
|
12
|
+
unless model.inference_version.present?
|
8
13
|
return render json: { error: "Model not found" }, status: :not_found
|
9
14
|
end
|
10
15
|
|
11
16
|
unless params.key?(:input)
|
12
|
-
return render json: { error: "Must provide key: input" }, status: :
|
17
|
+
return render json: { error: "Must provide key: input" }, status: :unprocessable_entity
|
13
18
|
end
|
14
19
|
input = params[:input].permit!.to_h
|
15
20
|
|
16
21
|
unless input.is_a?(Hash)
|
17
|
-
return render json: { error: "Input must be a hash" }, status: :
|
22
|
+
return render json: { error: "Input must be a hash" }, status: :unprocessable_entity
|
18
23
|
end
|
19
24
|
|
20
25
|
valid, fields = EasyML::Predict.validate_input(slug, input)
|
21
26
|
unless valid
|
22
|
-
return render json: { error: "Missing required fields: #{fields}" }, status: :
|
27
|
+
return render json: { error: "Missing required fields: #{fields}" }, status: :unprocessable_entity
|
23
28
|
end
|
24
29
|
|
25
30
|
type = (params[:type] || :predict).to_sym
|
@@ -10,13 +10,13 @@ module EasyML
|
|
10
10
|
|
11
11
|
@last_activity = Time.current
|
12
12
|
setup_signal_traps
|
13
|
-
@monitor_thread = start_monitor_thread
|
13
|
+
# @monitor_thread = start_monitor_thread
|
14
14
|
|
15
15
|
@model.actually_train do |iteration_info|
|
16
16
|
@last_activity = Time.current
|
17
17
|
end
|
18
18
|
ensure
|
19
|
-
@monitor_thread&.exit
|
19
|
+
# @monitor_thread&.exit
|
20
20
|
@model.unlock!
|
21
21
|
end
|
22
22
|
|
@@ -43,6 +43,10 @@ module EasyML
|
|
43
43
|
@adapters ||= ordered_adapters.map { |klass| klass.new(column, preprocessing_step) }.select { |adapter| allowed?(adapter) && adapter.applies? }
|
44
44
|
end
|
45
45
|
|
46
|
+
def encode=(value)
|
47
|
+
adapters.each { |adapter| adapter.encode = value }
|
48
|
+
end
|
49
|
+
|
46
50
|
def description
|
47
51
|
adapters.map(&:description).compact.join(", ")
|
48
52
|
end
|
@@ -184,9 +184,10 @@ module EasyML
|
|
184
184
|
end
|
185
185
|
end
|
186
186
|
|
187
|
-
def transform(df, inference: false,
|
187
|
+
def transform(df, inference: false, encode: true)
|
188
188
|
imputer = inference && imputers.inference.anything? ? imputers.inference : imputers.training
|
189
189
|
|
190
|
+
imputer.encode = encode
|
190
191
|
df = imputer.transform(df)
|
191
192
|
df
|
192
193
|
end
|
@@ -513,6 +514,29 @@ module EasyML
|
|
513
514
|
EasyML::Import::Column.from_config(config, dataset, action: action)
|
514
515
|
end
|
515
516
|
|
517
|
+
def cast_statement(df, df_col, expected_dtype)
|
518
|
+
expected_dtype = expected_dtype.is_a?(Polars::DataType) ? expected_dtype : expected_dtype.class
|
519
|
+
actual_type = df[df_col].dtype
|
520
|
+
|
521
|
+
cast_statement = case expected_dtype
|
522
|
+
when Polars::Boolean
|
523
|
+
case actual_type
|
524
|
+
when Polars::Boolean
|
525
|
+
Polars.col(df_col).cast(expected_dtype)
|
526
|
+
when Polars::String, Polars::Categorical
|
527
|
+
Polars.col(df_col).eq("true").cast(expected_dtype)
|
528
|
+
when Polars::Null
|
529
|
+
Polars.col(df_col)
|
530
|
+
else
|
531
|
+
raise "Unexpected dtype: #{actual_type} for column: #{df_col}"
|
532
|
+
end
|
533
|
+
else
|
534
|
+
Polars.col(df_col).cast(expected_dtype)
|
535
|
+
end
|
536
|
+
|
537
|
+
cast_statement.alias(df_col)
|
538
|
+
end
|
539
|
+
|
516
540
|
def cast(value)
|
517
541
|
return value if value.nil?
|
518
542
|
|
@@ -15,6 +15,10 @@ module EasyML
|
|
15
15
|
@imputers ||= columns.map { |column| inference ? column.imputers(@_imputers).inference : column.imputers(@_imputers).training }
|
16
16
|
end
|
17
17
|
|
18
|
+
def encode=(encode)
|
19
|
+
imputers.each { |imputer| imputer.encode = encode }
|
20
|
+
end
|
21
|
+
|
18
22
|
def exprs
|
19
23
|
imputers.flat_map(&:exprs).compact
|
20
24
|
end
|
@@ -22,7 +22,7 @@ module EasyML
|
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
25
|
-
def transform(df, inference: false, computed: false)
|
25
|
+
def transform(df, inference: false, computed: false, encode: true)
|
26
26
|
return df if df.nil?
|
27
27
|
|
28
28
|
if computed
|
@@ -33,14 +33,12 @@ module EasyML
|
|
33
33
|
|
34
34
|
by_name = cols.index_by(&:name)
|
35
35
|
cols.each do |column|
|
36
|
-
df = column.transform(df, inference: inference,
|
36
|
+
df = column.transform(df, inference: inference, encode: encode) if column
|
37
37
|
end
|
38
38
|
|
39
39
|
df
|
40
40
|
end
|
41
41
|
|
42
|
-
measure_method_timing :transform
|
43
|
-
|
44
42
|
def apply_clip(df)
|
45
43
|
clip_cols = has_clip.raw
|
46
44
|
return df unless clip_cols.any?
|
@@ -60,8 +58,6 @@ module EasyML
|
|
60
58
|
reload
|
61
59
|
end
|
62
60
|
|
63
|
-
measure_method_timing :learn
|
64
|
-
|
65
61
|
def statistics
|
66
62
|
stats = { raw: {}, processed: {} }
|
67
63
|
select(&:persisted?).inject(stats) do |h, col|
|
@@ -94,6 +90,27 @@ module EasyML
|
|
94
90
|
end.sort.map { |arr| arr[1] }.uniq
|
95
91
|
end
|
96
92
|
|
93
|
+
def apply_cast(df)
|
94
|
+
schema = dataset.schema
|
95
|
+
column_index = reduce({}) do |h, col|
|
96
|
+
h.tap do
|
97
|
+
col.aliases.each do |alias_name|
|
98
|
+
h[alias_name] = col
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
cast_statements = (df.columns & schema.keys.map(&:to_s)).map do |df_col|
|
103
|
+
db_col = column_index[df_col]
|
104
|
+
expected_dtype = schema[df_col.to_sym]
|
105
|
+
db_col.cast_statement(df, df_col, expected_dtype)
|
106
|
+
end
|
107
|
+
begin
|
108
|
+
df = df.with_columns(cast_statements)
|
109
|
+
rescue => e
|
110
|
+
binding.pry
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
97
114
|
def cast(processed_or_raw)
|
98
115
|
columns = where(is_computed: false)
|
99
116
|
is_processed = processed_or_raw == :processed
|
@@ -154,8 +171,6 @@ module EasyML
|
|
154
171
|
EasyML::Lineage.import(lineage, on_duplicate_key_update: { columns: %i[ column_id key occurred_at description ] })
|
155
172
|
end
|
156
173
|
|
157
|
-
measure_method_timing :set_feature_lineage
|
158
|
-
|
159
174
|
private
|
160
175
|
|
161
176
|
def import_new(new_columns, existing_columns)
|
@@ -57,8 +57,6 @@ module EasyML
|
|
57
57
|
dataset.columns.set_feature_lineage(columns)
|
58
58
|
end
|
59
59
|
|
60
|
-
measure_method_timing :save_statistics
|
61
|
-
|
62
60
|
def learn_statistics
|
63
61
|
return @statistics if @statistics
|
64
62
|
|
@@ -78,8 +76,6 @@ module EasyML
|
|
78
76
|
end
|
79
77
|
end
|
80
78
|
|
81
|
-
measure_method_timing :learn_statistics
|
82
|
-
|
83
79
|
def prepare
|
84
80
|
@schema = EasyML::Data::PolarsSchema.simplify(@dataset.raw_schema).symbolize_keys
|
85
81
|
@raw_columns = @schema.keys.sort.map(&:to_s)
|
@@ -93,19 +89,13 @@ module EasyML
|
|
93
89
|
EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[in_raw_dataset datatype] })
|
94
90
|
end
|
95
91
|
|
96
|
-
measure_method_timing :prepare
|
97
|
-
|
98
92
|
def lazy_statistics
|
99
93
|
Lazy.new(dataset, columns, type: type).learn
|
100
94
|
end
|
101
95
|
|
102
|
-
measure_method_timing :lazy_statistics
|
103
|
-
|
104
96
|
def eager_statistics
|
105
97
|
Eager.new(dataset, columns, type: type).learn
|
106
98
|
end
|
107
|
-
|
108
|
-
measure_method_timing :eager_statistics
|
109
99
|
end
|
110
100
|
end
|
111
101
|
end
|
@@ -215,9 +215,10 @@ module EasyML
|
|
215
215
|
|
216
216
|
@raw = raw.cp(version)
|
217
217
|
@processed = processed.cp(version)
|
218
|
-
|
219
|
-
|
220
|
-
|
218
|
+
save.tap do
|
219
|
+
features.each(&:bump_version)
|
220
|
+
EasyML::Feature.import(features.to_a, on_duplicate_key_update: [:version])
|
221
|
+
end
|
221
222
|
end
|
222
223
|
|
223
224
|
def refreshed_datasource?
|
@@ -257,9 +258,6 @@ module EasyML
|
|
257
258
|
end
|
258
259
|
end
|
259
260
|
|
260
|
-
include EasyML::Timing
|
261
|
-
measure_method_timing :actually_refresh
|
262
|
-
|
263
261
|
def refresh!(async: false)
|
264
262
|
refreshing do
|
265
263
|
prepare!
|
@@ -276,29 +274,22 @@ module EasyML
|
|
276
274
|
end
|
277
275
|
end
|
278
276
|
|
279
|
-
measure_method_timing :refresh
|
280
|
-
|
281
277
|
def fit_features!(async: false, features: self.features)
|
282
278
|
fit_features(async: async, features: features, force: true)
|
283
279
|
end
|
284
280
|
|
285
281
|
def fit_features(async: false, features: self.features, force: false)
|
286
282
|
features_to_compute = force ? features : features.needs_fit
|
287
|
-
puts "Features to compute.... #{features_to_compute}"
|
288
283
|
return after_fit_features if features_to_compute.empty?
|
289
284
|
|
290
285
|
features.first.fit(features: features_to_compute, async: async)
|
291
286
|
end
|
292
287
|
|
293
|
-
measure_method_timing :fit_features
|
294
|
-
|
295
288
|
def after_fit_features
|
296
|
-
puts "After fit features"
|
297
289
|
unlock!
|
298
290
|
reload
|
299
291
|
return if failed?
|
300
292
|
|
301
|
-
puts "Actually refresh..."
|
302
293
|
actually_refresh
|
303
294
|
end
|
304
295
|
|
@@ -476,15 +467,24 @@ module EasyML
|
|
476
467
|
|
477
468
|
def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
|
478
469
|
df = apply_missing_columns(df, inference: inference)
|
479
|
-
df =
|
480
|
-
df = apply_features(df, features)
|
481
|
-
df =
|
470
|
+
df = transform_columns(df, inference: inference, encode: false)
|
471
|
+
df = apply_features(df, features, inference: inference)
|
472
|
+
df = apply_cast(df) if inference
|
473
|
+
df = transform_columns(df, inference: inference)
|
482
474
|
df = apply_column_mask(df, inference: inference) unless all_columns
|
483
475
|
df = drop_nulls(df) unless inference
|
484
476
|
df, = processed.split_features_targets(df, true, target) if split_ys
|
485
477
|
df
|
486
478
|
end
|
487
479
|
|
480
|
+
def transform_columns(df, inference: false, encode: true)
|
481
|
+
columns.transform(df, inference: inference, encode: encode)
|
482
|
+
end
|
483
|
+
|
484
|
+
def apply_cast(df)
|
485
|
+
columns.apply_cast(df)
|
486
|
+
end
|
487
|
+
|
488
488
|
# Massage out one-hot cats to their canonical name
|
489
489
|
#
|
490
490
|
# Takes: ["Sex_male", "Sex_female", "Embarked_c", "PassengerId"]
|
@@ -503,8 +503,6 @@ module EasyML
|
|
503
503
|
end.uniq.sort
|
504
504
|
end
|
505
505
|
|
506
|
-
measure_method_timing :normalize
|
507
|
-
|
508
506
|
def missing_required_fields(df)
|
509
507
|
desc_df = df.describe
|
510
508
|
|
@@ -633,22 +631,19 @@ module EasyML
|
|
633
631
|
df[column_mask(df, inference: inference)]
|
634
632
|
end
|
635
633
|
|
636
|
-
|
637
|
-
|
638
|
-
def apply_missing_columns(df, inference: false, include_one_hots: false)
|
634
|
+
def apply_missing_columns(df, inference: false)
|
639
635
|
return df unless inference
|
640
636
|
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
missing_columns += columns.one_hots.map(&:name) - df.columns
|
649
|
-
end
|
637
|
+
required_cols = col_order(inference: inference).compact.uniq
|
638
|
+
columns.one_hots.each do |one_hot|
|
639
|
+
virtual_columns = one_hot.virtual_columns
|
640
|
+
if virtual_columns.all? { |vc| df.columns.include?(vc) }
|
641
|
+
required_cols -= virtual_columns
|
642
|
+
else
|
643
|
+
required_cols += [one_hot.name]
|
650
644
|
end
|
651
645
|
end
|
646
|
+
missing_columns = required_cols - df.columns
|
652
647
|
df.with_columns(missing_columns.map { |f| Polars.lit(nil).alias(f) })
|
653
648
|
end
|
654
649
|
|
@@ -771,8 +766,6 @@ module EasyML
|
|
771
766
|
after_refresh_datasource
|
772
767
|
end
|
773
768
|
|
774
|
-
measure_method_timing :refresh_datasource
|
775
|
-
|
776
769
|
def refresh_datasource!
|
777
770
|
datasource.reload.refresh!
|
778
771
|
after_refresh_datasource
|
@@ -798,8 +791,6 @@ module EasyML
|
|
798
791
|
@normalized = true
|
799
792
|
end
|
800
793
|
|
801
|
-
measure_method_timing :normalize_all
|
802
|
-
|
803
794
|
def learn_computed_columns(df)
|
804
795
|
return unless features.ready_to_apply.any?
|
805
796
|
|
@@ -811,8 +802,6 @@ module EasyML
|
|
811
802
|
processed.cleanup
|
812
803
|
end
|
813
804
|
|
814
|
-
measure_method_timing :learn_computed_columns
|
815
|
-
|
816
805
|
def drop_nulls(df)
|
817
806
|
return df if drop_if_null.nil? || drop_if_null.empty?
|
818
807
|
|
@@ -822,8 +811,6 @@ module EasyML
|
|
822
811
|
df.drop_nulls(subset: drop)
|
823
812
|
end
|
824
813
|
|
825
|
-
measure_method_timing :drop_nulls
|
826
|
-
|
827
814
|
# Pass refresh: false for frontend views so we don't query S3 during web requests
|
828
815
|
def load_data(segment, **kwargs, &block)
|
829
816
|
needs_refresh = kwargs.key?(:refresh) ? kwargs[:refresh] : needs_refresh?
|
@@ -876,8 +863,8 @@ module EasyML
|
|
876
863
|
columns.find_by(name: column_name).update(is_date_column: true)
|
877
864
|
end
|
878
865
|
|
879
|
-
def apply_features(df, features = self.features)
|
880
|
-
features = features.ready_to_apply
|
866
|
+
def apply_features(df, features = self.features, inference: false)
|
867
|
+
features = inference ? preloaded_features : features.ready_to_apply
|
881
868
|
if features.nil? || features.empty?
|
882
869
|
df
|
883
870
|
else
|
@@ -897,15 +884,13 @@ module EasyML
|
|
897
884
|
# Set SHA without querying
|
898
885
|
feature.instance_variable_set(:@current_sha, shas[feature.feature_class])
|
899
886
|
|
900
|
-
result = feature.transform_batch(acc_df)
|
887
|
+
result = feature.transform_batch(acc_df, inference: inference)
|
901
888
|
|
902
889
|
result
|
903
890
|
end
|
904
891
|
end
|
905
892
|
end
|
906
893
|
|
907
|
-
measure_method_timing :apply_features
|
908
|
-
|
909
894
|
def standardize_preprocessing_steps(type)
|
910
895
|
columns.map(&:name).zip(columns.map do |col|
|
911
896
|
col.preprocessing_steps&.dig(type)
|
@@ -48,28 +48,37 @@ module EasyML
|
|
48
48
|
|
49
49
|
def actually_deploy
|
50
50
|
lock_deploy do
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
51
|
+
begin
|
52
|
+
update(status: "running")
|
53
|
+
EasyML::Event.create_event(self, "started")
|
54
|
+
|
55
|
+
if identical_deploy.present?
|
56
|
+
self.model_file = identical_deploy.model_file
|
57
|
+
self.model_version = identical_deploy.model_version
|
58
|
+
else
|
59
|
+
if model_file.present?
|
60
|
+
model.model_file = model_file
|
61
|
+
end
|
62
|
+
# model.load_model
|
63
|
+
self.model_version = model.actually_deploy
|
60
64
|
end
|
61
|
-
model.load_model
|
62
|
-
self.model_version = model.actually_deploy
|
63
|
-
end
|
64
65
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
66
|
+
EasyML::Deploy.transaction do
|
67
|
+
update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, status: :success)
|
68
|
+
model.retraining_runs.where(status: :deployed).update_all(status: :success)
|
69
|
+
retraining_run.update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, deploy_id: id, status: :deployed,)
|
70
|
+
end
|
70
71
|
|
71
|
-
|
72
|
-
|
72
|
+
model_version.tap do
|
73
|
+
EasyML::Event.create_event(self, "success")
|
74
|
+
end
|
75
|
+
rescue => e
|
76
|
+
update(status: "failed")
|
77
|
+
retraining_run.update(is_deploying: false)
|
78
|
+
EasyML::Event.create_event(self, "failed")
|
79
|
+
raise e
|
80
|
+
ensure
|
81
|
+
unlock!
|
73
82
|
end
|
74
83
|
end
|
75
84
|
end
|
@@ -82,7 +82,7 @@ module EasyML
|
|
82
82
|
fittable = fittable.select(&:fittable?)
|
83
83
|
where(id: fittable.map(&:id))
|
84
84
|
end
|
85
|
-
scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit).or(datasource_was_refreshed) }
|
85
|
+
scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit).or(datasource_was_refreshed).or(where(needs_fit: true)) }
|
86
86
|
scope :datasource_was_refreshed, -> do
|
87
87
|
where(id: all.select(&:datasource_was_refreshed?).map(&:id))
|
88
88
|
end
|
@@ -310,9 +310,9 @@ module EasyML
|
|
310
310
|
end
|
311
311
|
|
312
312
|
# Transform a single batch, used for testing the user's feature implementation
|
313
|
-
def transform_batch(df = nil, batch_args = {})
|
313
|
+
def transform_batch(df = nil, batch_args = {}, inference: false)
|
314
314
|
if df.is_a?(Polars::DataFrame)
|
315
|
-
actually_transform_batch(df)
|
315
|
+
actually_transform_batch(df, inference: inference)
|
316
316
|
else
|
317
317
|
actually_transform_batch(build_batch(get_batch_args(**batch_args)))
|
318
318
|
end
|
@@ -374,11 +374,12 @@ module EasyML
|
|
374
374
|
batch_df
|
375
375
|
end
|
376
376
|
|
377
|
-
def actually_transform_batch(df)
|
377
|
+
def actually_transform_batch(df, inference: false)
|
378
378
|
return nil unless df.is_a?(Polars::DataFrame)
|
379
379
|
return df if !adapter.respond_to?(:transform) && feature_store.empty?
|
380
380
|
|
381
381
|
df_len_was = df.shape[0]
|
382
|
+
orig_df = df.clone
|
382
383
|
begin
|
383
384
|
result = adapter.transform(df, self)
|
384
385
|
rescue => e
|
@@ -386,8 +387,10 @@ module EasyML
|
|
386
387
|
end
|
387
388
|
raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
|
388
389
|
df_len_now = result.shape[0]
|
389
|
-
|
390
|
-
|
390
|
+
missing_columns = orig_df.columns - result.columns
|
391
|
+
raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if (df_len_now != df_len_was)
|
392
|
+
raise "Feature #{feature_class} removed #{missing_columns} columns" if missing_columns.any?
|
393
|
+
update!(applied_at: Time.current) unless inference
|
391
394
|
result
|
392
395
|
end
|
393
396
|
|
@@ -432,9 +435,8 @@ module EasyML
|
|
432
435
|
end
|
433
436
|
|
434
437
|
def bump_version
|
435
|
-
|
438
|
+
feature_store.bump_version(version)
|
436
439
|
write_attribute(:version, version + 1)
|
437
|
-
feature_store.cp(old_version, version)
|
438
440
|
self
|
439
441
|
end
|
440
442
|
|
@@ -44,7 +44,16 @@ module EasyML
|
|
44
44
|
scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
|
45
45
|
scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
|
46
46
|
|
47
|
+
def wipe
|
48
|
+
false
|
49
|
+
end
|
50
|
+
|
47
51
|
def download_remote_files
|
52
|
+
return unless snapshot_id # if not finished saving, skip
|
53
|
+
return if feature_store.synced?
|
54
|
+
return if @downloaded
|
55
|
+
|
56
|
+
@downloaded = true
|
48
57
|
feature_store&.download
|
49
58
|
end
|
50
59
|
end
|
data/app/models/easy_ml/model.rb
CHANGED
@@ -182,6 +182,7 @@ module EasyML
|
|
182
182
|
lock_model do
|
183
183
|
run = pending_run
|
184
184
|
run.wrap_training do
|
185
|
+
dataset.refresh if dataset.needs_refresh?
|
185
186
|
raise untrainable_error unless trainable?
|
186
187
|
|
187
188
|
best_params = nil
|
@@ -210,6 +211,10 @@ module EasyML
|
|
210
211
|
end
|
211
212
|
end
|
212
213
|
|
214
|
+
def locked?
|
215
|
+
Support::Lockable.locked?(lock_key)
|
216
|
+
end
|
217
|
+
|
213
218
|
def with_lock
|
214
219
|
EasyML::Support::Lockable.with_lock(lock_key, stale_timeout: 60, resources: 1) do |client|
|
215
220
|
yield client
|
@@ -273,7 +278,7 @@ module EasyML
|
|
273
278
|
end
|
274
279
|
|
275
280
|
def inference_version
|
276
|
-
|
281
|
+
deploys.where(status: :success).order(id: :desc).limit(1).last&.model_version
|
277
282
|
end
|
278
283
|
|
279
284
|
alias_method :current_version, :inference_version
|
@@ -296,21 +301,21 @@ module EasyML
|
|
296
301
|
)
|
297
302
|
end
|
298
303
|
|
299
|
-
def prepare_predict(xs)
|
304
|
+
def prepare_predict(xs, normalized: false)
|
300
305
|
load_model!
|
301
|
-
|
306
|
+
if !normalized
|
302
307
|
xs = dataset.normalize(xs, inference: true)
|
303
308
|
end
|
304
309
|
xs
|
305
310
|
end
|
306
311
|
|
307
|
-
def predict(xs)
|
308
|
-
xs = prepare_predict(xs)
|
312
|
+
def predict(xs, normalized: false)
|
313
|
+
xs = prepare_predict(xs, normalized: normalized)
|
309
314
|
adapter.predict(xs)
|
310
315
|
end
|
311
316
|
|
312
|
-
def predict_proba(xs)
|
313
|
-
xs = prepare_predict(xs)
|
317
|
+
def predict_proba(xs, normalized: false)
|
318
|
+
xs = prepare_predict(xs, normalized: normalized)
|
314
319
|
adapter.predict_proba(xs)
|
315
320
|
end
|
316
321
|
|
@@ -49,7 +49,7 @@ module EasyML
|
|
49
49
|
x_valid, y_valid = valid_dataset
|
50
50
|
x_valid = x_valid.select(model.dataset.col_order(inference: true))
|
51
51
|
@preprocessed ||= model.preprocess(x_valid, y_valid)
|
52
|
-
y_pred = model.predict(@preprocessed)
|
52
|
+
y_pred = model.predict(@preprocessed, normalized: true)
|
53
53
|
dataset = model.dataset.processed.valid(all_columns: true)
|
54
54
|
|
55
55
|
metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
|
@@ -78,7 +78,7 @@ module EasyML
|
|
78
78
|
|
79
79
|
track_feature_importance(booster)
|
80
80
|
if tuner.nil?
|
81
|
-
track_cumulative_feature_importance
|
81
|
+
track_cumulative_feature_importance
|
82
82
|
end
|
83
83
|
|
84
84
|
booster
|
@@ -17,8 +17,8 @@ module EasyML
|
|
17
17
|
class Prediction < ActiveRecord::Base
|
18
18
|
self.table_name = "easy_ml_predictions"
|
19
19
|
|
20
|
-
belongs_to :model
|
21
|
-
belongs_to :model_history, optional: true
|
20
|
+
belongs_to :model, class_name: "EasyML::Model"
|
21
|
+
belongs_to :model_history, class_name: "EasyML::ModelHistory", optional: true
|
22
22
|
|
23
23
|
validates :model_id, presence: true
|
24
24
|
validates :prediction_type, presence: true, inclusion: { in: %w[regression classification] }
|
@@ -153,6 +153,8 @@ module EasyML
|
|
153
153
|
|
154
154
|
def normalize_input(input)
|
155
155
|
case input
|
156
|
+
when Polars::LazyFrame
|
157
|
+
normalize_input(input.collect)
|
156
158
|
when Array
|
157
159
|
if input.first.class == TrueClass || input.first.class == FalseClass
|
158
160
|
input = input.map { |value| value ? 1.0 : 0.0 }
|
data/lib/easy_ml/core/tuner.rb
CHANGED
@@ -147,7 +147,7 @@ module EasyML
|
|
147
147
|
end
|
148
148
|
end
|
149
149
|
|
150
|
-
y_pred = model.predict(x_normalized)
|
150
|
+
y_pred = model.predict(x_normalized, normalized: true)
|
151
151
|
model.metrics = metrics
|
152
152
|
metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
|
153
153
|
metric = metrics.symbolize_keys.dig(model.evaluator[:metric].to_sym)
|
@@ -24,18 +24,31 @@ module EasyML
|
|
24
24
|
|
25
25
|
def compact
|
26
26
|
files = self.files
|
27
|
+
rows = query(lazy: true).collect
|
28
|
+
return unless rows.shape[0] > 0
|
29
|
+
|
30
|
+
FileUtils.rm(files)
|
27
31
|
|
28
32
|
clear_unique_id
|
29
33
|
File.join(root_dir, "compacted.parquet").tap do |target_file|
|
30
|
-
safe_write(
|
31
|
-
query(lazy: true),
|
32
|
-
target_file
|
33
|
-
)
|
34
|
-
FileUtils.rm(files)
|
34
|
+
safe_write(rows, target_file)
|
35
35
|
end
|
36
36
|
clear_unique_id
|
37
37
|
end
|
38
38
|
|
39
|
+
def cp(from,to)
|
40
|
+
return if from.nil? || !Dir.exist?(from)
|
41
|
+
|
42
|
+
FileUtils.mkdir_p(to)
|
43
|
+
files_to_cp = Dir.glob(Pathname.new(from).join("**/*")).select { |f| File.file?(f) }
|
44
|
+
|
45
|
+
files_to_cp.each do |file|
|
46
|
+
target_file = file.gsub(from, to)
|
47
|
+
FileUtils.mkdir_p(File.dirname(target_file))
|
48
|
+
FileUtils.cp(file, target_file)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
39
52
|
def unlock!
|
40
53
|
clear_all_keys
|
41
54
|
end
|
@@ -65,6 +78,8 @@ module EasyML
|
|
65
78
|
end
|
66
79
|
|
67
80
|
def safe_write(df, path)
|
81
|
+
raise "df must be a Polars::DataFrame or Polars::LazyFrame" unless df.is_a?(Polars::DataFrame) || df.is_a?(Polars::LazyFrame)
|
82
|
+
|
68
83
|
FileUtils.mkdir_p(File.dirname(path))
|
69
84
|
if df.is_a?(Polars::LazyFrame)
|
70
85
|
# Depending on the query plan, sometimes sink_parquet will throw an error...
|
@@ -81,6 +96,10 @@ module EasyML
|
|
81
96
|
df.write_parquet(path)
|
82
97
|
end
|
83
98
|
path
|
99
|
+
ensure
|
100
|
+
if Polars.scan_parquet(path).limit(1).schema.keys.empty?
|
101
|
+
raise "Failed to store to #{path}"
|
102
|
+
end
|
84
103
|
end
|
85
104
|
|
86
105
|
def clear_all_keys
|
@@ -17,9 +17,7 @@ module EasyML
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def wipe
|
20
|
-
|
21
|
-
FileUtils.rm_rf(File.join(root_dir, partition))
|
22
|
-
end
|
20
|
+
super
|
23
21
|
clear_all_keys
|
24
22
|
end
|
25
23
|
|
@@ -33,22 +31,37 @@ module EasyML
|
|
33
31
|
end
|
34
32
|
|
35
33
|
def compact
|
36
|
-
|
34
|
+
return if compacted?
|
35
|
+
|
37
36
|
@df = query(lazy: true)
|
38
37
|
|
39
38
|
clear_unique_id(subdir: "compacted")
|
40
39
|
compact_each_partition.tap do
|
41
|
-
FileUtils.rm(files)
|
42
40
|
clear_unique_id
|
43
41
|
end
|
42
|
+
uncompacted_folders.each do |folder|
|
43
|
+
FileUtils.rm_rf(File.join(root_dir, folder))
|
44
|
+
end
|
44
45
|
end
|
45
46
|
|
46
47
|
private
|
47
48
|
|
48
|
-
def
|
49
|
-
|
49
|
+
def compacted?
|
50
|
+
uncompacted_folders.empty?
|
50
51
|
end
|
51
52
|
|
53
|
+
def uncompacted_folders
|
54
|
+
folders - ["compacted"]
|
55
|
+
end
|
56
|
+
|
57
|
+
def folders
|
58
|
+
Dir.glob(File.join(root_dir, "**/*")).select { |f| File.directory?(f) }.map { |f| f.split("/").last }
|
59
|
+
end
|
60
|
+
|
61
|
+
# def partitions
|
62
|
+
# Dir.glob(File.join(root_dir, "**/*")).map { |f| f.split("/").last }
|
63
|
+
# end
|
64
|
+
|
52
65
|
def compact_each_partition
|
53
66
|
with_each_partition do |partition_df, _|
|
54
67
|
safe_write(
|
@@ -121,8 +121,6 @@ module EasyML
|
|
121
121
|
polars_type ? sym_to_polars(type_name) : type_name
|
122
122
|
end
|
123
123
|
|
124
|
-
measure_method_timing :determine_type
|
125
|
-
|
126
124
|
# Determines if a string field is a date, text, or categorical
|
127
125
|
# @param series [Polars::Series] The string series to analyze
|
128
126
|
# @return [Symbol] One of :datetime, :text, or :categorical
|
@@ -149,8 +147,6 @@ module EasyML
|
|
149
147
|
end
|
150
148
|
end
|
151
149
|
|
152
|
-
measure_method_timing :determine_string_type
|
153
|
-
|
154
150
|
# Determines if a string field is categorical or free text
|
155
151
|
# @param series [Polars::Series] The string series to analyze
|
156
152
|
# @return [Symbol] Either :categorical or :text
|
@@ -178,8 +174,6 @@ module EasyML
|
|
178
174
|
avg_percentage < 1.0 ? :text : :categorical
|
179
175
|
end
|
180
176
|
|
181
|
-
measure_method_timing :categorical_or_text?
|
182
|
-
|
183
177
|
# Returns whether the field type is numeric
|
184
178
|
# @param field_type [Symbol] The field type to check
|
185
179
|
# @return [Boolean]
|
@@ -23,20 +23,16 @@ module EasyML
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
-
def
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
return if old_dir.nil? || !Dir.exist?(old_dir)
|
31
|
-
|
32
|
-
FileUtils.mkdir_p(new_dir)
|
33
|
-
files_to_cp = Dir.glob(Pathname.new(old_dir).join("**/*")).select { |f| File.file?(f) }
|
26
|
+
def synced?
|
27
|
+
files.any?
|
28
|
+
end
|
34
29
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
30
|
+
def bump_version(version)
|
31
|
+
compact
|
32
|
+
cp(
|
33
|
+
feature_dir_for_version(version),
|
34
|
+
feature_dir_for_version(version + 1),
|
35
|
+
)
|
40
36
|
end
|
41
37
|
|
42
38
|
private
|
data/lib/easy_ml/predict.rb
CHANGED
@@ -3,6 +3,7 @@ require "singleton"
|
|
3
3
|
module EasyML
|
4
4
|
class Predict
|
5
5
|
include Singleton
|
6
|
+
include EasyML::Timing
|
6
7
|
|
7
8
|
attr_reader :models
|
8
9
|
|
@@ -20,7 +21,7 @@ module EasyML
|
|
20
21
|
def self.predict(model_name, df, serialize: false)
|
21
22
|
df = normalize_input(df)
|
22
23
|
output = make_predictions(model_name, df) do |model, normalized_df|
|
23
|
-
model.predict(normalized_df)
|
24
|
+
model.predict(normalized_df, normalized: true)
|
24
25
|
end
|
25
26
|
|
26
27
|
if serialize
|
@@ -33,7 +34,7 @@ module EasyML
|
|
33
34
|
def self.predict_proba(model_name, df, serialize: false)
|
34
35
|
df = normalize_input(df)
|
35
36
|
output = make_predictions(model_name, df) do |model, normalized_df|
|
36
|
-
probas = model.predict_proba(normalized_df)
|
37
|
+
probas = model.predict_proba(normalized_df, normalized: true)
|
37
38
|
probas.map { |proba_array| proba_array.map { |p| p.round(4) } }
|
38
39
|
end
|
39
40
|
|
@@ -91,8 +92,8 @@ module EasyML
|
|
91
92
|
|
92
93
|
output = predictions.zip(raw_input, normalized_input).map do |pred, raw, norm|
|
93
94
|
EasyML::Prediction.create!(
|
94
|
-
|
95
|
-
|
95
|
+
model_id: current_version.model.id,
|
96
|
+
model_history_id: current_version.id,
|
96
97
|
prediction_type: current_version.model.task,
|
97
98
|
prediction_value: pred,
|
98
99
|
raw_input: raw,
|
@@ -0,0 +1,13 @@
|
|
1
|
+
class AddUniqueConstraintToDatasetNames < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
|
2
|
+
def change
|
3
|
+
if index_exists?(:easy_ml_datasets, :name)
|
4
|
+
remove_index :easy_ml_datasets, :name
|
5
|
+
end
|
6
|
+
add_index :easy_ml_datasets, :name, unique: true
|
7
|
+
|
8
|
+
if index_exists?(:easy_ml_dataset_histories, :name)
|
9
|
+
remove_index :easy_ml_dataset_histories, :name
|
10
|
+
end
|
11
|
+
add_index :easy_ml_dataset_histories, :name, unique: true
|
12
|
+
end
|
13
|
+
end
|
data/lib/easy_ml/timing.rb
CHANGED
@@ -19,7 +19,9 @@ module EasyML
|
|
19
19
|
result = send(method_alias, *args, **kwargs, &block)
|
20
20
|
ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
21
21
|
elapsed = ending - starting
|
22
|
-
|
22
|
+
10.times do
|
23
|
+
puts "#{method_name} took #{elapsed.round(2)} seconds"
|
24
|
+
end
|
23
25
|
# StatsD.measure("#{Rails.env}.#{prefix.present? ? "#{prefix}." : ""}#{method_name}.timing", elapsed)
|
24
26
|
result
|
25
27
|
end
|
data/lib/easy_ml/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: easy_ml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0.pre.
|
4
|
+
version: 0.2.0.pre.rc91
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brett Shollenberger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-03-
|
11
|
+
date: 2025-03-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|
@@ -803,6 +803,7 @@ files:
|
|
803
803
|
- lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt
|
804
804
|
- lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt
|
805
805
|
- lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt
|
806
|
+
- lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_dataset_names.rb.tt
|
806
807
|
- lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt
|
807
808
|
- lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_dataset_histories.rb.tt
|
808
809
|
- lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_features.rb.tt
|