easy_ml 0.2.0.pre.rc63 → 0.2.0.pre.rc68
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/jobs/easy_ml/batch_job.rb +11 -2
- data/app/jobs/easy_ml/compute_feature_job.rb +13 -29
- data/app/jobs/easy_ml/refresh_dataset_job.rb +2 -2
- data/app/models/easy_ml/column/imputers/imputer.rb +0 -26
- data/app/models/easy_ml/column/selector.rb +1 -1
- data/app/models/easy_ml/column.rb +12 -1
- data/app/models/easy_ml/dataset.rb +1 -4
- data/app/models/easy_ml/feature.rb +67 -8
- data/lib/easy_ml/feature_store.rb +3 -3
- data/lib/easy_ml/version.rb +1 -1
- metadata +2 -3
- data/app/jobs/easy_ml/finalize_feature_job.rb +0 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2404bc2b6613d95627dc89265e76351f648d9b45e65f5935a71a0e2334d08d2c
|
4
|
+
data.tar.gz: 072f1d58d6e5d7864e3cdb365701c6fb6e635bc20e3e9ef415fe52772671ce76
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d4e8ab597db1470630e555448fb057af07fab38ff5125fd7ce548db5ba71ca7abac26d1761fb892459890b5c8bce169cd9e165d80b4b91ec6a806dda1a36cbe5
|
7
|
+
data.tar.gz: 71cab456d196c15e03b16e2677cb3b64e08527b2b4acc20ac8ee6c0f0ceea671b74df0d61c174a914cfa85f7e476ad77f264ae78e6b604ed8f50ca2756d84956
|
@@ -40,14 +40,23 @@ module EasyML
|
|
40
40
|
Resque.redis.rpush("batch:#{parent_id}:remaining", batch.to_json)
|
41
41
|
end
|
42
42
|
|
43
|
-
|
43
|
+
handle_batch(parent_id, batch)
|
44
|
+
end
|
45
|
+
|
46
|
+
def handle_batch(parent_id, batch)
|
47
|
+
if batch.size > 1
|
48
|
+
enqueue_batch(batch)
|
49
|
+
else
|
50
|
+
run_one_batch(parent_id, batch.first)
|
51
|
+
after_batch_hook(parent_id, batch)
|
52
|
+
end
|
44
53
|
end
|
45
54
|
|
46
55
|
def enqueue_next_batch(caller, parent_id)
|
47
56
|
next_batch = Resque.redis.lpop("batch:#{parent_id}:remaining")
|
48
57
|
payload = Resque.decode(next_batch)
|
49
58
|
|
50
|
-
caller.
|
59
|
+
caller.handle_batch(parent_id, payload)
|
51
60
|
end
|
52
61
|
|
53
62
|
def next_batch?(parent_id)
|
@@ -4,37 +4,21 @@ module EasyML
|
|
4
4
|
|
5
5
|
@queue = :easy_ml
|
6
6
|
|
7
|
-
def self.perform(batch_id,
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
feature.update(workflow_status: :analyzing) if feature.workflow_status == :ready
|
20
|
-
feature.fit_batch(options.merge!(batch_id: batch_id))
|
21
|
-
rescue => e
|
22
|
-
EasyML::Feature.transaction do
|
23
|
-
return if dataset.reload.workflow_status == :failed
|
24
|
-
|
25
|
-
feature.update(workflow_status: :failed)
|
26
|
-
dataset.update(workflow_status: :failed)
|
27
|
-
build_error_with_context(dataset, e, batch_id, feature)
|
28
|
-
end
|
29
|
-
end
|
7
|
+
def self.perform(batch_id, batch_args = {})
|
8
|
+
# This is very, very, very, very, very important
|
9
|
+
# if you don't dup the batch_args, resque-batched-job will
|
10
|
+
# fail in some non-obvious ways, because it will try to
|
11
|
+
# decode to match the original batch args EXACTLY.
|
12
|
+
#
|
13
|
+
# This will waste your time so please just don't remove this .dup!!!
|
14
|
+
#
|
15
|
+
# https://github.com/drfeelngood/resque-batched-job/blob/master/lib/resque/plugins/batched_job.rb#L86
|
16
|
+
batch_args = batch_args.dup
|
17
|
+
run_one_batch(batch_id, batch_args)
|
30
18
|
end
|
31
19
|
|
32
|
-
def self.
|
33
|
-
|
34
|
-
batch = feature.build_batch(batch_id: batch_id)
|
35
|
-
|
36
|
-
# Convert any dataframes in the context to serialized form
|
37
|
-
error.create_context(context: batch)
|
20
|
+
def self.run_one_batch(batch_id, batch_args)
|
21
|
+
EasyML::Feature.fit_one_batch(batch_id, batch_args)
|
38
22
|
end
|
39
23
|
|
40
24
|
def self.after_batch_hook(batch_id, *args)
|
@@ -16,14 +16,14 @@ module EasyML
|
|
16
16
|
if dataset.features.needs_fit.any?
|
17
17
|
dataset.fit_features(async: true)
|
18
18
|
else
|
19
|
-
dataset.
|
19
|
+
dataset.after_fit_features
|
20
20
|
end
|
21
21
|
rescue StandardError => e
|
22
22
|
if Rails.env.test?
|
23
23
|
raise e
|
24
24
|
end
|
25
25
|
dataset.update(workflow_status: :failed)
|
26
|
-
handle_error(dataset, e)
|
26
|
+
EasyML::Event.handle_error(dataset, e)
|
27
27
|
end
|
28
28
|
end
|
29
29
|
end
|
@@ -8,7 +8,6 @@ module EasyML
|
|
8
8
|
@column = column
|
9
9
|
@dataset = column.dataset
|
10
10
|
@preprocessing_step = preprocessing_step.with_indifferent_access
|
11
|
-
validate_preprocessing_step!
|
12
11
|
end
|
13
12
|
|
14
13
|
def inspect
|
@@ -72,31 +71,6 @@ module EasyML
|
|
72
71
|
|
73
72
|
EasyML::Column::Imputers::OrdinalEncoder.new(column, preprocessing_step).decode_labels(df)
|
74
73
|
end
|
75
|
-
|
76
|
-
private
|
77
|
-
|
78
|
-
def validate_preprocessing_step!
|
79
|
-
validate_params!
|
80
|
-
validate_method!
|
81
|
-
end
|
82
|
-
|
83
|
-
def validate_params!
|
84
|
-
return unless preprocessing_step[:params]
|
85
|
-
|
86
|
-
preprocessing_step[:params].keys.each do |param|
|
87
|
-
unless Imputers.supported_params.include?(param.to_sym)
|
88
|
-
raise ArgumentError, "Unsupported preprocessing parameter '#{param}'. Supported parameters are: #{Imputers.supported_params.join(", ")}"
|
89
|
-
end
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
def validate_method!
|
94
|
-
return unless preprocessing_step[:method]
|
95
|
-
|
96
|
-
unless Imputers.supported_methods.include?(preprocessing_step[:method].to_sym)
|
97
|
-
raise ArgumentError, "Unsupported preprocessing method '#{preprocessing_step[:method]}'. Supported methods are: #{Imputers.supported_methods.join(", ")}"
|
98
|
-
end
|
99
|
-
end
|
100
74
|
end
|
101
75
|
end
|
102
76
|
end
|
@@ -43,6 +43,7 @@ module EasyML
|
|
43
43
|
before_save :set_defaults
|
44
44
|
before_save :set_feature_lineage
|
45
45
|
before_save :set_polars_datatype
|
46
|
+
after_find :ensure_feature_exists
|
46
47
|
|
47
48
|
# Scopes
|
48
49
|
scope :visible, -> { where(hidden: false) }
|
@@ -108,6 +109,13 @@ module EasyML
|
|
108
109
|
}
|
109
110
|
scope :is_learning, -> { where(is_learning: true) }
|
110
111
|
|
112
|
+
def ensure_feature_exists
|
113
|
+
if feature && !feature.has_code?
|
114
|
+
feature.destroy
|
115
|
+
update(feature_id: nil)
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
111
119
|
def display_attributes
|
112
120
|
attributes.except(:statistics)
|
113
121
|
end
|
@@ -206,9 +214,10 @@ module EasyML
|
|
206
214
|
|
207
215
|
def raw_dtype
|
208
216
|
return @raw_dtype if @raw_dtype
|
217
|
+
set_feature_lineage
|
209
218
|
|
210
219
|
if in_raw_dataset?
|
211
|
-
@raw_dtype = raw&.data&.to_series
|
220
|
+
@raw_dtype = raw&.data&.to_series.try(:dtype)
|
212
221
|
elsif already_computed?
|
213
222
|
@raw_dtype = processed&.data&.to_series&.dtype
|
214
223
|
end
|
@@ -284,6 +293,8 @@ module EasyML
|
|
284
293
|
alias_method :feature, :computing_feature
|
285
294
|
|
286
295
|
def set_feature_lineage
|
296
|
+
return if dataset.nil?
|
297
|
+
|
287
298
|
if dataset.features.computed_column_names.include?(name)
|
288
299
|
if computed_by.nil?
|
289
300
|
assign_attributes(
|
@@ -202,7 +202,6 @@ module EasyML
|
|
202
202
|
prepare!
|
203
203
|
fit_features!(async: async)
|
204
204
|
end
|
205
|
-
after_fit_features unless async
|
206
205
|
end
|
207
206
|
|
208
207
|
def refresh(async: false)
|
@@ -212,7 +211,6 @@ module EasyML
|
|
212
211
|
prepare
|
213
212
|
fit_features(async: async)
|
214
213
|
end
|
215
|
-
after_fit_features unless async
|
216
214
|
end
|
217
215
|
|
218
216
|
def fit_features!(async: false, features: self.features)
|
@@ -221,7 +219,7 @@ module EasyML
|
|
221
219
|
|
222
220
|
def fit_features(async: false, features: self.features, force: false)
|
223
221
|
features_to_compute = force ? features : features.needs_fit
|
224
|
-
return if features_to_compute.empty?
|
222
|
+
return after_fit_features if features_to_compute.empty?
|
225
223
|
|
226
224
|
features.first.fit(features: features_to_compute, async: async)
|
227
225
|
end
|
@@ -231,7 +229,6 @@ module EasyML
|
|
231
229
|
reload
|
232
230
|
return if failed?
|
233
231
|
|
234
|
-
features.update_all(needs_fit: false, fit_at: Time.current)
|
235
232
|
actually_refresh
|
236
233
|
end
|
237
234
|
|
@@ -82,7 +82,7 @@ module EasyML
|
|
82
82
|
where(id: fittable.map(&:id))
|
83
83
|
end
|
84
84
|
scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
|
85
|
-
scope :ready_to_apply, -> { where.not(id:
|
85
|
+
scope :ready_to_apply, -> { where(needs_fit: false).where.not(id: has_changes.map(&:id)) }
|
86
86
|
|
87
87
|
before_save :apply_defaults, if: :new_record?
|
88
88
|
before_save :update_sha
|
@@ -95,6 +95,10 @@ module EasyML
|
|
95
95
|
raise InvalidFeatureError, "Invalid feature class: #{feature_class}"
|
96
96
|
end
|
97
97
|
|
98
|
+
def has_code?
|
99
|
+
feature_klass.present?
|
100
|
+
end
|
101
|
+
|
98
102
|
def adapter
|
99
103
|
@adapter ||= feature_klass.new
|
100
104
|
end
|
@@ -232,13 +236,54 @@ module EasyML
|
|
232
236
|
if async && job_count > 1
|
233
237
|
EasyML::ComputeFeatureJob.enqueue_ordered_batches(jobs)
|
234
238
|
else
|
235
|
-
jobs.
|
236
|
-
|
239
|
+
jobs.each do |feature_batch|
|
240
|
+
feature_batch.each do |batch_args|
|
241
|
+
EasyML::ComputeFeatureJob.perform(nil, batch_args)
|
242
|
+
end
|
243
|
+
feature = EasyML::Feature.find(feature_batch.first.dig(:feature_id))
|
244
|
+
feature.after_fit
|
237
245
|
end
|
238
|
-
|
246
|
+
dataset.after_fit_features
|
239
247
|
end
|
240
248
|
end
|
241
249
|
|
250
|
+
def self.fit_one_batch(batch_id, batch_args = {})
|
251
|
+
batch_args.symbolize_keys!
|
252
|
+
feature_id = batch_args.dig(:feature_id)
|
253
|
+
feature = EasyML::Feature.find(feature_id)
|
254
|
+
dataset = feature.dataset
|
255
|
+
|
256
|
+
# Check if any feature has failed before proceeding
|
257
|
+
return if dataset.features.any? { |f| f.workflow_status == "failed" }
|
258
|
+
|
259
|
+
feature.update(workflow_status: :analyzing) if feature.workflow_status == :ready
|
260
|
+
begin
|
261
|
+
feature.fit_batch(batch_args.merge!(batch_id: batch_id))
|
262
|
+
rescue => e
|
263
|
+
EasyML::Feature.transaction do
|
264
|
+
return if dataset.reload.workflow_status == :failed
|
265
|
+
|
266
|
+
feature.update(workflow_status: :failed)
|
267
|
+
dataset.update(workflow_status: :failed)
|
268
|
+
build_error_with_context(dataset, e, batch_id, feature)
|
269
|
+
end
|
270
|
+
raise e
|
271
|
+
end
|
272
|
+
end
|
273
|
+
|
274
|
+
def self.build_error_with_context(dataset, error, batch_id, feature)
|
275
|
+
error = EasyML::Event.handle_error(dataset, error)
|
276
|
+
batch = feature.build_batch(batch_id: batch_id)
|
277
|
+
|
278
|
+
# Convert any dataframes in the context to serialized form
|
279
|
+
error.create_context(context: batch)
|
280
|
+
end
|
281
|
+
|
282
|
+
def self.fit_feature_failed(dataset, e)
|
283
|
+
dataset.update(workflow_status: :failed)
|
284
|
+
EasyML::Event.handle_error(dataset, e)
|
285
|
+
end
|
286
|
+
|
242
287
|
# Fit a single batch, used for testing the user's feature implementation
|
243
288
|
def fit_batch(batch_args = {})
|
244
289
|
batch_args.symbolize_keys!
|
@@ -301,7 +346,11 @@ module EasyML
|
|
301
346
|
end
|
302
347
|
return if df.blank?
|
303
348
|
|
304
|
-
|
349
|
+
begin
|
350
|
+
batch_df = adapter.fit(df, self, batch_args)
|
351
|
+
rescue => e
|
352
|
+
raise "Feature #{feature_class}#fit failed: #{e.message}"
|
353
|
+
end
|
305
354
|
if batch_df.present?
|
306
355
|
store(batch_df)
|
307
356
|
else
|
@@ -315,7 +364,11 @@ module EasyML
|
|
315
364
|
return df if !adapter.respond_to?(:transform) && feature_store.empty?
|
316
365
|
|
317
366
|
df_len_was = df.shape[0]
|
318
|
-
|
367
|
+
begin
|
368
|
+
result = adapter.transform(df, self)
|
369
|
+
rescue => e
|
370
|
+
raise "Feature #{feature_class}#transform failed: #{e.message}"
|
371
|
+
end
|
319
372
|
raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
|
320
373
|
df_len_now = result.shape[0]
|
321
374
|
raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if df_len_now != df_len_was
|
@@ -411,8 +464,10 @@ module EasyML
|
|
411
464
|
end
|
412
465
|
|
413
466
|
def after_fit
|
467
|
+
update_sha
|
468
|
+
|
414
469
|
updates = {
|
415
|
-
|
470
|
+
fit_at: Time.current,
|
416
471
|
needs_fit: false,
|
417
472
|
workflow_status: :ready,
|
418
473
|
}.compact
|
@@ -472,7 +527,11 @@ module EasyML
|
|
472
527
|
end
|
473
528
|
|
474
529
|
def feature_klass
|
475
|
-
|
530
|
+
begin
|
531
|
+
@feature_klass ||= EasyML::Features::Registry.find(feature_class.to_s).dig(:feature_class).constantize
|
532
|
+
rescue => e
|
533
|
+
nil
|
534
|
+
end
|
476
535
|
end
|
477
536
|
|
478
537
|
def config
|
@@ -15,10 +15,10 @@ module EasyML
|
|
15
15
|
max_key = df[primary_key].max
|
16
16
|
batch_size = feature.batch_size || 10_000
|
17
17
|
|
18
|
-
# Try to parse as integers if they're strings
|
19
18
|
begin
|
20
|
-
|
21
|
-
|
19
|
+
# We are intentionally not using to_i, so it will raise an error for keys like "A1"
|
20
|
+
min_key = Integer(min_key) if min_key.is_a?(String)
|
21
|
+
max_key = Integer(max_key) if max_key.is_a?(String)
|
22
22
|
rescue ArgumentError
|
23
23
|
return store_without_partitioning(df)
|
24
24
|
end
|
data/lib/easy_ml/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: easy_ml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0.pre.
|
4
|
+
version: 0.2.0.pre.rc68
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brett Shollenberger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-02-
|
11
|
+
date: 2025-02-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|
@@ -553,7 +553,6 @@ files:
|
|
553
553
|
- app/jobs/easy_ml/clean_job.rb
|
554
554
|
- app/jobs/easy_ml/compute_feature_job.rb
|
555
555
|
- app/jobs/easy_ml/deploy_job.rb
|
556
|
-
- app/jobs/easy_ml/finalize_feature_job.rb
|
557
556
|
- app/jobs/easy_ml/refresh_dataset_job.rb
|
558
557
|
- app/jobs/easy_ml/schedule_retraining_job.rb
|
559
558
|
- app/jobs/easy_ml/sync_datasource_job.rb
|