easy_ml 0.2.0.pre.rc63 → 0.2.0.pre.rc65
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/jobs/easy_ml/batch_job.rb +11 -2
- data/app/jobs/easy_ml/compute_feature_job.rb +15 -13
- data/app/jobs/easy_ml/refresh_dataset_job.rb +2 -2
- data/app/models/easy_ml/column/imputers/imputer.rb +0 -26
- data/app/models/easy_ml/column/selector.rb +1 -1
- data/app/models/easy_ml/column.rb +2 -1
- data/app/models/easy_ml/dataset.rb +1 -3
- data/app/models/easy_ml/feature.rb +42 -6
- data/lib/easy_ml/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d6993d639004ee88981816cf11422f458d2fa5caa121e760d075c7a73ae70195
|
4
|
+
data.tar.gz: 0e60804c7d59f8c3402be88b6b6ae5e24a7c9542875cbc29f606bbd500227b1f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b11150da87d6dafb5d0a71f0c9b8391012a388ba51eb17544dca044d8616b2d9898fa65ef37e7c8cb22f627669e648b23edc37f6582f5aa40d4619228c57ed02
|
7
|
+
data.tar.gz: 7fd29a43e9a2a15b3388e2c592fe5772d15e394103d7c0651fbe0404abd7bd8637e8ff32f3c38cac4220ce815ab49f369c5767d00dab0cd5ed060a924d3fa8bb
|
@@ -40,14 +40,23 @@ module EasyML
|
|
40
40
|
Resque.redis.rpush("batch:#{parent_id}:remaining", batch.to_json)
|
41
41
|
end
|
42
42
|
|
43
|
-
|
43
|
+
handle_batch(parent_id, batch)
|
44
|
+
end
|
45
|
+
|
46
|
+
def handle_batch(parent_id, batch)
|
47
|
+
if batch.size > 1
|
48
|
+
enqueue_batch(batch)
|
49
|
+
else
|
50
|
+
run_one_batch(parent_id, batch.first)
|
51
|
+
after_batch_hook(parent_id, batch)
|
52
|
+
end
|
44
53
|
end
|
45
54
|
|
46
55
|
def enqueue_next_batch(caller, parent_id)
|
47
56
|
next_batch = Resque.redis.lpop("batch:#{parent_id}:remaining")
|
48
57
|
payload = Resque.decode(next_batch)
|
49
58
|
|
50
|
-
caller.
|
59
|
+
caller.handle_batch(parent_id, payload)
|
51
60
|
end
|
52
61
|
|
53
62
|
def next_batch?(parent_id)
|
@@ -4,20 +4,18 @@ module EasyML
|
|
4
4
|
|
5
5
|
@queue = :easy_ml
|
6
6
|
|
7
|
-
def self.perform(batch_id,
|
7
|
+
def self.perform(batch_id, batch_args = {})
|
8
8
|
begin
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
#
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
feature.update(workflow_status: :analyzing) if feature.workflow_status == :ready
|
20
|
-
feature.fit_batch(options.merge!(batch_id: batch_id))
|
9
|
+
# This is very, very, very, very, very important
|
10
|
+
# if you don't dup the batch_args, resque-batched-job will
|
11
|
+
# fail in some non-obvious ways, because it will try to
|
12
|
+
# decode to match the original batch args EXACTLY.
|
13
|
+
#
|
14
|
+
# This will waste your time so please just don't remove this .dup!!!
|
15
|
+
#
|
16
|
+
# https://github.com/drfeelngood/resque-batched-job/blob/master/lib/resque/plugins/batched_job.rb#L86
|
17
|
+
batch_args = batch_args.dup
|
18
|
+
run_one_batch(batch_id, batch_args)
|
21
19
|
rescue => e
|
22
20
|
EasyML::Feature.transaction do
|
23
21
|
return if dataset.reload.workflow_status == :failed
|
@@ -29,6 +27,10 @@ module EasyML
|
|
29
27
|
end
|
30
28
|
end
|
31
29
|
|
30
|
+
def self.run_one_batch(batch_id, batch_args)
|
31
|
+
EasyML::Feature.fit_one_batch(batch_id, batch_args)
|
32
|
+
end
|
33
|
+
|
32
34
|
def self.build_error_with_context(dataset, error, batch_id, feature)
|
33
35
|
error = EasyML::Event.handle_error(dataset, error)
|
34
36
|
batch = feature.build_batch(batch_id: batch_id)
|
@@ -16,14 +16,14 @@ module EasyML
|
|
16
16
|
if dataset.features.needs_fit.any?
|
17
17
|
dataset.fit_features(async: true)
|
18
18
|
else
|
19
|
-
dataset.
|
19
|
+
dataset.after_fit_features
|
20
20
|
end
|
21
21
|
rescue StandardError => e
|
22
22
|
if Rails.env.test?
|
23
23
|
raise e
|
24
24
|
end
|
25
25
|
dataset.update(workflow_status: :failed)
|
26
|
-
handle_error(dataset, e)
|
26
|
+
EasyML::Event.handle_error(dataset, e)
|
27
27
|
end
|
28
28
|
end
|
29
29
|
end
|
@@ -8,7 +8,6 @@ module EasyML
|
|
8
8
|
@column = column
|
9
9
|
@dataset = column.dataset
|
10
10
|
@preprocessing_step = preprocessing_step.with_indifferent_access
|
11
|
-
validate_preprocessing_step!
|
12
11
|
end
|
13
12
|
|
14
13
|
def inspect
|
@@ -72,31 +71,6 @@ module EasyML
|
|
72
71
|
|
73
72
|
EasyML::Column::Imputers::OrdinalEncoder.new(column, preprocessing_step).decode_labels(df)
|
74
73
|
end
|
75
|
-
|
76
|
-
private
|
77
|
-
|
78
|
-
def validate_preprocessing_step!
|
79
|
-
validate_params!
|
80
|
-
validate_method!
|
81
|
-
end
|
82
|
-
|
83
|
-
def validate_params!
|
84
|
-
return unless preprocessing_step[:params]
|
85
|
-
|
86
|
-
preprocessing_step[:params].keys.each do |param|
|
87
|
-
unless Imputers.supported_params.include?(param.to_sym)
|
88
|
-
raise ArgumentError, "Unsupported preprocessing parameter '#{param}'. Supported parameters are: #{Imputers.supported_params.join(", ")}"
|
89
|
-
end
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
def validate_method!
|
94
|
-
return unless preprocessing_step[:method]
|
95
|
-
|
96
|
-
unless Imputers.supported_methods.include?(preprocessing_step[:method].to_sym)
|
97
|
-
raise ArgumentError, "Unsupported preprocessing method '#{preprocessing_step[:method]}'. Supported methods are: #{Imputers.supported_methods.join(", ")}"
|
98
|
-
end
|
99
|
-
end
|
100
74
|
end
|
101
75
|
end
|
102
76
|
end
|
@@ -206,9 +206,10 @@ module EasyML
|
|
206
206
|
|
207
207
|
def raw_dtype
|
208
208
|
return @raw_dtype if @raw_dtype
|
209
|
+
set_feature_lineage
|
209
210
|
|
210
211
|
if in_raw_dataset?
|
211
|
-
@raw_dtype = raw&.data&.to_series
|
212
|
+
@raw_dtype = raw&.data&.to_series.try(:dtype)
|
212
213
|
elsif already_computed?
|
213
214
|
@raw_dtype = processed&.data&.to_series&.dtype
|
214
215
|
end
|
@@ -202,7 +202,6 @@ module EasyML
|
|
202
202
|
prepare!
|
203
203
|
fit_features!(async: async)
|
204
204
|
end
|
205
|
-
after_fit_features unless async
|
206
205
|
end
|
207
206
|
|
208
207
|
def refresh(async: false)
|
@@ -212,7 +211,6 @@ module EasyML
|
|
212
211
|
prepare
|
213
212
|
fit_features(async: async)
|
214
213
|
end
|
215
|
-
after_fit_features unless async
|
216
214
|
end
|
217
215
|
|
218
216
|
def fit_features!(async: false, features: self.features)
|
@@ -227,11 +225,11 @@ module EasyML
|
|
227
225
|
end
|
228
226
|
|
229
227
|
def after_fit_features
|
228
|
+
puts "AFTER FIT FEATURES!"
|
230
229
|
unlock!
|
231
230
|
reload
|
232
231
|
return if failed?
|
233
232
|
|
234
|
-
features.update_all(needs_fit: false, fit_at: Time.current)
|
235
233
|
actually_refresh
|
236
234
|
end
|
237
235
|
|
@@ -232,13 +232,41 @@ module EasyML
|
|
232
232
|
if async && job_count > 1
|
233
233
|
EasyML::ComputeFeatureJob.enqueue_ordered_batches(jobs)
|
234
234
|
else
|
235
|
-
jobs.
|
236
|
-
|
235
|
+
jobs.each do |feature_batch|
|
236
|
+
feature_batch.each do |batch_args|
|
237
|
+
EasyML::ComputeFeatureJob.perform(nil, batch_args)
|
238
|
+
end
|
239
|
+
feature = EasyML::Feature.find(feature_batch.first.dig(:feature_id))
|
240
|
+
feature.after_fit
|
237
241
|
end
|
238
|
-
|
242
|
+
dataset.after_fit_features
|
239
243
|
end
|
240
244
|
end
|
241
245
|
|
246
|
+
def self.fit_one_batch(batch_id, batch_args = {})
|
247
|
+
batch_args.symbolize_keys!
|
248
|
+
feature_id = batch_args.dig(:feature_id)
|
249
|
+
feature = EasyML::Feature.find(feature_id)
|
250
|
+
dataset = feature.dataset
|
251
|
+
|
252
|
+
# Check if any feature has failed before proceeding
|
253
|
+
if dataset.features.any? { |f| f.workflow_status == "failed" }
|
254
|
+
return
|
255
|
+
end
|
256
|
+
feature.update(workflow_status: :analyzing) if feature.workflow_status == :ready
|
257
|
+
begin
|
258
|
+
feature.fit_batch(batch_args.merge!(batch_id: batch_id))
|
259
|
+
rescue => e
|
260
|
+
EasyML::Feature.fit_feature_failed(dataset, e)
|
261
|
+
raise e
|
262
|
+
end
|
263
|
+
end
|
264
|
+
|
265
|
+
def self.fit_feature_failed(dataset, e)
|
266
|
+
dataset.update(workflow_status: :failed)
|
267
|
+
EasyML::Event.handle_error(dataset, e)
|
268
|
+
end
|
269
|
+
|
242
270
|
# Fit a single batch, used for testing the user's feature implementation
|
243
271
|
def fit_batch(batch_args = {})
|
244
272
|
batch_args.symbolize_keys!
|
@@ -301,7 +329,11 @@ module EasyML
|
|
301
329
|
end
|
302
330
|
return if df.blank?
|
303
331
|
|
304
|
-
|
332
|
+
begin
|
333
|
+
batch_df = adapter.fit(df, self, batch_args)
|
334
|
+
rescue => e
|
335
|
+
raise "Feature #{feature_class}#fit failed: #{e.message}"
|
336
|
+
end
|
305
337
|
if batch_df.present?
|
306
338
|
store(batch_df)
|
307
339
|
else
|
@@ -315,7 +347,11 @@ module EasyML
|
|
315
347
|
return df if !adapter.respond_to?(:transform) && feature_store.empty?
|
316
348
|
|
317
349
|
df_len_was = df.shape[0]
|
318
|
-
|
350
|
+
begin
|
351
|
+
result = adapter.transform(df, self)
|
352
|
+
rescue => e
|
353
|
+
raise "Feature #{feature_class}#transform failed: #{e.message}"
|
354
|
+
end
|
319
355
|
raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
|
320
356
|
df_len_now = result.shape[0]
|
321
357
|
raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if df_len_now != df_len_was
|
@@ -412,7 +448,7 @@ module EasyML
|
|
412
448
|
|
413
449
|
def after_fit
|
414
450
|
updates = {
|
415
|
-
|
451
|
+
fit_at: Time.current,
|
416
452
|
needs_fit: false,
|
417
453
|
workflow_status: :ready,
|
418
454
|
}.compact
|
data/lib/easy_ml/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: easy_ml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0.pre.
|
4
|
+
version: 0.2.0.pre.rc65
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brett Shollenberger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-02-
|
11
|
+
date: 2025-02-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|