easy_ml 0.2.0.pre.rc63 → 0.2.0.pre.rc65

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 30ec865849f66547055d1835ccc18753308d9a6cd3c8a5862a01a20151b2e772
4
- data.tar.gz: df9c0eee094ea607370bd84f97f2f269919e08e07fac1a4e0a17f96bc0286093
3
+ metadata.gz: d6993d639004ee88981816cf11422f458d2fa5caa121e760d075c7a73ae70195
4
+ data.tar.gz: 0e60804c7d59f8c3402be88b6b6ae5e24a7c9542875cbc29f606bbd500227b1f
5
5
  SHA512:
6
- metadata.gz: 6241d23ced8da76c9ca7e6789bfeccc75824b9e217fa01850f6ab6475ee77716b222d4c0661e965d69de8b95d9719d15ed67a2f1835c359ca3d186d193058b2c
7
- data.tar.gz: a92fdc6934bf28d0f6a18f5356d8ac76f5ee0195f96fbed9110dc1862c976922ec7e1141b65e7050d80c1b63e5a90e7a5903147f9001167e7e12e7c553e9339c
6
+ metadata.gz: b11150da87d6dafb5d0a71f0c9b8391012a388ba51eb17544dca044d8616b2d9898fa65ef37e7c8cb22f627669e648b23edc37f6582f5aa40d4619228c57ed02
7
+ data.tar.gz: 7fd29a43e9a2a15b3388e2c592fe5772d15e394103d7c0651fbe0404abd7bd8637e8ff32f3c38cac4220ce815ab49f369c5767d00dab0cd5ed060a924d3fa8bb
@@ -40,14 +40,23 @@ module EasyML
40
40
  Resque.redis.rpush("batch:#{parent_id}:remaining", batch.to_json)
41
41
  end
42
42
 
43
- enqueue_batch(batch)
43
+ handle_batch(parent_id, batch)
44
+ end
45
+
46
+ def handle_batch(parent_id, batch)
47
+ if batch.size > 1
48
+ enqueue_batch(batch)
49
+ else
50
+ run_one_batch(parent_id, batch.first)
51
+ after_batch_hook(parent_id, batch)
52
+ end
44
53
  end
45
54
 
46
55
  def enqueue_next_batch(caller, parent_id)
47
56
  next_batch = Resque.redis.lpop("batch:#{parent_id}:remaining")
48
57
  payload = Resque.decode(next_batch)
49
58
 
50
- caller.enqueue_batch(payload)
59
+ caller.handle_batch(parent_id, payload)
51
60
  end
52
61
 
53
62
  def next_batch?(parent_id)
@@ -4,20 +4,18 @@ module EasyML
4
4
 
5
5
  @queue = :easy_ml
6
6
 
7
- def self.perform(batch_id, options = {})
7
+ def self.perform(batch_id, batch_args = {})
8
8
  begin
9
- options.symbolize_keys!
10
- feature_id = options.dig(:feature_id)
11
- feature = EasyML::Feature.find(feature_id)
12
- dataset = feature.dataset
13
-
14
- # Check if any feature has failed before proceeding
15
- if dataset.features.any? { |f| f.workflow_status == "failed" }
16
- return
17
- end
18
-
19
- feature.update(workflow_status: :analyzing) if feature.workflow_status == :ready
20
- feature.fit_batch(options.merge!(batch_id: batch_id))
9
+ # This is very, very, very, very, very important
10
+ # if you don't dup the batch_args, resque-batched-job will
11
+ # fail in some non-obvious ways, because it will try to
12
+ # decode to match the original batch args EXACTLY.
13
+ #
14
+ # This will waste your time so please just don't remove this .dup!!!
15
+ #
16
+ # https://github.com/drfeelngood/resque-batched-job/blob/master/lib/resque/plugins/batched_job.rb#L86
17
+ batch_args = batch_args.dup
18
+ run_one_batch(batch_id, batch_args)
21
19
  rescue => e
22
20
  EasyML::Feature.transaction do
23
21
  return if dataset.reload.workflow_status == :failed
@@ -29,6 +27,10 @@ module EasyML
29
27
  end
30
28
  end
31
29
 
30
+ def self.run_one_batch(batch_id, batch_args)
31
+ EasyML::Feature.fit_one_batch(batch_id, batch_args)
32
+ end
33
+
32
34
  def self.build_error_with_context(dataset, error, batch_id, feature)
33
35
  error = EasyML::Event.handle_error(dataset, error)
34
36
  batch = feature.build_batch(batch_id: batch_id)
@@ -16,14 +16,14 @@ module EasyML
16
16
  if dataset.features.needs_fit.any?
17
17
  dataset.fit_features(async: true)
18
18
  else
19
- dataset.actually_refresh
19
+ dataset.after_fit_features
20
20
  end
21
21
  rescue StandardError => e
22
22
  if Rails.env.test?
23
23
  raise e
24
24
  end
25
25
  dataset.update(workflow_status: :failed)
26
- handle_error(dataset, e)
26
+ EasyML::Event.handle_error(dataset, e)
27
27
  end
28
28
  end
29
29
  end
@@ -8,7 +8,6 @@ module EasyML
8
8
  @column = column
9
9
  @dataset = column.dataset
10
10
  @preprocessing_step = preprocessing_step.with_indifferent_access
11
- validate_preprocessing_step!
12
11
  end
13
12
 
14
13
  def inspect
@@ -72,31 +71,6 @@ module EasyML
72
71
 
73
72
  EasyML::Column::Imputers::OrdinalEncoder.new(column, preprocessing_step).decode_labels(df)
74
73
  end
75
-
76
- private
77
-
78
- def validate_preprocessing_step!
79
- validate_params!
80
- validate_method!
81
- end
82
-
83
- def validate_params!
84
- return unless preprocessing_step[:params]
85
-
86
- preprocessing_step[:params].keys.each do |param|
87
- unless Imputers.supported_params.include?(param.to_sym)
88
- raise ArgumentError, "Unsupported preprocessing parameter '#{param}'. Supported parameters are: #{Imputers.supported_params.join(", ")}"
89
- end
90
- end
91
- end
92
-
93
- def validate_method!
94
- return unless preprocessing_step[:method]
95
-
96
- unless Imputers.supported_methods.include?(preprocessing_step[:method].to_sym)
97
- raise ArgumentError, "Unsupported preprocessing method '#{preprocessing_step[:method]}'. Supported methods are: #{Imputers.supported_methods.join(", ")}"
98
- end
99
- end
100
74
  end
101
75
  end
102
76
  end
@@ -45,7 +45,7 @@ module EasyML
45
45
  end
46
46
 
47
47
  def data(**kwargs)
48
- if column.is_computed?
48
+ if column.is_computed? && !column.in_raw_dataset?
49
49
  Selector.new(column, :processed).send(:select, :data, **kwargs)
50
50
  else
51
51
  select(:data, **kwargs)
@@ -206,9 +206,10 @@ module EasyML
206
206
 
207
207
  def raw_dtype
208
208
  return @raw_dtype if @raw_dtype
209
+ set_feature_lineage
209
210
 
210
211
  if in_raw_dataset?
211
- @raw_dtype = raw&.data&.to_series&.dtype
212
+ @raw_dtype = raw&.data&.to_series.try(:dtype)
212
213
  elsif already_computed?
213
214
  @raw_dtype = processed&.data&.to_series&.dtype
214
215
  end
@@ -202,7 +202,6 @@ module EasyML
202
202
  prepare!
203
203
  fit_features!(async: async)
204
204
  end
205
- after_fit_features unless async
206
205
  end
207
206
 
208
207
  def refresh(async: false)
@@ -212,7 +211,6 @@ module EasyML
212
211
  prepare
213
212
  fit_features(async: async)
214
213
  end
215
- after_fit_features unless async
216
214
  end
217
215
 
218
216
  def fit_features!(async: false, features: self.features)
@@ -227,11 +225,11 @@ module EasyML
227
225
  end
228
226
 
229
227
  def after_fit_features
228
+ puts "AFTER FIT FEATURES!"
230
229
  unlock!
231
230
  reload
232
231
  return if failed?
233
232
 
234
- features.update_all(needs_fit: false, fit_at: Time.current)
235
233
  actually_refresh
236
234
  end
237
235
 
@@ -232,13 +232,41 @@ module EasyML
232
232
  if async && job_count > 1
233
233
  EasyML::ComputeFeatureJob.enqueue_ordered_batches(jobs)
234
234
  else
235
- jobs.flatten.each do |job|
236
- EasyML::ComputeFeatureJob.perform(nil, job)
235
+ jobs.each do |feature_batch|
236
+ feature_batch.each do |batch_args|
237
+ EasyML::ComputeFeatureJob.perform(nil, batch_args)
238
+ end
239
+ feature = EasyML::Feature.find(feature_batch.first.dig(:feature_id))
240
+ feature.after_fit
237
241
  end
238
- features.each(&:after_fit) unless features.any?(&:failed?)
242
+ dataset.after_fit_features
239
243
  end
240
244
  end
241
245
 
246
+ def self.fit_one_batch(batch_id, batch_args = {})
247
+ batch_args.symbolize_keys!
248
+ feature_id = batch_args.dig(:feature_id)
249
+ feature = EasyML::Feature.find(feature_id)
250
+ dataset = feature.dataset
251
+
252
+ # Check if any feature has failed before proceeding
253
+ if dataset.features.any? { |f| f.workflow_status == "failed" }
254
+ return
255
+ end
256
+ feature.update(workflow_status: :analyzing) if feature.workflow_status == :ready
257
+ begin
258
+ feature.fit_batch(batch_args.merge!(batch_id: batch_id))
259
+ rescue => e
260
+ EasyML::Feature.fit_feature_failed(dataset, e)
261
+ raise e
262
+ end
263
+ end
264
+
265
+ def self.fit_feature_failed(dataset, e)
266
+ dataset.update(workflow_status: :failed)
267
+ EasyML::Event.handle_error(dataset, e)
268
+ end
269
+
242
270
  # Fit a single batch, used for testing the user's feature implementation
243
271
  def fit_batch(batch_args = {})
244
272
  batch_args.symbolize_keys!
@@ -301,7 +329,11 @@ module EasyML
301
329
  end
302
330
  return if df.blank?
303
331
 
304
- batch_df = adapter.fit(df, self, batch_args)
332
+ begin
333
+ batch_df = adapter.fit(df, self, batch_args)
334
+ rescue => e
335
+ raise "Feature #{feature_class}#fit failed: #{e.message}"
336
+ end
305
337
  if batch_df.present?
306
338
  store(batch_df)
307
339
  else
@@ -315,7 +347,11 @@ module EasyML
315
347
  return df if !adapter.respond_to?(:transform) && feature_store.empty?
316
348
 
317
349
  df_len_was = df.shape[0]
318
- result = adapter.transform(df, self)
350
+ begin
351
+ result = adapter.transform(df, self)
352
+ rescue => e
353
+ raise "Feature #{feature_class}#transform failed: #{e.message}"
354
+ end
319
355
  raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
320
356
  df_len_now = result.shape[0]
321
357
  raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if df_len_now != df_len_was
@@ -412,7 +448,7 @@ module EasyML
412
448
 
413
449
  def after_fit
414
450
  updates = {
415
- applied_at: Time.current,
451
+ fit_at: Time.current,
416
452
  needs_fit: false,
417
453
  workflow_status: :ready,
418
454
  }.compact
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EasyML
4
- VERSION = "0.2.0-rc63"
4
+ VERSION = "0.2.0-rc65"
5
5
 
6
6
  module Version
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: easy_ml
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0.pre.rc63
4
+ version: 0.2.0.pre.rc65
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brett Shollenberger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-02-06 00:00:00.000000000 Z
11
+ date: 2025-02-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activerecord