easy_ml 0.2.0.pre.rc63 → 0.2.0.pre.rc68

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 30ec865849f66547055d1835ccc18753308d9a6cd3c8a5862a01a20151b2e772
4
- data.tar.gz: df9c0eee094ea607370bd84f97f2f269919e08e07fac1a4e0a17f96bc0286093
3
+ metadata.gz: 2404bc2b6613d95627dc89265e76351f648d9b45e65f5935a71a0e2334d08d2c
4
+ data.tar.gz: 072f1d58d6e5d7864e3cdb365701c6fb6e635bc20e3e9ef415fe52772671ce76
5
5
  SHA512:
6
- metadata.gz: 6241d23ced8da76c9ca7e6789bfeccc75824b9e217fa01850f6ab6475ee77716b222d4c0661e965d69de8b95d9719d15ed67a2f1835c359ca3d186d193058b2c
7
- data.tar.gz: a92fdc6934bf28d0f6a18f5356d8ac76f5ee0195f96fbed9110dc1862c976922ec7e1141b65e7050d80c1b63e5a90e7a5903147f9001167e7e12e7c553e9339c
6
+ metadata.gz: d4e8ab597db1470630e555448fb057af07fab38ff5125fd7ce548db5ba71ca7abac26d1761fb892459890b5c8bce169cd9e165d80b4b91ec6a806dda1a36cbe5
7
+ data.tar.gz: 71cab456d196c15e03b16e2677cb3b64e08527b2b4acc20ac8ee6c0f0ceea671b74df0d61c174a914cfa85f7e476ad77f264ae78e6b604ed8f50ca2756d84956
@@ -40,14 +40,23 @@ module EasyML
40
40
  Resque.redis.rpush("batch:#{parent_id}:remaining", batch.to_json)
41
41
  end
42
42
 
43
- enqueue_batch(batch)
43
+ handle_batch(parent_id, batch)
44
+ end
45
+
46
+ def handle_batch(parent_id, batch)
47
+ if batch.size > 1
48
+ enqueue_batch(batch)
49
+ else
50
+ run_one_batch(parent_id, batch.first)
51
+ after_batch_hook(parent_id, batch)
52
+ end
44
53
  end
45
54
 
46
55
  def enqueue_next_batch(caller, parent_id)
47
56
  next_batch = Resque.redis.lpop("batch:#{parent_id}:remaining")
48
57
  payload = Resque.decode(next_batch)
49
58
 
50
- caller.enqueue_batch(payload)
59
+ caller.handle_batch(parent_id, payload)
51
60
  end
52
61
 
53
62
  def next_batch?(parent_id)
@@ -4,37 +4,21 @@ module EasyML
4
4
 
5
5
  @queue = :easy_ml
6
6
 
7
- def self.perform(batch_id, options = {})
8
- begin
9
- options.symbolize_keys!
10
- feature_id = options.dig(:feature_id)
11
- feature = EasyML::Feature.find(feature_id)
12
- dataset = feature.dataset
13
-
14
- # Check if any feature has failed before proceeding
15
- if dataset.features.any? { |f| f.workflow_status == "failed" }
16
- return
17
- end
18
-
19
- feature.update(workflow_status: :analyzing) if feature.workflow_status == :ready
20
- feature.fit_batch(options.merge!(batch_id: batch_id))
21
- rescue => e
22
- EasyML::Feature.transaction do
23
- return if dataset.reload.workflow_status == :failed
24
-
25
- feature.update(workflow_status: :failed)
26
- dataset.update(workflow_status: :failed)
27
- build_error_with_context(dataset, e, batch_id, feature)
28
- end
29
- end
7
+ def self.perform(batch_id, batch_args = {})
8
+ # This is very, very, very, very, very important
9
+ # if you don't dup the batch_args, resque-batched-job will
10
+ # fail in some non-obvious ways, because it will try to
11
+ # decode to match the original batch args EXACTLY.
12
+ #
13
+ # This will waste your time so please just don't remove this .dup!!!
14
+ #
15
+ # https://github.com/drfeelngood/resque-batched-job/blob/master/lib/resque/plugins/batched_job.rb#L86
16
+ batch_args = batch_args.dup
17
+ run_one_batch(batch_id, batch_args)
30
18
  end
31
19
 
32
- def self.build_error_with_context(dataset, error, batch_id, feature)
33
- error = EasyML::Event.handle_error(dataset, error)
34
- batch = feature.build_batch(batch_id: batch_id)
35
-
36
- # Convert any dataframes in the context to serialized form
37
- error.create_context(context: batch)
20
+ def self.run_one_batch(batch_id, batch_args)
21
+ EasyML::Feature.fit_one_batch(batch_id, batch_args)
38
22
  end
39
23
 
40
24
  def self.after_batch_hook(batch_id, *args)
@@ -16,14 +16,14 @@ module EasyML
16
16
  if dataset.features.needs_fit.any?
17
17
  dataset.fit_features(async: true)
18
18
  else
19
- dataset.actually_refresh
19
+ dataset.after_fit_features
20
20
  end
21
21
  rescue StandardError => e
22
22
  if Rails.env.test?
23
23
  raise e
24
24
  end
25
25
  dataset.update(workflow_status: :failed)
26
- handle_error(dataset, e)
26
+ EasyML::Event.handle_error(dataset, e)
27
27
  end
28
28
  end
29
29
  end
@@ -8,7 +8,6 @@ module EasyML
8
8
  @column = column
9
9
  @dataset = column.dataset
10
10
  @preprocessing_step = preprocessing_step.with_indifferent_access
11
- validate_preprocessing_step!
12
11
  end
13
12
 
14
13
  def inspect
@@ -72,31 +71,6 @@ module EasyML
72
71
 
73
72
  EasyML::Column::Imputers::OrdinalEncoder.new(column, preprocessing_step).decode_labels(df)
74
73
  end
75
-
76
- private
77
-
78
- def validate_preprocessing_step!
79
- validate_params!
80
- validate_method!
81
- end
82
-
83
- def validate_params!
84
- return unless preprocessing_step[:params]
85
-
86
- preprocessing_step[:params].keys.each do |param|
87
- unless Imputers.supported_params.include?(param.to_sym)
88
- raise ArgumentError, "Unsupported preprocessing parameter '#{param}'. Supported parameters are: #{Imputers.supported_params.join(", ")}"
89
- end
90
- end
91
- end
92
-
93
- def validate_method!
94
- return unless preprocessing_step[:method]
95
-
96
- unless Imputers.supported_methods.include?(preprocessing_step[:method].to_sym)
97
- raise ArgumentError, "Unsupported preprocessing method '#{preprocessing_step[:method]}'. Supported methods are: #{Imputers.supported_methods.join(", ")}"
98
- end
99
- end
100
74
  end
101
75
  end
102
76
  end
@@ -45,7 +45,7 @@ module EasyML
45
45
  end
46
46
 
47
47
  def data(**kwargs)
48
- if column.is_computed?
48
+ if column.is_computed? && !column.in_raw_dataset?
49
49
  Selector.new(column, :processed).send(:select, :data, **kwargs)
50
50
  else
51
51
  select(:data, **kwargs)
@@ -43,6 +43,7 @@ module EasyML
43
43
  before_save :set_defaults
44
44
  before_save :set_feature_lineage
45
45
  before_save :set_polars_datatype
46
+ after_find :ensure_feature_exists
46
47
 
47
48
  # Scopes
48
49
  scope :visible, -> { where(hidden: false) }
@@ -108,6 +109,13 @@ module EasyML
108
109
  }
109
110
  scope :is_learning, -> { where(is_learning: true) }
110
111
 
112
+ def ensure_feature_exists
113
+ if feature && !feature.has_code?
114
+ feature.destroy
115
+ update(feature_id: nil)
116
+ end
117
+ end
118
+
111
119
  def display_attributes
112
120
  attributes.except(:statistics)
113
121
  end
@@ -206,9 +214,10 @@ module EasyML
206
214
 
207
215
  def raw_dtype
208
216
  return @raw_dtype if @raw_dtype
217
+ set_feature_lineage
209
218
 
210
219
  if in_raw_dataset?
211
- @raw_dtype = raw&.data&.to_series&.dtype
220
+ @raw_dtype = raw&.data&.to_series.try(:dtype)
212
221
  elsif already_computed?
213
222
  @raw_dtype = processed&.data&.to_series&.dtype
214
223
  end
@@ -284,6 +293,8 @@ module EasyML
284
293
  alias_method :feature, :computing_feature
285
294
 
286
295
  def set_feature_lineage
296
+ return if dataset.nil?
297
+
287
298
  if dataset.features.computed_column_names.include?(name)
288
299
  if computed_by.nil?
289
300
  assign_attributes(
@@ -202,7 +202,6 @@ module EasyML
202
202
  prepare!
203
203
  fit_features!(async: async)
204
204
  end
205
- after_fit_features unless async
206
205
  end
207
206
 
208
207
  def refresh(async: false)
@@ -212,7 +211,6 @@ module EasyML
212
211
  prepare
213
212
  fit_features(async: async)
214
213
  end
215
- after_fit_features unless async
216
214
  end
217
215
 
218
216
  def fit_features!(async: false, features: self.features)
@@ -221,7 +219,7 @@ module EasyML
221
219
 
222
220
  def fit_features(async: false, features: self.features, force: false)
223
221
  features_to_compute = force ? features : features.needs_fit
224
- return if features_to_compute.empty?
222
+ return after_fit_features if features_to_compute.empty?
225
223
 
226
224
  features.first.fit(features: features_to_compute, async: async)
227
225
  end
@@ -231,7 +229,6 @@ module EasyML
231
229
  reload
232
230
  return if failed?
233
231
 
234
- features.update_all(needs_fit: false, fit_at: Time.current)
235
232
  actually_refresh
236
233
  end
237
234
 
@@ -82,7 +82,7 @@ module EasyML
82
82
  where(id: fittable.map(&:id))
83
83
  end
84
84
  scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
85
- scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
85
+ scope :ready_to_apply, -> { where(needs_fit: false).where.not(id: has_changes.map(&:id)) }
86
86
 
87
87
  before_save :apply_defaults, if: :new_record?
88
88
  before_save :update_sha
@@ -95,6 +95,10 @@ module EasyML
95
95
  raise InvalidFeatureError, "Invalid feature class: #{feature_class}"
96
96
  end
97
97
 
98
+ def has_code?
99
+ feature_klass.present?
100
+ end
101
+
98
102
  def adapter
99
103
  @adapter ||= feature_klass.new
100
104
  end
@@ -232,13 +236,54 @@ module EasyML
232
236
  if async && job_count > 1
233
237
  EasyML::ComputeFeatureJob.enqueue_ordered_batches(jobs)
234
238
  else
235
- jobs.flatten.each do |job|
236
- EasyML::ComputeFeatureJob.perform(nil, job)
239
+ jobs.each do |feature_batch|
240
+ feature_batch.each do |batch_args|
241
+ EasyML::ComputeFeatureJob.perform(nil, batch_args)
242
+ end
243
+ feature = EasyML::Feature.find(feature_batch.first.dig(:feature_id))
244
+ feature.after_fit
237
245
  end
238
- features.each(&:after_fit) unless features.any?(&:failed?)
246
+ dataset.after_fit_features
239
247
  end
240
248
  end
241
249
 
250
+ def self.fit_one_batch(batch_id, batch_args = {})
251
+ batch_args.symbolize_keys!
252
+ feature_id = batch_args.dig(:feature_id)
253
+ feature = EasyML::Feature.find(feature_id)
254
+ dataset = feature.dataset
255
+
256
+ # Check if any feature has failed before proceeding
257
+ return if dataset.features.any? { |f| f.workflow_status == "failed" }
258
+
259
+ feature.update(workflow_status: :analyzing) if feature.workflow_status == :ready
260
+ begin
261
+ feature.fit_batch(batch_args.merge!(batch_id: batch_id))
262
+ rescue => e
263
+ EasyML::Feature.transaction do
264
+ return if dataset.reload.workflow_status == :failed
265
+
266
+ feature.update(workflow_status: :failed)
267
+ dataset.update(workflow_status: :failed)
268
+ build_error_with_context(dataset, e, batch_id, feature)
269
+ end
270
+ raise e
271
+ end
272
+ end
273
+
274
+ def self.build_error_with_context(dataset, error, batch_id, feature)
275
+ error = EasyML::Event.handle_error(dataset, error)
276
+ batch = feature.build_batch(batch_id: batch_id)
277
+
278
+ # Convert any dataframes in the context to serialized form
279
+ error.create_context(context: batch)
280
+ end
281
+
282
+ def self.fit_feature_failed(dataset, e)
283
+ dataset.update(workflow_status: :failed)
284
+ EasyML::Event.handle_error(dataset, e)
285
+ end
286
+
242
287
  # Fit a single batch, used for testing the user's feature implementation
243
288
  def fit_batch(batch_args = {})
244
289
  batch_args.symbolize_keys!
@@ -301,7 +346,11 @@ module EasyML
301
346
  end
302
347
  return if df.blank?
303
348
 
304
- batch_df = adapter.fit(df, self, batch_args)
349
+ begin
350
+ batch_df = adapter.fit(df, self, batch_args)
351
+ rescue => e
352
+ raise "Feature #{feature_class}#fit failed: #{e.message}"
353
+ end
305
354
  if batch_df.present?
306
355
  store(batch_df)
307
356
  else
@@ -315,7 +364,11 @@ module EasyML
315
364
  return df if !adapter.respond_to?(:transform) && feature_store.empty?
316
365
 
317
366
  df_len_was = df.shape[0]
318
- result = adapter.transform(df, self)
367
+ begin
368
+ result = adapter.transform(df, self)
369
+ rescue => e
370
+ raise "Feature #{feature_class}#transform failed: #{e.message}"
371
+ end
319
372
  raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
320
373
  df_len_now = result.shape[0]
321
374
  raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if df_len_now != df_len_was
@@ -411,8 +464,10 @@ module EasyML
411
464
  end
412
465
 
413
466
  def after_fit
467
+ update_sha
468
+
414
469
  updates = {
415
- applied_at: Time.current,
470
+ fit_at: Time.current,
416
471
  needs_fit: false,
417
472
  workflow_status: :ready,
418
473
  }.compact
@@ -472,7 +527,11 @@ module EasyML
472
527
  end
473
528
 
474
529
  def feature_klass
475
- @feature_klass ||= EasyML::Features::Registry.find(feature_class.to_s).dig(:feature_class).constantize
530
+ begin
531
+ @feature_klass ||= EasyML::Features::Registry.find(feature_class.to_s).dig(:feature_class).constantize
532
+ rescue => e
533
+ nil
534
+ end
476
535
  end
477
536
 
478
537
  def config
@@ -15,10 +15,10 @@ module EasyML
15
15
  max_key = df[primary_key].max
16
16
  batch_size = feature.batch_size || 10_000
17
17
 
18
- # Try to parse as integers if they're strings
19
18
  begin
20
- min_key = min_key.to_i if min_key.is_a?(String)
21
- max_key = max_key.to_i if max_key.is_a?(String)
19
+ # We are intentionally not using to_i, so it will raise an error for keys like "A1"
20
+ min_key = Integer(min_key) if min_key.is_a?(String)
21
+ max_key = Integer(max_key) if max_key.is_a?(String)
22
22
  rescue ArgumentError
23
23
  return store_without_partitioning(df)
24
24
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EasyML
4
- VERSION = "0.2.0-rc63"
4
+ VERSION = "0.2.0-rc68"
5
5
 
6
6
  module Version
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: easy_ml
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0.pre.rc63
4
+ version: 0.2.0.pre.rc68
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brett Shollenberger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-02-06 00:00:00.000000000 Z
11
+ date: 2025-02-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activerecord
@@ -553,7 +553,6 @@ files:
553
553
  - app/jobs/easy_ml/clean_job.rb
554
554
  - app/jobs/easy_ml/compute_feature_job.rb
555
555
  - app/jobs/easy_ml/deploy_job.rb
556
- - app/jobs/easy_ml/finalize_feature_job.rb
557
556
  - app/jobs/easy_ml/refresh_dataset_job.rb
558
557
  - app/jobs/easy_ml/schedule_retraining_job.rb
559
558
  - app/jobs/easy_ml/sync_datasource_job.rb
@@ -1,13 +0,0 @@
1
- module EasyML
2
- class FinalizeFeatureJob < ApplicationJob
3
- queue_as :features
4
-
5
- def perform(feature_id)
6
- feature = EasyML::Feature.find(feature_id)
7
- feature.update!(
8
- applied_at: Time.current,
9
- needs_fit: false,
10
- )
11
- end
12
- end
13
- end