easy_ml 0.2.0.pre.rc62 → 0.2.0.pre.rc65

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ecfdf6e2547bfaeab500da7ce4cc8ba417dbab1eac64ed74c422fa1c3bf3db5c
4
- data.tar.gz: be43f29853adc807cf9acb6ca92aee54d237d17fb3e858ca39683227ab976241
3
+ metadata.gz: d6993d639004ee88981816cf11422f458d2fa5caa121e760d075c7a73ae70195
4
+ data.tar.gz: 0e60804c7d59f8c3402be88b6b6ae5e24a7c9542875cbc29f606bbd500227b1f
5
5
  SHA512:
6
- metadata.gz: 7f1c9ae555b2edc3957836395630c1515ad47f2429f1037e38b06c8a14e48a46fdc460fa7816c9686c7590a21ddfe92ff93f84068a8e5dff8fc36adc81fdbd14
7
- data.tar.gz: e1f132a13141f76f4af4ae22893f4f6d2451b91bbaa3fcc344c3f49eb35bac21f13325460b002f84913e903b359314949817307c90cfe9ba6c0a622e5e0ea163
6
+ metadata.gz: b11150da87d6dafb5d0a71f0c9b8391012a388ba51eb17544dca044d8616b2d9898fa65ef37e7c8cb22f627669e648b23edc37f6582f5aa40d4619228c57ed02
7
+ data.tar.gz: 7fd29a43e9a2a15b3388e2c592fe5772d15e394103d7c0651fbe0404abd7bd8637e8ff32f3c38cac4220ce815ab49f369c5767d00dab0cd5ed060a924d3fa8bb
@@ -40,14 +40,23 @@ module EasyML
40
40
  Resque.redis.rpush("batch:#{parent_id}:remaining", batch.to_json)
41
41
  end
42
42
 
43
- enqueue_batch(batch)
43
+ handle_batch(parent_id, batch)
44
+ end
45
+
46
+ def handle_batch(parent_id, batch)
47
+ if batch.size > 1
48
+ enqueue_batch(batch)
49
+ else
50
+ run_one_batch(parent_id, batch.first)
51
+ after_batch_hook(parent_id, batch)
52
+ end
44
53
  end
45
54
 
46
55
  def enqueue_next_batch(caller, parent_id)
47
56
  next_batch = Resque.redis.lpop("batch:#{parent_id}:remaining")
48
57
  payload = Resque.decode(next_batch)
49
58
 
50
- caller.enqueue_batch(payload)
59
+ caller.handle_batch(parent_id, payload)
51
60
  end
52
61
 
53
62
  def next_batch?(parent_id)
@@ -4,20 +4,18 @@ module EasyML
4
4
 
5
5
  @queue = :easy_ml
6
6
 
7
- def self.perform(batch_id, options = {})
7
+ def self.perform(batch_id, batch_args = {})
8
8
  begin
9
- options.symbolize_keys!
10
- feature_id = options.dig(:feature_id)
11
- feature = EasyML::Feature.find(feature_id)
12
- dataset = feature.dataset
13
-
14
- # Check if any feature has failed before proceeding
15
- if dataset.features.any? { |f| f.workflow_status == "failed" }
16
- return
17
- end
18
-
19
- feature.update(workflow_status: :analyzing) if feature.workflow_status == :ready
20
- feature.fit_batch(options.merge!(batch_id: batch_id))
9
+ # This is very, very, very, very, very important
10
+ # if you don't dup the batch_args, resque-batched-job will
11
+ # fail in some non-obvious ways, because it will try to
12
+ # decode to match the original batch args EXACTLY.
13
+ #
14
+ # This will waste your time so please just don't remove this .dup!!!
15
+ #
16
+ # https://github.com/drfeelngood/resque-batched-job/blob/master/lib/resque/plugins/batched_job.rb#L86
17
+ batch_args = batch_args.dup
18
+ run_one_batch(batch_id, batch_args)
21
19
  rescue => e
22
20
  EasyML::Feature.transaction do
23
21
  return if dataset.reload.workflow_status == :failed
@@ -29,6 +27,10 @@ module EasyML
29
27
  end
30
28
  end
31
29
 
30
+ def self.run_one_batch(batch_id, batch_args)
31
+ EasyML::Feature.fit_one_batch(batch_id, batch_args)
32
+ end
33
+
32
34
  def self.build_error_with_context(dataset, error, batch_id, feature)
33
35
  error = EasyML::Event.handle_error(dataset, error)
34
36
  batch = feature.build_batch(batch_id: batch_id)
@@ -16,14 +16,14 @@ module EasyML
16
16
  if dataset.features.needs_fit.any?
17
17
  dataset.fit_features(async: true)
18
18
  else
19
- dataset.actually_refresh
19
+ dataset.after_fit_features
20
20
  end
21
21
  rescue StandardError => e
22
22
  if Rails.env.test?
23
23
  raise e
24
24
  end
25
25
  dataset.update(workflow_status: :failed)
26
- handle_error(dataset, e)
26
+ EasyML::Event.handle_error(dataset, e)
27
27
  end
28
28
  end
29
29
  end
@@ -8,7 +8,6 @@ module EasyML
8
8
  @column = column
9
9
  @dataset = column.dataset
10
10
  @preprocessing_step = preprocessing_step.with_indifferent_access
11
- validate_preprocessing_step!
12
11
  end
13
12
 
14
13
  def inspect
@@ -72,31 +71,6 @@ module EasyML
72
71
 
73
72
  EasyML::Column::Imputers::OrdinalEncoder.new(column, preprocessing_step).decode_labels(df)
74
73
  end
75
-
76
- private
77
-
78
- def validate_preprocessing_step!
79
- validate_params!
80
- validate_method!
81
- end
82
-
83
- def validate_params!
84
- return unless preprocessing_step[:params]
85
-
86
- preprocessing_step[:params].keys.each do |param|
87
- unless Imputers.supported_params.include?(param.to_sym)
88
- raise ArgumentError, "Unsupported preprocessing parameter '#{param}'. Supported parameters are: #{Imputers.supported_params.join(", ")}"
89
- end
90
- end
91
- end
92
-
93
- def validate_method!
94
- return unless preprocessing_step[:method]
95
-
96
- unless Imputers.supported_methods.include?(preprocessing_step[:method].to_sym)
97
- raise ArgumentError, "Unsupported preprocessing method '#{preprocessing_step[:method]}'. Supported methods are: #{Imputers.supported_methods.join(", ")}"
98
- end
99
- end
100
74
  end
101
75
  end
102
76
  end
@@ -11,7 +11,7 @@ module EasyML
11
11
  end
12
12
 
13
13
  def timestamp
14
- column.feature.fit_at
14
+ column.feature.fit_at || column.feature.applied_at
15
15
  end
16
16
 
17
17
  def check
@@ -45,7 +45,7 @@ module EasyML
45
45
  end
46
46
 
47
47
  def data(**kwargs)
48
- if column.is_computed?
48
+ if column.is_computed? && !column.in_raw_dataset?
49
49
  Selector.new(column, :processed).send(:select, :data, **kwargs)
50
50
  else
51
51
  select(:data, **kwargs)
@@ -141,7 +141,13 @@ module EasyML
141
141
  assign_attributes(datatype: processed.data.to_series.dtype)
142
142
  end
143
143
  set_sample_values
144
- assign_attributes(statistics: (read_attribute(:statistics) || {}).symbolize_keys.merge!(learner.learn(type: type).symbolize_keys))
144
+ new_stats = learner.learn(type: type).symbolize_keys
145
+
146
+ if !in_raw_dataset?
147
+ new_stats[:raw] = new_stats[:processed]
148
+ end
149
+
150
+ assign_attributes(statistics: (read_attribute(:statistics) || {}).symbolize_keys.merge!(new_stats))
145
151
  assign_attributes(
146
152
  learned_at: UTC.now,
147
153
  last_datasource_sha: dataset.last_datasource_sha,
@@ -200,9 +206,10 @@ module EasyML
200
206
 
201
207
  def raw_dtype
202
208
  return @raw_dtype if @raw_dtype
209
+ set_feature_lineage
203
210
 
204
211
  if in_raw_dataset?
205
- @raw_dtype = raw&.data&.to_series&.dtype
212
+ @raw_dtype = raw&.data&.to_series.try(:dtype)
206
213
  elsif already_computed?
207
214
  @raw_dtype = processed&.data&.to_series&.dtype
208
215
  end
@@ -410,9 +417,9 @@ module EasyML
410
417
 
411
418
  case datatype&.to_sym
412
419
  when :float
413
- Float(value)
420
+ value.to_f
414
421
  when :integer
415
- Integer(value)
422
+ value.to_i
416
423
  when :boolean
417
424
  ActiveModel::Type::Boolean.new.cast(value)
418
425
  when :datetime
@@ -202,7 +202,6 @@ module EasyML
202
202
  prepare!
203
203
  fit_features!(async: async)
204
204
  end
205
- after_fit_features unless async
206
205
  end
207
206
 
208
207
  def refresh(async: false)
@@ -212,7 +211,6 @@ module EasyML
212
211
  prepare
213
212
  fit_features(async: async)
214
213
  end
215
- after_fit_features unless async
216
214
  end
217
215
 
218
216
  def fit_features!(async: false, features: self.features)
@@ -227,11 +225,11 @@ module EasyML
227
225
  end
228
226
 
229
227
  def after_fit_features
228
+ puts "AFTER FIT FEATURES!"
230
229
  unlock!
231
230
  reload
232
231
  return if failed?
233
232
 
234
- features.update_all(needs_fit: false, fit_at: Time.current)
235
233
  actually_refresh
236
234
  end
237
235
 
@@ -232,13 +232,41 @@ module EasyML
232
232
  if async && job_count > 1
233
233
  EasyML::ComputeFeatureJob.enqueue_ordered_batches(jobs)
234
234
  else
235
- jobs.flatten.each do |job|
236
- EasyML::ComputeFeatureJob.perform(nil, job)
235
+ jobs.each do |feature_batch|
236
+ feature_batch.each do |batch_args|
237
+ EasyML::ComputeFeatureJob.perform(nil, batch_args)
238
+ end
239
+ feature = EasyML::Feature.find(feature_batch.first.dig(:feature_id))
240
+ feature.after_fit
237
241
  end
238
- features.each(&:after_fit) unless features.any?(&:failed?)
242
+ dataset.after_fit_features
239
243
  end
240
244
  end
241
245
 
246
+ def self.fit_one_batch(batch_id, batch_args = {})
247
+ batch_args.symbolize_keys!
248
+ feature_id = batch_args.dig(:feature_id)
249
+ feature = EasyML::Feature.find(feature_id)
250
+ dataset = feature.dataset
251
+
252
+ # Check if any feature has failed before proceeding
253
+ if dataset.features.any? { |f| f.workflow_status == "failed" }
254
+ return
255
+ end
256
+ feature.update(workflow_status: :analyzing) if feature.workflow_status == :ready
257
+ begin
258
+ feature.fit_batch(batch_args.merge!(batch_id: batch_id))
259
+ rescue => e
260
+ EasyML::Feature.fit_feature_failed(dataset, e)
261
+ raise e
262
+ end
263
+ end
264
+
265
+ def self.fit_feature_failed(dataset, e)
266
+ dataset.update(workflow_status: :failed)
267
+ EasyML::Event.handle_error(dataset, e)
268
+ end
269
+
242
270
  # Fit a single batch, used for testing the user's feature implementation
243
271
  def fit_batch(batch_args = {})
244
272
  batch_args.symbolize_keys!
@@ -301,7 +329,11 @@ module EasyML
301
329
  end
302
330
  return if df.blank?
303
331
 
304
- batch_df = adapter.fit(df, self, batch_args)
332
+ begin
333
+ batch_df = adapter.fit(df, self, batch_args)
334
+ rescue => e
335
+ raise "Feature #{feature_class}#fit failed: #{e.message}"
336
+ end
305
337
  if batch_df.present?
306
338
  store(batch_df)
307
339
  else
@@ -315,7 +347,11 @@ module EasyML
315
347
  return df if !adapter.respond_to?(:transform) && feature_store.empty?
316
348
 
317
349
  df_len_was = df.shape[0]
318
- result = adapter.transform(df, self)
350
+ begin
351
+ result = adapter.transform(df, self)
352
+ rescue => e
353
+ raise "Feature #{feature_class}#transform failed: #{e.message}"
354
+ end
319
355
  raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
320
356
  df_len_now = result.shape[0]
321
357
  raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if df_len_now != df_len_was
@@ -412,7 +448,7 @@ module EasyML
412
448
 
413
449
  def after_fit
414
450
  updates = {
415
- applied_at: Time.current,
451
+ fit_at: Time.current,
416
452
  needs_fit: false,
417
453
  workflow_status: :ready,
418
454
  }.compact
@@ -17,8 +17,8 @@ module EasyML
17
17
 
18
18
  # Try to parse as integers if they're strings
19
19
  begin
20
- min_key = Integer(min_key) if min_key.is_a?(String)
21
- max_key = Integer(max_key) if max_key.is_a?(String)
20
+ min_key = min_key.to_i if min_key.is_a?(String)
21
+ max_key = max_key.to_i if max_key.is_a?(String)
22
22
  rescue ArgumentError
23
23
  return store_without_partitioning(df)
24
24
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EasyML
4
- VERSION = "0.2.0-rc62"
4
+ VERSION = "0.2.0-rc65"
5
5
 
6
6
  module Version
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: easy_ml
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0.pre.rc62
4
+ version: 0.2.0.pre.rc65
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brett Shollenberger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-02-06 00:00:00.000000000 Z
11
+ date: 2025-02-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activerecord