easy_ml 0.2.0.pre.rc65 → 0.2.0.pre.rc69
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +4 -4
- data/app/jobs/easy_ml/compute_feature_job.rb +11 -28
- data/app/models/easy_ml/column.rb +10 -2
- data/app/models/easy_ml/dataset.rb +8 -3
- data/app/models/easy_ml/datasource.rb +2 -0
- data/app/models/easy_ml/feature.rb +29 -6
- data/lib/easy_ml/feature_store.rb +3 -3
- data/lib/easy_ml/version.rb +1 -1
- data/public/easy_ml/assets/.vite/manifest.json +1 -1
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-v1q2Ux1T.js → Application.tsx-CibZcrBc.js} +36 -36
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-v1q2Ux1T.js.map → Application.tsx-CibZcrBc.js.map} +1 -1
- metadata +3 -4
- data/app/jobs/easy_ml/finalize_feature_job.rb +0 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b4c3878d6cb51daa13de6d41b0480ce8d5f5288266e68866d9e2532de3d372b5
|
4
|
+
data.tar.gz: 62b963738afea40ffa9c00624164c8293b4b6994cd3c21e67162c8e61715c0de
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e8a6d8ee4af5fbac1f45b79e5edee2e8f9f4e3807616d05ab98ea9bc3ae42f08d546fb1a7e14b22338df8961716cf54f54300eb33d0a049582e64b7f54ea8d86
|
7
|
+
data.tar.gz: 61b7efd2451e0189f9f8ebb7149ca7707723bdf6cfa28e232b88fe0b286b86d6607fbbc4f222aaf97a7d175ef0e947041b9dd61ec2ef4bcec8e922561dce6c27
|
data/app/frontend/components/dataset/PreprocessingConfig.tsx
CHANGED
@@ -264,18 +264,18 @@ export function PreprocessingConfig({
|
|
264
264
|
const renderStrategySpecificInfo = (type: 'training' | 'inference') => {
|
265
265
|
const strategy = type === 'training' ? training : inference;
|
266
266
|
let content;
|
267
|
-
if (strategy.method === 'most_frequent' && column.statistics?.raw.most_frequent_value) {
|
267
|
+
if (strategy.method === 'most_frequent' && column.statistics?.raw.most_frequent_value !== undefined) {
|
268
268
|
content = `Most Frequent Value: ${column.statistics.raw.most_frequent_value}`
|
269
269
|
} else if (strategy.method === 'ffill') {
|
270
270
|
const lastValue = column.statistics?.raw.last_value;
|
271
|
-
if (lastValue !== undefined
|
271
|
+
if (lastValue !== undefined) {
|
272
272
|
content = `Forward Fill using Last Value: ${lastValue}`;
|
273
273
|
} else {
|
274
274
|
content = 'Set date column & apply preprocessing to see last value';
|
275
275
|
}
|
276
|
-
} else if (strategy.method === 'median' && column.statistics?.raw?.median !== undefined
|
276
|
+
} else if (strategy.method === 'median' && column.statistics?.raw?.median !== undefined) {
|
277
277
|
content = `Median: ${column.statistics.raw.median}`
|
278
|
-
} else if (strategy.method === 'mean' && column.statistics?.raw?.mean !== undefined
|
278
|
+
} else if (strategy.method === 'mean' && column.statistics?.raw?.mean !== undefined) {
|
279
279
|
content = `Mean: ${column.statistics.raw.mean}`
|
280
280
|
} else {
|
281
281
|
return null;
|
data/app/jobs/easy_ml/compute_feature_job.rb
CHANGED
@@ -5,40 +5,23 @@ module EasyML
|
|
5
5
|
@queue = :easy_ml
|
6
6
|
|
7
7
|
def self.perform(batch_id, batch_args = {})
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
rescue => e
|
20
|
-
EasyML::Feature.transaction do
|
21
|
-
return if dataset.reload.workflow_status == :failed
|
22
|
-
|
23
|
-
feature.update(workflow_status: :failed)
|
24
|
-
dataset.update(workflow_status: :failed)
|
25
|
-
build_error_with_context(dataset, e, batch_id, feature)
|
26
|
-
end
|
27
|
-
end
|
8
|
+
# This is very, very, very, very, very important
|
9
|
+
# if you don't dup the batch_args, resque-batched-job will
|
10
|
+
# fail in some non-obvious ways, because it will try to
|
11
|
+
# decode to match the original batch args EXACTLY.
|
12
|
+
#
|
13
|
+
# This will waste your time so please just don't remove this .dup!!!
|
14
|
+
#
|
15
|
+
# https://github.com/drfeelngood/resque-batched-job/blob/master/lib/resque/plugins/batched_job.rb#L86
|
16
|
+
batch_args = batch_args.dup
|
17
|
+
puts "Running batch #{batch_id} with args #{batch_args}"
|
18
|
+
run_one_batch(batch_id, batch_args)
|
28
19
|
end
|
29
20
|
|
30
21
|
def self.run_one_batch(batch_id, batch_args)
|
31
22
|
EasyML::Feature.fit_one_batch(batch_id, batch_args)
|
32
23
|
end
|
33
24
|
|
34
|
-
def self.build_error_with_context(dataset, error, batch_id, feature)
|
35
|
-
error = EasyML::Event.handle_error(dataset, error)
|
36
|
-
batch = feature.build_batch(batch_id: batch_id)
|
37
|
-
|
38
|
-
# Convert any dataframes in the context to serialized form
|
39
|
-
error.create_context(context: batch)
|
40
|
-
end
|
41
|
-
|
42
25
|
def self.after_batch_hook(batch_id, *args)
|
43
26
|
batch_args = fetch_batch_arguments(batch_id).flatten.map(&:symbolize_keys)
|
44
27
|
feature_ids = batch_args.pluck(:feature_id).uniq
|
data/app/models/easy_ml/column.rb
CHANGED
@@ -43,6 +43,7 @@ module EasyML
|
|
43
43
|
before_save :set_defaults
|
44
44
|
before_save :set_feature_lineage
|
45
45
|
before_save :set_polars_datatype
|
46
|
+
after_find :ensure_feature_exists
|
46
47
|
|
47
48
|
# Scopes
|
48
49
|
scope :visible, -> { where(hidden: false) }
|
@@ -108,6 +109,13 @@ module EasyML
|
|
108
109
|
}
|
109
110
|
scope :is_learning, -> { where(is_learning: true) }
|
110
111
|
|
112
|
+
def ensure_feature_exists
|
113
|
+
if feature && !feature.has_code?
|
114
|
+
feature.destroy
|
115
|
+
update(feature_id: nil)
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
111
119
|
def display_attributes
|
112
120
|
attributes.except(:statistics)
|
113
121
|
end
|
@@ -282,9 +290,9 @@ module EasyML
|
|
282
290
|
end
|
283
291
|
end
|
284
292
|
|
285
|
-
alias_method :feature, :computing_feature
|
286
|
-
|
287
293
|
def set_feature_lineage
|
294
|
+
return if dataset.nil?
|
295
|
+
|
288
296
|
if dataset.features.computed_column_names.include?(name)
|
289
297
|
if computed_by.nil?
|
290
298
|
assign_attributes(
|
data/app/models/easy_ml/dataset.rb
CHANGED
@@ -112,7 +112,7 @@ module EasyML
|
|
112
112
|
end
|
113
113
|
|
114
114
|
def schema
|
115
|
-
read_attribute(:schema) || datasource.schema
|
115
|
+
read_attribute(:schema) || datasource.schema || datasource.after_sync.schema
|
116
116
|
end
|
117
117
|
|
118
118
|
def processed_schema
|
@@ -186,9 +186,12 @@ module EasyML
|
|
186
186
|
|
187
187
|
def actually_refresh
|
188
188
|
refreshing do
|
189
|
+
puts "actually_refresh"
|
189
190
|
learn(delete: false) # After syncing datasource, learn new statistics + sync columns
|
190
191
|
process_data
|
192
|
+
puts "process_data"
|
191
193
|
fully_reload
|
194
|
+
puts "Learning..."
|
192
195
|
learn
|
193
196
|
learn_statistics(type: :processed) # After processing data, we learn any new statistics
|
194
197
|
now = UTC.now
|
@@ -208,7 +211,9 @@ module EasyML
|
|
208
211
|
return refresh_async if async
|
209
212
|
|
210
213
|
refreshing do
|
214
|
+
puts "prepare.."
|
211
215
|
prepare
|
216
|
+
puts "fit features..."
|
212
217
|
fit_features(async: async)
|
213
218
|
end
|
214
219
|
end
|
@@ -219,13 +224,13 @@ module EasyML
|
|
219
224
|
|
220
225
|
def fit_features(async: false, features: self.features, force: false)
|
221
226
|
features_to_compute = force ? features : features.needs_fit
|
222
|
-
return if features_to_compute.empty?
|
227
|
+
return after_fit_features if features_to_compute.empty?
|
223
228
|
|
224
229
|
features.first.fit(features: features_to_compute, async: async)
|
225
230
|
end
|
226
231
|
|
227
232
|
def after_fit_features
|
228
|
-
puts "
|
233
|
+
puts "after fit features..."
|
229
234
|
unlock!
|
230
235
|
reload
|
231
236
|
return if failed?
|
data/app/models/easy_ml/datasource.rb
CHANGED
@@ -122,10 +122,12 @@ module EasyML
|
|
122
122
|
self.refreshed_at = Time.now
|
123
123
|
self.sha = adapter.sha
|
124
124
|
save
|
125
|
+
self.schema
|
125
126
|
end
|
126
127
|
|
127
128
|
def refresh
|
128
129
|
unless adapter.needs_refresh?
|
130
|
+
after_sync if schema.nil?
|
129
131
|
update(sha: adapter.sha) if sha.nil?
|
130
132
|
update!(is_syncing: false)
|
131
133
|
return
|
data/app/models/easy_ml/feature.rb
CHANGED
@@ -82,7 +82,7 @@ module EasyML
|
|
82
82
|
where(id: fittable.map(&:id))
|
83
83
|
end
|
84
84
|
scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
|
85
|
-
scope :ready_to_apply, -> { where.not(id:
|
85
|
+
scope :ready_to_apply, -> { where(needs_fit: false).where.not(id: has_changes.map(&:id)) }
|
86
86
|
|
87
87
|
before_save :apply_defaults, if: :new_record?
|
88
88
|
before_save :update_sha
|
@@ -95,6 +95,10 @@ module EasyML
|
|
95
95
|
raise InvalidFeatureError, "Invalid feature class: #{feature_class}"
|
96
96
|
end
|
97
97
|
|
98
|
+
def has_code?
|
99
|
+
feature_klass.present?
|
100
|
+
end
|
101
|
+
|
98
102
|
def adapter
|
99
103
|
@adapter ||= feature_klass.new
|
100
104
|
end
|
@@ -250,18 +254,31 @@ module EasyML
|
|
250
254
|
dataset = feature.dataset
|
251
255
|
|
252
256
|
# Check if any feature has failed before proceeding
|
253
|
-
if dataset.features.any? { |f| f.workflow_status == "failed" }
|
254
|
-
|
255
|
-
end
|
257
|
+
return if dataset.features.any? { |f| f.workflow_status == "failed" }
|
258
|
+
|
256
259
|
feature.update(workflow_status: :analyzing) if feature.workflow_status == :ready
|
257
260
|
begin
|
258
261
|
feature.fit_batch(batch_args.merge!(batch_id: batch_id))
|
259
262
|
rescue => e
|
260
|
-
EasyML::Feature.
|
263
|
+
EasyML::Feature.transaction do
|
264
|
+
return if dataset.reload.workflow_status == :failed
|
265
|
+
|
266
|
+
feature.update(workflow_status: :failed)
|
267
|
+
dataset.update(workflow_status: :failed)
|
268
|
+
build_error_with_context(dataset, e, batch_id, feature)
|
269
|
+
end
|
261
270
|
raise e
|
262
271
|
end
|
263
272
|
end
|
264
273
|
|
274
|
+
def self.build_error_with_context(dataset, error, batch_id, feature)
|
275
|
+
error = EasyML::Event.handle_error(dataset, error)
|
276
|
+
batch = feature.build_batch(batch_id: batch_id)
|
277
|
+
|
278
|
+
# Convert any dataframes in the context to serialized form
|
279
|
+
error.create_context(context: batch)
|
280
|
+
end
|
281
|
+
|
265
282
|
def self.fit_feature_failed(dataset, e)
|
266
283
|
dataset.update(workflow_status: :failed)
|
267
284
|
EasyML::Event.handle_error(dataset, e)
|
@@ -447,6 +464,8 @@ module EasyML
|
|
447
464
|
end
|
448
465
|
|
449
466
|
def after_fit
|
467
|
+
update_sha
|
468
|
+
|
450
469
|
updates = {
|
451
470
|
fit_at: Time.current,
|
452
471
|
needs_fit: false,
|
@@ -508,7 +527,11 @@ module EasyML
|
|
508
527
|
end
|
509
528
|
|
510
529
|
def feature_klass
|
511
|
-
|
530
|
+
begin
|
531
|
+
@feature_klass ||= EasyML::Features::Registry.find(feature_class.to_s).dig(:feature_class).constantize
|
532
|
+
rescue => e
|
533
|
+
nil
|
534
|
+
end
|
512
535
|
end
|
513
536
|
|
514
537
|
def config
|
data/lib/easy_ml/feature_store.rb
CHANGED
@@ -15,10 +15,10 @@ module EasyML
|
|
15
15
|
max_key = df[primary_key].max
|
16
16
|
batch_size = feature.batch_size || 10_000
|
17
17
|
|
18
|
-
# Try to parse as integers if they're strings
|
19
18
|
begin
|
20
|
-
|
21
|
-
|
19
|
+
# We are intentionally not using to_i, so it will raise an error for keys like "A1"
|
20
|
+
min_key = Integer(min_key) if min_key.is_a?(String)
|
21
|
+
max_key = Integer(max_key) if max_key.is_a?(String)
|
22
22
|
rescue ArgumentError
|
23
23
|
return store_without_partitioning(df)
|
24
24
|
end
|
data/lib/easy_ml/version.rb
CHANGED