easy_ml 0.2.0.pre.rc91 → 0.2.0.pre.rc93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/jobs/easy_ml/refresh_dataset_job.rb +7 -11
- data/app/models/easy_ml/column.rb +7 -7
- data/app/models/easy_ml/column_list.rb +3 -6
- data/app/models/easy_ml/dataset/learner.rb +22 -11
- data/app/models/easy_ml/dataset.rb +20 -7
- data/app/models/easy_ml/event.rb +18 -3
- data/app/models/easy_ml/feature.rb +6 -7
- data/app/models/easy_ml/lineage.rb +3 -3
- data/app/models/easy_ml/model.rb +9 -7
- data/app/models/easy_ml/pca_model.rb +4 -1
- data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_dataset_names.rb.tt +0 -5
- data/lib/easy_ml/timing.rb +1 -3
- data/lib/easy_ml/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f402f4f0d35dd702c8e97c841f22681626cff3c1ac030991cde884788b31833b
|
4
|
+
data.tar.gz: 1666649bca8f2ab7bae0ec7a5faa3a4561f85db5b22ffdda92e472f7661fab85
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fa77810edaf759b75240f57a8f6dcc99a54043326ba92ff95c6df6732bd614bb932a5558722d690544367b21c929755b249ff8ede56581bbb6c7abe37688db63
|
7
|
+
data.tar.gz: 7df62d34d6fd6dcd83eef86ab27dea0581bcbbe5c384c9fae00399661a62a882d3f163807171128d0efcc1026cd887789f961d9ce9e99ecaace8f91432f9193b
|
data/app/jobs/easy_ml/refresh_dataset_job.rb
CHANGED
@@ -14,18 +14,14 @@ module EasyML
|
|
14
14
|
create_event(dataset, "started")
|
15
15
|
|
16
16
|
dataset.unlock!
|
17
|
-
dataset.
|
18
|
-
|
19
|
-
dataset.
|
20
|
-
|
21
|
-
|
17
|
+
dataset.refreshing do
|
18
|
+
dataset.prepare
|
19
|
+
if dataset.features.needs_fit.any?
|
20
|
+
dataset.fit_features(async: true)
|
21
|
+
else
|
22
|
+
dataset.after_fit_features
|
23
|
+
end
|
22
24
|
end
|
23
|
-
rescue StandardError => e
|
24
|
-
if Rails.env.test?
|
25
|
-
raise e
|
26
|
-
end
|
27
|
-
dataset.update(workflow_status: :failed)
|
28
|
-
EasyML::Event.handle_error(dataset, e)
|
29
25
|
end
|
30
26
|
end
|
31
27
|
end
|
data/app/models/easy_ml/column.rb
CHANGED
@@ -515,17 +515,17 @@ module EasyML
|
|
515
515
|
end
|
516
516
|
|
517
517
|
def cast_statement(df, df_col, expected_dtype)
|
518
|
-
expected_dtype = expected_dtype.is_a?(Polars::DataType) ? expected_dtype : expected_dtype
|
518
|
+
expected_dtype = expected_dtype.is_a?(Polars::DataType) ? expected_dtype.class : expected_dtype
|
519
519
|
actual_type = df[df_col].dtype
|
520
520
|
|
521
|
-
cast_statement = case expected_dtype
|
522
|
-
when Polars::Boolean
|
523
|
-
case actual_type
|
524
|
-
when Polars::Boolean
|
521
|
+
cast_statement = case expected_dtype.to_s
|
522
|
+
when "Polars::Boolean"
|
523
|
+
case actual_type.to_s
|
524
|
+
when "Polars::Boolean"
|
525
525
|
Polars.col(df_col).cast(expected_dtype)
|
526
|
-
when Polars::
|
526
|
+
when "Polars::Utf8", "Polars::Categorical", "Polars::String"
|
527
527
|
Polars.col(df_col).eq("true").cast(expected_dtype)
|
528
|
-
when Polars::Null
|
528
|
+
when "Polars::Null"
|
529
529
|
Polars.col(df_col)
|
530
530
|
else
|
531
531
|
raise "Unexpected dtype: #{actual_type} for column: #{df_col}"
|
data/app/models/easy_ml/column_list.rb
CHANGED
@@ -104,11 +104,7 @@ module EasyML
|
|
104
104
|
expected_dtype = schema[df_col.to_sym]
|
105
105
|
db_col.cast_statement(df, df_col, expected_dtype)
|
106
106
|
end
|
107
|
-
|
108
|
-
df = df.with_columns(cast_statements)
|
109
|
-
rescue => e
|
110
|
-
binding.pry
|
111
|
-
end
|
107
|
+
df = df.with_columns(cast_statements)
|
112
108
|
end
|
113
109
|
|
114
110
|
def cast(processed_or_raw)
|
@@ -165,7 +161,8 @@ module EasyML
|
|
165
161
|
end
|
166
162
|
EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })
|
167
163
|
|
168
|
-
|
164
|
+
cols = where(id: cols_to_learn.map(&:id)).includes(:lineages, :feature)
|
165
|
+
lineage = cols.flat_map do |col|
|
169
166
|
EasyML::Lineage.learn(col)
|
170
167
|
end.compact
|
171
168
|
EasyML::Lineage.import(lineage, on_duplicate_key_update: { columns: %i[ column_id key occurred_at description ] })
|
data/app/models/easy_ml/dataset/learner.rb
CHANGED
@@ -2,18 +2,15 @@ module EasyML
|
|
2
2
|
class Dataset
|
3
3
|
class Learner
|
4
4
|
include EasyML::Timing
|
5
|
-
attr_accessor :dataset, :columns, :type, :computed, :raw_columns, :statistics
|
5
|
+
attr_accessor :dataset, :columns, :all_columns, :type, :computed, :raw_columns, :statistics
|
6
6
|
|
7
7
|
def initialize(dataset, type: :raw)
|
8
8
|
@dataset = dataset
|
9
|
-
@columns = dataset.columns.reload.needs_learn.sort_by(&:name)
|
10
|
-
|
11
|
-
if computed
|
12
|
-
@columns = @columns.computed
|
13
|
-
end
|
14
|
-
|
15
|
-
@columns = @columns.select(&:persisted?).reject(&:empty?)
|
9
|
+
@columns = dataset.columns.reload.needs_learn.includes(:feature).sort_by(&:name)
|
16
10
|
@type = type
|
11
|
+
@all_columns = @columns.dup
|
12
|
+
@columns = @columns.select(&:persisted?)
|
13
|
+
@columns = @columns.select { |c| available_columns.include?(c.name) }
|
17
14
|
end
|
18
15
|
|
19
16
|
def learn
|
@@ -23,6 +20,10 @@ module EasyML
|
|
23
20
|
save_statistics
|
24
21
|
end
|
25
22
|
|
23
|
+
def available_columns
|
24
|
+
@available_columns ||= dataset.send(type).data(lazy: true).schema.keys & columns.map(&:name)
|
25
|
+
end
|
26
|
+
|
26
27
|
private
|
27
28
|
|
28
29
|
def fit_models
|
@@ -35,10 +36,20 @@ module EasyML
|
|
35
36
|
end
|
36
37
|
end
|
37
38
|
|
39
|
+
def get_sample_values
|
40
|
+
needs_sample = EasyML::Column.where(id: columns.map(&:id)).where(sample_values: nil).map(&:name)
|
41
|
+
sampleable_cols = available_columns & needs_sample
|
42
|
+
selects = sampleable_cols.map do |col|
|
43
|
+
Polars.col(col).filter(Polars.col(col).is_not_null).limit(5).alias(col)
|
44
|
+
end
|
45
|
+
df = dataset.send(type).train(all_columns: true, lazy: true).select(selects).collect.to_h.transform_values(&:to_a)
|
46
|
+
end
|
47
|
+
|
38
48
|
def save_statistics
|
39
|
-
|
49
|
+
samples = get_sample_values
|
50
|
+
all_columns.each do |col|
|
40
51
|
col.merge_statistics(statistics.dig(col.name))
|
41
|
-
col.
|
52
|
+
col.assign_attributes(sample_values: samples[col.name]) if samples[col.name].present?
|
42
53
|
col.assign_attributes(
|
43
54
|
learned_at: EasyML::Support::UTC.now,
|
44
55
|
last_datasource_sha: col.dataset.last_datasource_sha,
|
@@ -47,7 +58,7 @@ module EasyML
|
|
47
58
|
)
|
48
59
|
end
|
49
60
|
|
50
|
-
EasyML::Column.import(
|
61
|
+
EasyML::Column.import(all_columns, on_duplicate_key_update: { columns: %i[
|
51
62
|
statistics
|
52
63
|
learned_at
|
53
64
|
sample_values
|
data/app/models/easy_ml/dataset.rb
CHANGED
@@ -362,15 +362,13 @@ module EasyML
|
|
362
362
|
update(workflow_status: "analyzing")
|
363
363
|
fully_reload
|
364
364
|
yield
|
365
|
-
ensure
|
366
|
-
unlock!
|
367
365
|
end
|
368
366
|
rescue => e
|
369
367
|
update(workflow_status: "failed")
|
370
|
-
|
371
|
-
puts line
|
372
|
-
end
|
368
|
+
EasyML::Event.handle_error(self, e)
|
373
369
|
raise e
|
370
|
+
ensure
|
371
|
+
unlock!
|
374
372
|
end
|
375
373
|
end
|
376
374
|
|
@@ -797,8 +795,7 @@ module EasyML
|
|
797
795
|
df = df.clone
|
798
796
|
df = apply_features(df)
|
799
797
|
processed.save(:train, df)
|
800
|
-
|
801
|
-
learn_statistics(type: :processed, computed: true)
|
798
|
+
learn_statistics(type: :processed)
|
802
799
|
processed.cleanup
|
803
800
|
end
|
804
801
|
|
@@ -900,5 +897,21 @@ module EasyML
|
|
900
897
|
def underscored_name
|
901
898
|
name.gsub(/\s{2,}/, " ").gsub(/\s/, "_").downcase
|
902
899
|
end
|
900
|
+
|
901
|
+
TIME_METHODS = %w(
|
902
|
+
refresh
|
903
|
+
prepare_features
|
904
|
+
refresh_datasource
|
905
|
+
split_data
|
906
|
+
fit
|
907
|
+
normalize_all
|
908
|
+
learn
|
909
|
+
learn_statistics
|
910
|
+
fit_features
|
911
|
+
)
|
912
|
+
include EasyML::Timing
|
913
|
+
TIME_METHODS.each do |method|
|
914
|
+
measure_method_timing method
|
915
|
+
end
|
903
916
|
end
|
904
917
|
end
|
data/app/models/easy_ml/event.rb
CHANGED
@@ -57,11 +57,26 @@ module EasyML
|
|
57
57
|
end
|
58
58
|
|
59
59
|
def self.easy_ml_context(stacktrace)
|
60
|
-
stacktrace.select
|
60
|
+
stacktrace.select(&MATCH_EASY_ML_CONTEXT)
|
61
|
+
end
|
62
|
+
|
63
|
+
MATCH_USER_CONTEXT = proc { |loc| !loc.match?(/features|evaluators/) && !loc.match?(/easy_ml/) }
|
64
|
+
MATCH_EASY_ML_CONTEXT = proc { |loc| loc.match?(/easy_ml/) }
|
65
|
+
|
66
|
+
def self.user_context?(stacktrace)
|
67
|
+
stacktrace.any?(&MATCH_USER_CONTEXT)
|
68
|
+
end
|
69
|
+
|
70
|
+
def self.get_context(stacktrace)
|
71
|
+
if user_context?(stacktrace)
|
72
|
+
stacktrace
|
73
|
+
else
|
74
|
+
easy_ml_context(stacktrace)
|
75
|
+
end
|
61
76
|
end
|
62
77
|
|
63
78
|
def self.called_by?(matcher)
|
64
|
-
|
79
|
+
get_context(caller).any? { |line| line.match?(matcher) }
|
65
80
|
end
|
66
81
|
|
67
82
|
def self.format_stacktrace(error)
|
@@ -69,7 +84,7 @@ module EasyML
|
|
69
84
|
|
70
85
|
topline = error.inspect
|
71
86
|
|
72
|
-
stacktrace =
|
87
|
+
stacktrace = get_context(error.backtrace)
|
73
88
|
|
74
89
|
%(#{topline}
|
75
90
|
|
data/app/models/easy_ml/feature.rb
CHANGED
@@ -250,12 +250,14 @@ module EasyML
|
|
250
250
|
if async && job_count > 1
|
251
251
|
EasyML::ComputeFeatureJob.enqueue_ordered_batches(jobs)
|
252
252
|
else
|
253
|
-
|
253
|
+
feature_idx = ordered_features.index_by(&:id)
|
254
|
+
change_set = jobs.map do |feature_batch|
|
254
255
|
feature_batch.each do |batch_args|
|
255
256
|
EasyML::ComputeFeatureJob.perform(nil, batch_args)
|
256
257
|
end
|
257
|
-
feature =
|
258
|
+
feature = feature_idx[feature_batch.first.dig(:feature_id)]
|
258
259
|
feature.after_fit
|
260
|
+
feature
|
259
261
|
end
|
260
262
|
dataset.after_fit_features
|
261
263
|
end
|
@@ -380,11 +382,8 @@ module EasyML
|
|
380
382
|
|
381
383
|
df_len_was = df.shape[0]
|
382
384
|
orig_df = df.clone
|
383
|
-
|
384
|
-
|
385
|
-
rescue => e
|
386
|
-
raise "Feature #{feature_class}#transform failed: #{e.message}"
|
387
|
-
end
|
385
|
+
result = adapter.transform(df, self)
|
386
|
+
|
388
387
|
raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
|
389
388
|
df_len_now = result.shape[0]
|
390
389
|
missing_columns = orig_df.columns - result.columns
|
data/app/models/easy_ml/lineage.rb
CHANGED
@@ -18,8 +18,8 @@ module EasyML
|
|
18
18
|
def learn(column)
|
19
19
|
@lineage = EasyML::Column::Lineage.new(column).lineage
|
20
20
|
|
21
|
-
existing_lineage =
|
22
|
-
missing_lineage = @lineage.select { |l| !existing_lineage.
|
21
|
+
existing_lineage = column.lineages.index_by(&:key)
|
22
|
+
missing_lineage = @lineage.select { |l| !existing_lineage.key?(l[:key].to_s) }
|
23
23
|
|
24
24
|
missing_lineage = missing_lineage.map { |l|
|
25
25
|
EasyML::Lineage.new(
|
@@ -29,7 +29,7 @@ module EasyML
|
|
29
29
|
description: l[:description],
|
30
30
|
)
|
31
31
|
}
|
32
|
-
existing_lineage = existing_lineage.map do |lineage|
|
32
|
+
existing_lineage = existing_lineage.map do |key, lineage|
|
33
33
|
matching_lineage = @lineage.detect { |ll| ll[:key].to_sym == lineage.key.to_sym }
|
34
34
|
|
35
35
|
lineage&.assign_attributes(
|
data/app/models/easy_ml/model.rb
CHANGED
@@ -556,13 +556,15 @@ module EasyML
|
|
556
556
|
self.sha = model_file.sha
|
557
557
|
save
|
558
558
|
dataset.upload_remote_files
|
559
|
-
snapshot
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
559
|
+
model_snapshot = snapshot
|
560
|
+
|
561
|
+
# Prepare the model to be retrained (reset values so they don't conflict with our snapshotted version)
|
562
|
+
bump_version(force: true)
|
563
|
+
dataset.bump_versions(version)
|
564
|
+
self.model_file = new_model_file!
|
565
|
+
save
|
566
|
+
|
567
|
+
model_snapshot
|
566
568
|
end
|
567
569
|
|
568
570
|
def cannot_deploy_reasons
|
data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_dataset_names.rb.tt
CHANGED
@@ -4,10 +4,5 @@ class AddUniqueConstraintToDatasetNames < ActiveRecord::Migration[<%= ActiveReco
|
|
4
4
|
remove_index :easy_ml_datasets, :name
|
5
5
|
end
|
6
6
|
add_index :easy_ml_datasets, :name, unique: true
|
7
|
-
|
8
|
-
if index_exists?(:easy_ml_dataset_histories, :name)
|
9
|
-
remove_index :easy_ml_dataset_histories, :name
|
10
|
-
end
|
11
|
-
add_index :easy_ml_dataset_histories, :name, unique: true
|
12
7
|
end
|
13
8
|
end
|
data/lib/easy_ml/timing.rb
CHANGED
@@ -19,9 +19,7 @@ module EasyML
|
|
19
19
|
result = send(method_alias, *args, **kwargs, &block)
|
20
20
|
ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
21
21
|
elapsed = ending - starting
|
22
|
-
|
23
|
-
puts "#{method_name} took #{elapsed.round(2)} seconds"
|
24
|
-
end
|
22
|
+
puts "#{method_name} took #{elapsed} seconds"
|
25
23
|
# StatsD.measure("#{Rails.env}.#{prefix.present? ? "#{prefix}." : ""}#{method_name}.timing", elapsed)
|
26
24
|
result
|
27
25
|
end
|
data/lib/easy_ml/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: easy_ml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0.pre.rc91
|
4
|
+
version: 0.2.0.pre.rc93
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brett Shollenberger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-03-
|
11
|
+
date: 2025-03-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|