easy_ml 0.2.0.pre.rc57 → 0.2.0.pre.rc58
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/apis_controller.rb +8 -0
- data/app/controllers/easy_ml/models_controller.rb +3 -0
- data/app/controllers/easy_ml/predictions_controller.rb +10 -5
- data/app/frontend/components/ModelForm.tsx +1 -1
- data/app/frontend/components/SearchableSelect.tsx +0 -1
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +1 -1
- data/app/frontend/pages/DatasourcesPage.tsx +0 -2
- data/app/jobs/easy_ml/compute_feature_job.rb +1 -0
- data/app/models/easy_ml/column.rb +42 -4
- data/app/models/easy_ml/column_history.rb +5 -1
- data/app/models/easy_ml/column_list.rb +43 -11
- data/app/models/easy_ml/dataset.rb +45 -25
- data/app/models/easy_ml/datasource.rb +1 -0
- data/app/models/easy_ml/feature.rb +10 -3
- data/app/models/easy_ml/model.rb +25 -4
- data/app/models/easy_ml/model_history.rb +1 -0
- data/app/models/easy_ml/retraining_run.rb +1 -0
- data/config/initializers/inflections.rb +2 -0
- data/config/routes.rb +3 -0
- data/lib/easy_ml/core/tuner.rb +1 -1
- data/lib/easy_ml/data/preprocessor.rb +10 -53
- data/lib/easy_ml/data/splits/in_memory_split.rb +4 -0
- data/lib/easy_ml/data/statistics_learner.rb +79 -14
- data/lib/easy_ml/data/synced_directory.rb +4 -2
- data/lib/easy_ml/predict.rb +13 -2
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +3 -0
- data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +14 -0
- data/lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt +6 -0
- data/lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt +20 -0
- data/lib/easy_ml/version.rb +1 -1
- data/public/easy_ml/assets/.vite/manifest.json +1 -1
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-DTZ2348z.js → Application.tsx-DmkdJsDd.js} +34 -34
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-DTZ2348z.js.map → Application.tsx-DmkdJsDd.js.map} +1 -1
- metadata +8 -4
data/lib/easy_ml/core/tuner.rb
CHANGED
@@ -173,7 +173,7 @@ module EasyML
|
|
173
173
|
end
|
174
174
|
raise ArgumentError, "Objectives required for EasyML::Core::Tuner" unless objective.present?
|
175
175
|
|
176
|
-
self.metrics = EasyML::Model.new(task: task).
|
176
|
+
self.metrics = EasyML::Model.new(task: task).default_metrics if metrics.nil? || metrics.empty?
|
177
177
|
end
|
178
178
|
end
|
179
179
|
end
|
@@ -90,46 +90,19 @@ module EasyML::Data
|
|
90
90
|
df
|
91
91
|
end
|
92
92
|
|
93
|
-
def learn_categorical_min(df, preprocessing_steps)
|
94
|
-
preprocessing_steps ||= {}
|
95
|
-
preprocessing_steps.deep_symbolize_keys!
|
96
|
-
|
97
|
-
allowed_categories = {}
|
98
|
-
(preprocessing_steps[:training] || {}).each_key do |col|
|
99
|
-
next unless [
|
100
|
-
preprocessing_steps.dig(:training, col, :params, :ordinal_encoding),
|
101
|
-
preprocessing_steps.dig(:training, col, :params, :one_hot),
|
102
|
-
preprocessing_steps.dig(:training, col, :method).to_sym == :categorical,
|
103
|
-
].any?
|
104
|
-
|
105
|
-
cat_min = preprocessing_steps.dig(:training, col, :params, :categorical_min) || 1
|
106
|
-
val_counts = df[col].value_counts
|
107
|
-
allowed_categories[col] = val_counts[val_counts["count"] >= cat_min][col].to_a.compact
|
108
|
-
end
|
109
|
-
allowed_categories
|
110
|
-
end
|
111
|
-
|
112
|
-
def fit(df)
|
93
|
+
def fit(df, precomputed_stats = {})
|
113
94
|
return if df.nil?
|
114
95
|
return if preprocessing_steps.nil? || preprocessing_steps.keys.none?
|
115
96
|
|
116
97
|
preprocessing_steps.deep_symbolize_keys!
|
117
98
|
df = apply_clip(df, preprocessing_steps)
|
118
|
-
allowed_categories = learn_categorical_min(df, preprocessing_steps)
|
119
|
-
|
120
|
-
self.statistics = StatisticsLearner.learn_df(df, dataset: dataset).deep_symbolize_keys
|
121
99
|
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
statistics[col][:allowed_categories] = categories
|
126
|
-
statistics[col].merge!(
|
127
|
-
fit_categorical(df[col], preprocessing_steps)
|
128
|
-
)
|
129
|
-
end
|
100
|
+
self.statistics = StatisticsLearner.learn_df(df, dataset: dataset, type: :raw).deep_symbolize_keys.merge!(
|
101
|
+
precomputed_stats
|
102
|
+
).deep_symbolize_keys
|
130
103
|
end
|
131
104
|
|
132
|
-
def postprocess(df, inference: false)
|
105
|
+
def postprocess(df, inference: false, computed: false)
|
133
106
|
puts "Postprocessing..." if verbose
|
134
107
|
return df if preprocessing_steps.nil? || preprocessing_steps.keys.none?
|
135
108
|
|
@@ -139,6 +112,11 @@ module EasyML::Data
|
|
139
112
|
preprocessing_steps[:training]
|
140
113
|
end
|
141
114
|
|
115
|
+
if computed
|
116
|
+
computed_cols = dataset.columns.computed.map(&:name).map(&:to_sym)
|
117
|
+
steps = steps.deep_dup.slice(*computed_cols)
|
118
|
+
end
|
119
|
+
|
142
120
|
df = apply_transformations(df, steps)
|
143
121
|
|
144
122
|
puts "Postprocessing complete." if @verbose
|
@@ -260,27 +238,6 @@ module EasyML::Data
|
|
260
238
|
)
|
261
239
|
end
|
262
240
|
|
263
|
-
def fit_categorical(series, _preprocessing_steps)
|
264
|
-
value_counts = series.value_counts
|
265
|
-
column_names = value_counts.columns
|
266
|
-
value_column = column_names[0]
|
267
|
-
count_column = column_names[1]
|
268
|
-
|
269
|
-
as_hash = value_counts.select([value_column, count_column]).rows.to_a.to_h.transform_keys(&:to_s)
|
270
|
-
label_encoder = as_hash.keys.sort.each.with_index.reduce({}) do |h, (k, i)|
|
271
|
-
h.tap do
|
272
|
-
h[k] = i
|
273
|
-
end
|
274
|
-
end
|
275
|
-
label_decoder = label_encoder.invert
|
276
|
-
|
277
|
-
{
|
278
|
-
value: as_hash,
|
279
|
-
label_encoder: label_encoder,
|
280
|
-
label_decoder: label_decoder,
|
281
|
-
}
|
282
|
-
end
|
283
|
-
|
284
241
|
def prepare_for_imputation(df, col)
|
285
242
|
df = df.with_column(Polars.col(col).cast(Polars::Float64))
|
286
243
|
df.with_column(Polars.when(Polars.col(col).is_null).then(Float::NAN).otherwise(Polars.col(col)).alias(col))
|
@@ -9,15 +9,16 @@ module EasyML::Data
|
|
9
9
|
@verbose = options[:verbose]
|
10
10
|
end
|
11
11
|
|
12
|
-
def self.learn(df, dataset)
|
13
|
-
new(df, dataset).learn
|
12
|
+
def self.learn(df, dataset, type)
|
13
|
+
new(df, dataset, type).learn
|
14
14
|
end
|
15
15
|
|
16
|
-
attr_reader :df, :dataset
|
16
|
+
attr_reader :df, :dataset, :type
|
17
17
|
|
18
|
-
def initialize(df, dataset)
|
18
|
+
def initialize(df, dataset, type)
|
19
19
|
@df = df
|
20
20
|
@dataset = dataset
|
21
|
+
@type = type.to_sym
|
21
22
|
end
|
22
23
|
|
23
24
|
def learn
|
@@ -27,18 +28,73 @@ module EasyML::Data
|
|
27
28
|
def learn_split(split)
|
28
29
|
df = split.read(:all)
|
29
30
|
train_df = split.read(:train)
|
30
|
-
all_stats = learn_df(df
|
31
|
-
train_stats = learn_df(train_df
|
31
|
+
all_stats = learn_df(df)
|
32
|
+
train_stats = learn_df(train_df)
|
32
33
|
|
33
34
|
all_stats.reduce({}) do |output, (k, _)|
|
34
35
|
output.tap do
|
35
36
|
output[k] = all_stats[k].slice(:num_rows, :null_count, :unique_count, :counts).merge!(
|
36
|
-
train_stats[k].slice(:mean, :median, :min, :max, :std,
|
37
|
+
train_stats[k].slice(:mean, :median, :min, :max, :std,
|
38
|
+
:last_value, :most_frequent_value, :last_known_value,
|
39
|
+
:allowed_categories, :label_encoder, :label_decoder)
|
37
40
|
)
|
38
41
|
end
|
39
42
|
end
|
40
43
|
end
|
41
44
|
|
45
|
+
def learn_categorical(df)
|
46
|
+
allowed_categories = learn_allowed_categories(df)
|
47
|
+
allowed_categories.reduce({}) do |statistics, (col, categories)|
|
48
|
+
statistics.tap do
|
49
|
+
statistics[col] ||= {}
|
50
|
+
statistics[col][:allowed_categories] = categories
|
51
|
+
statistics[col].merge!(
|
52
|
+
learn_categorical_encoder_decoder(df[col])
|
53
|
+
)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def learn_categorical_encoder_decoder(series)
|
59
|
+
value_counts = series.value_counts
|
60
|
+
column_names = value_counts.columns
|
61
|
+
value_column = column_names[0]
|
62
|
+
count_column = column_names[1]
|
63
|
+
|
64
|
+
as_hash = value_counts.select([value_column, count_column]).rows.to_a.to_h.transform_keys(&:to_s)
|
65
|
+
label_encoder = as_hash.keys.sort.each.with_index.reduce({}) do |h, (k, i)|
|
66
|
+
h.tap do
|
67
|
+
h[k] = i
|
68
|
+
end
|
69
|
+
end
|
70
|
+
label_decoder = label_encoder.invert
|
71
|
+
|
72
|
+
{
|
73
|
+
value: as_hash,
|
74
|
+
label_encoder: label_encoder,
|
75
|
+
label_decoder: label_decoder,
|
76
|
+
}
|
77
|
+
end
|
78
|
+
|
79
|
+
def learn_allowed_categories(df)
|
80
|
+
preprocessing_steps = dataset.preprocessing_steps || {}
|
81
|
+
preprocessing_steps.deep_symbolize_keys!
|
82
|
+
|
83
|
+
allowed_categories = {}
|
84
|
+
(preprocessing_steps[:training] || {}).each_key do |col|
|
85
|
+
next unless [
|
86
|
+
preprocessing_steps.dig(:training, col, :params, :ordinal_encoding),
|
87
|
+
preprocessing_steps.dig(:training, col, :params, :one_hot),
|
88
|
+
preprocessing_steps.dig(:training, col, :method).to_sym == :categorical,
|
89
|
+
].any?
|
90
|
+
|
91
|
+
cat_min = preprocessing_steps.dig(:training, col, :params, :categorical_min) || 1
|
92
|
+
val_counts = df[col].value_counts
|
93
|
+
allowed_categories[col] = val_counts[val_counts["count"] >= cat_min][col].to_a.compact
|
94
|
+
end
|
95
|
+
allowed_categories
|
96
|
+
end
|
97
|
+
|
42
98
|
def last_known_value(df, col, date_col)
|
43
99
|
return nil if df.empty? || !df.columns.include?(date_col)
|
44
100
|
|
@@ -53,13 +109,22 @@ module EasyML::Data
|
|
53
109
|
last_value
|
54
110
|
end
|
55
111
|
|
56
|
-
def learn_df(df
|
57
|
-
|
112
|
+
def learn_df(df)
|
113
|
+
return if df.nil?
|
114
|
+
|
115
|
+
stats = learn_base_stats(df, dataset: dataset).stringify_keys
|
116
|
+
if type == :raw
|
117
|
+
categorical = learn_categorical(df).stringify_keys
|
118
|
+
categorical.each { |k, v| stats[k].merge!(v) }
|
119
|
+
end
|
120
|
+
stats
|
58
121
|
end
|
59
122
|
|
60
|
-
def self.learn_df(df, dataset: nil)
|
61
|
-
|
123
|
+
def self.learn_df(df, dataset: nil, type: :raw)
|
124
|
+
new(df, dataset, type).learn_df(df)
|
125
|
+
end
|
62
126
|
|
127
|
+
def learn_base_stats(df, dataset: nil)
|
63
128
|
base_stats = describe_to_h(df).deep_symbolize_keys
|
64
129
|
|
65
130
|
# Add basic column statistics first
|
@@ -103,16 +168,16 @@ module EasyML::Data
|
|
103
168
|
end
|
104
169
|
end
|
105
170
|
|
106
|
-
def
|
171
|
+
def id_column?(column)
|
107
172
|
col = column.to_s.downcase
|
108
173
|
col.match?(/^id$/) || col.match?(/.*_id/)
|
109
174
|
end
|
110
175
|
|
111
|
-
def
|
176
|
+
def last_value(df, col, date_col)
|
112
177
|
df.filter(Polars.col(col).is_not_null).sort(date_col)[col][-1]
|
113
178
|
end
|
114
179
|
|
115
|
-
def
|
180
|
+
def describe_to_h(df)
|
116
181
|
init_h = df.describe.to_h
|
117
182
|
rows = init_h.values.map(&:to_a)
|
118
183
|
keys = rows.first
|
@@ -127,8 +127,10 @@ module EasyML
|
|
127
127
|
)
|
128
128
|
|
129
129
|
Rails.logger.info("Downloaded #{object.key} to #{local_file_path}")
|
130
|
-
|
131
|
-
|
130
|
+
if object.key.end_with?(".gz")
|
131
|
+
ungzipped_file_path = ungzip_file(local_file_path)
|
132
|
+
Rails.logger.info("Ungzipped to #{ungzipped_file_path}")
|
133
|
+
end
|
132
134
|
rescue Aws::S3::Errors::ServiceError, Net::OpenTimeout, Net::ReadTimeout, StandardError => e
|
133
135
|
Rails.logger.error("Failed to process #{object.key}: #{e.message}")
|
134
136
|
raise e
|
data/lib/easy_ml/predict.rb
CHANGED
@@ -10,11 +10,17 @@ module EasyML
|
|
10
10
|
@models = {}
|
11
11
|
end
|
12
12
|
|
13
|
-
def self.
|
13
|
+
def self.normalize_input(df)
|
14
14
|
if df.is_a?(Hash)
|
15
15
|
df = Polars::DataFrame.new(df)
|
16
16
|
end
|
17
|
+
df
|
18
|
+
end
|
19
|
+
|
20
|
+
def self.predict(model_name, df, serialize: false)
|
21
|
+
df = normalize_input(df)
|
17
22
|
raw_input = df.to_hashes
|
23
|
+
|
18
24
|
df = instance.normalize(model_name, df)
|
19
25
|
normalized_input = df.to_hashes
|
20
26
|
preds = instance.predict(model_name, df)
|
@@ -52,6 +58,11 @@ module EasyML
|
|
52
58
|
get_model(model_name).predict(df)
|
53
59
|
end
|
54
60
|
|
61
|
+
def self.validate_input(model_name, df)
|
62
|
+
df = normalize_input(df)
|
63
|
+
instance.get_model(model_name).dataset.validate_input(df)
|
64
|
+
end
|
65
|
+
|
55
66
|
def normalize(model_name, df)
|
56
67
|
get_model(model_name).dataset.normalize(df, inference: true)
|
57
68
|
end
|
@@ -72,7 +83,7 @@ module EasyML
|
|
72
83
|
private
|
73
84
|
|
74
85
|
def load_model(model_name)
|
75
|
-
current_model = EasyML::Model.find_by!(
|
86
|
+
current_model = EasyML::Model.find_by!(slug: model_name).inference_version
|
76
87
|
|
77
88
|
# Load new model if not loaded or different version
|
78
89
|
model_not_loaded = models[model_name].nil?
|
@@ -41,6 +41,9 @@ module EasyML
|
|
41
41
|
add_workflow_status_to_easy_ml_features
|
42
42
|
drop_path_from_easy_ml_model_files
|
43
43
|
add_is_date_column_to_easy_ml_columns
|
44
|
+
add_computed_columns_to_easy_ml_columns
|
45
|
+
add_slug_to_easy_ml_models
|
46
|
+
add_default_to_is_target
|
44
47
|
].freeze
|
45
48
|
|
46
49
|
# Specify the next migration number
|
@@ -0,0 +1,14 @@
|
|
1
|
+
class AddComputedColumnsToEasyMLColumns < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
|
2
|
+
def change
|
3
|
+
add_column :easy_ml_columns, :computed_by, :string
|
4
|
+
add_column :easy_ml_columns, :is_computed, :boolean, default: false
|
5
|
+
|
6
|
+
add_index :easy_ml_columns, :computed_by
|
7
|
+
add_index :easy_ml_columns, :is_computed
|
8
|
+
|
9
|
+
add_column :easy_ml_column_histories, :computed_by, :string
|
10
|
+
add_index :easy_ml_column_histories, :computed_by
|
11
|
+
add_column :easy_ml_column_histories, :is_computed, :boolean, default: false
|
12
|
+
add_index :easy_ml_column_histories, :is_computed
|
13
|
+
end
|
14
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
class AddSlugToEasyMLModels < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
|
2
|
+
def change
|
3
|
+
add_column :easy_ml_models, :slug, :string
|
4
|
+
add_index :easy_ml_models, :slug, unique: true
|
5
|
+
|
6
|
+
reversible do |dir|
|
7
|
+
dir.up do
|
8
|
+
execute <<-SQL
|
9
|
+
UPDATE easy_ml_models
|
10
|
+
SET slug = LOWER(REPLACE(name, ' ', '_'))
|
11
|
+
SQL
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
change_column_null :easy_ml_models, :slug, false
|
16
|
+
|
17
|
+
add_column :easy_ml_model_histories, :slug, :string
|
18
|
+
add_index :easy_ml_model_histories, :slug
|
19
|
+
end
|
20
|
+
end
|
data/lib/easy_ml/version.rb
CHANGED