easy_ml 0.2.0.pre.rc56 → 0.2.0.pre.rc58
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/apis_controller.rb +8 -0
- data/app/controllers/easy_ml/models_controller.rb +3 -0
- data/app/controllers/easy_ml/predictions_controller.rb +10 -5
- data/app/frontend/components/ModelForm.tsx +1 -1
- data/app/frontend/components/SearchableSelect.tsx +0 -1
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +1 -1
- data/app/frontend/pages/DatasourcesPage.tsx +0 -2
- data/app/jobs/easy_ml/compute_feature_job.rb +1 -0
- data/app/models/easy_ml/column.rb +55 -4
- data/app/models/easy_ml/column_history.rb +5 -1
- data/app/models/easy_ml/column_list.rb +46 -14
- data/app/models/easy_ml/dataset.rb +47 -27
- data/app/models/easy_ml/datasource.rb +1 -0
- data/app/models/easy_ml/feature.rb +10 -3
- data/app/models/easy_ml/model.rb +30 -6
- data/app/models/easy_ml/model_history.rb +1 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +4 -3
- data/app/models/easy_ml/retraining_run.rb +1 -0
- data/config/initializers/inflections.rb +2 -0
- data/config/routes.rb +3 -0
- data/lib/easy_ml/core/evaluators/base_evaluator.rb +1 -1
- data/lib/easy_ml/core/evaluators/classification_evaluators.rb +9 -9
- data/lib/easy_ml/core/evaluators/regression_evaluators.rb +4 -4
- data/lib/easy_ml/core/model_evaluator.rb +18 -3
- data/lib/easy_ml/core/tuner.rb +23 -17
- data/lib/easy_ml/data/preprocessor.rb +10 -53
- data/lib/easy_ml/data/splits/in_memory_split.rb +4 -0
- data/lib/easy_ml/data/statistics_learner.rb +79 -14
- data/lib/easy_ml/data/synced_directory.rb +4 -2
- data/lib/easy_ml/predict.rb +13 -2
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +3 -0
- data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +14 -0
- data/lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt +6 -0
- data/lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt +20 -0
- data/lib/easy_ml/version.rb +1 -1
- data/public/easy_ml/assets/.vite/manifest.json +1 -1
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-DTZ2348z.js → Application.tsx-DmkdJsDd.js} +34 -34
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-DTZ2348z.js.map → Application.tsx-DmkdJsDd.js.map} +1 -1
- metadata +8 -4
data/app/models/easy_ml/model.rb
CHANGED
@@ -17,6 +17,7 @@
 # is_training :boolean
 # created_at :datetime not null
 # updated_at :datetime not null
+# slug :string not null
 #
 require_relative "models/hyperparameters"

@@ -66,6 +67,7 @@ module EasyML
     after_initialize :bump_version, if: -> { new_record? }
     after_initialize :set_defaults, if: -> { new_record? }
     before_save :save_model_file, if: -> { is_fit? && !is_history_class? && model_changed? && !@skip_save_model_file }
+    before_validation :set_slug, if: :name_changed?

     VALID_TASKS = %i[regression classification].freeze

@@ -91,6 +93,7 @@ module EasyML
     }
     validates :model_type, inclusion: { in: MODEL_NAMES }
     validates :dataset_id, presence: true
+    validates :slug, presence: true, uniqueness: true
     validate :validate_metrics_allowed
     before_save :set_root_dir

@@ -189,6 +192,7 @@ module EasyML
        evaluator: evaluator,
        model: self,
        dataset: dataset,
+       metrics: metrics,
      }.compact
      tuner.merge!(extra_params)
      tuner_instance = EasyML::Core::Tuner.new(tuner)
@@ -307,7 +311,6 @@ module EasyML

      dataset.refresh
      adapter.fit(tuning: tuning, x_train: x_train, y_train: y_train, x_valid: x_valid, y_valid: y_valid, &progress_block)
-     @is_fit = true
    end

    def batch_args
@@ -334,11 +337,8 @@ module EasyML

    def fit_in_batches(tuning: false, batch_size: nil, batch_overlap: nil, batch_key: nil, checkpoint_dir: Rails.root.join("tmp", "xgboost_checkpoints"), &progress_block)
      adapter.fit_in_batches(tuning: tuning, batch_size: batch_size, batch_overlap: batch_overlap, batch_key: batch_key, checkpoint_dir: checkpoint_dir, &progress_block)
-     @is_fit = true
    end

-   attr_accessor :is_fit
-
    def is_fit?
      model_file = get_model_file
      return true if model_file.present? && model_file.fit?
@@ -354,15 +354,16 @@ module EasyML
      dataset.decode_labels(ys, col: col)
    end

-   def evaluate(y_pred: nil, y_true: nil, x_true: nil, evaluator: nil)
+   def evaluate(y_pred: nil, y_true: nil, x_true: nil, evaluator: nil, dataset: nil)
      evaluator ||= self.evaluator
      if y_pred.nil?
        inputs = default_evaluation_inputs
        y_pred = inputs[:y_pred]
        y_true = inputs[:y_true]
        x_true = inputs[:x_true]
+       dataset = inputs[:dataset]
      end
-     EasyML::Core::ModelEvaluator.evaluate(model: self, y_pred: y_pred, y_true: y_true, x_true: x_true, evaluator: evaluator)
+     EasyML::Core::ModelEvaluator.evaluate(model: self, y_pred: y_pred, y_true: y_true, x_true: x_true, dataset: dataset, evaluator: evaluator)
    end

    def evaluator
@@ -446,6 +447,21 @@ module EasyML
      )
    end

+   include Rails.application.routes.mounted_helpers
+
+   def api_fields
+     {
+       url: EasyML::Engine.routes.url_helpers.predictions_path,
+       method: "POST",
+       data: {
+         model: slug,
+         input: dataset.columns.api_inputs.sort_by_required.map(&:to_api).each_with_object({}) do |field, hash|
+           hash[field[:name]] = field.except(:name)
+         end,
+       },
+     }
+   end
+
    class CannotdeployError < StandardError
    end

@@ -524,11 +540,13 @@ module EasyML

    def default_evaluation_inputs
      x_true, y_true = dataset.test(split_ys: true)
+     ds = dataset.test(all_columns: true)
      y_pred = predict(x_true)
      {
        x_true: x_true,
        y_true: y_true,
        y_pred: y_pred,
+       dataset: ds,
      }
    end

@@ -603,6 +621,12 @@ module EasyML
      errors.add(:metrics,
                 "don't know how to handle #{"metrics".pluralize(unknown_metrics)} #{unknown_metrics.join(", ")}, use EasyML::Core::ModelEvaluator.register(:name, Evaluator, :regression|:classification)")
    end
+
+   def set_slug
+     if slug.nil? && name.present?
+       self.slug = name.gsub(/\s/, "_").downcase
+     end
+   end
  end
end

data/app/models/easy_ml/models/xgboost/evals_callback.rb
CHANGED
@@ -32,7 +32,7 @@ module EasyML
      false
    end

-   def
+   def test_dataset
      if tuner.present?
        [tuner.x_true, tuner.y_true]
      else
@@ -46,11 +46,12 @@ module EasyML
        log_frequency = 10
        if epoch % log_frequency == 0
          model.adapter.external_model = booster
-         x_true, y_true =
+         x_true, y_true = test_dataset
          @preprocessed ||= model.preprocess(x_true)
          y_pred = model.predict(@preprocessed)
+         dataset = model.dataset.test(all_columns: true)

-         metrics = model.evaluate(y_pred: y_pred, y_true: y_true, x_true: x_true)
+         metrics = model.evaluate(y_pred: y_pred, y_true: y_true, x_true: x_true, dataset: dataset)
          Wandb.log(metrics)
        end

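For reference, the set_slug callback introduced above derives a URL-safe identifier from the model name by replacing whitespace with underscores and downcasing. A minimal standalone sketch of that derivation, using a made-up name rather than a persisted EasyML::Model:

# Sketch of the slug derivation performed by set_slug (plain Ruby, hypothetical model name).
name = "Churn Predictor V2"
slug = name.gsub(/\s/, "_").downcase
puts slug # => "churn_predictor_v2"

The same slug is what api_fields places under the model key of the prediction payload.
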
data/config/routes.rb
CHANGED
data/lib/easy_ml/core/evaluators/base_evaluator.rb
CHANGED
@@ -32,7 +32,7 @@ module EasyML
      end

      # Instance methods that evaluators must implement
-     def evaluate(y_pred: nil, y_true: nil, x_true: nil)
+     def evaluate(y_pred: nil, y_true: nil, x_true: nil, dataset: nil)
        raise NotImplementedError, "#{self.class} must implement #evaluate"
      end

data/lib/easy_ml/core/evaluators/classification_evaluators.rb
CHANGED
@@ -5,7 +5,7 @@ module EasyML
    class AccuracyScore
      include BaseEvaluator

-     def evaluate(y_pred:, y_true:, x_true: nil)
+     def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
        y_pred = Numo::Int32.cast(y_pred)
        y_true = Numo::Int32.cast(y_true)
        y_pred.eq(y_true).count_true.to_f / y_pred.size
@@ -23,7 +23,7 @@ module EasyML
    class PrecisionScore
      include BaseEvaluator

-     def evaluate(y_pred:, y_true:, x_true: nil)
+     def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
        y_pred = Numo::Int32.cast(y_pred)
        y_true = Numo::Int32.cast(y_true)
        true_positives = (y_pred.eq(1) & y_true.eq(1)).count_true
@@ -45,7 +45,7 @@ module EasyML
    class RecallScore
      include BaseEvaluator

-     def evaluate(y_pred:, y_true:, x_true: nil)
+     def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
        y_pred = Numo::Int32.cast(y_pred)
        y_true = Numo::Int32.cast(y_true)
        true_positives = (y_pred.eq(1) & y_true.eq(1)).count_true
@@ -65,9 +65,9 @@ module EasyML
    class F1Score
      include BaseEvaluator

-     def evaluate(y_pred:, y_true:, x_true: nil)
-       precision = PrecisionScore.new.evaluate(y_pred: y_pred, y_true: y_true)
-       recall = RecallScore.new.evaluate(y_pred: y_pred, y_true: y_true)
+     def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
+       precision = PrecisionScore.new.evaluate(y_pred: y_pred, y_true: y_true, dataset: dataset)
+       recall = RecallScore.new.evaluate(y_pred: y_pred, y_true: y_true, dataset: dataset)
        return 0 unless (precision + recall) > 0

        2 * (precision * recall) / (precision + recall)
@@ -85,7 +85,7 @@ module EasyML
    class AUC
      include BaseEvaluator

-     def evaluate(y_pred:, y_true:, x_true: nil)
+     def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
        y_pred = Numo::DFloat.cast(y_pred)
        y_true = Numo::Int32.cast(y_true)

@@ -132,8 +132,8 @@ module EasyML
    class ROC_AUC
      include BaseEvaluator

-     def evaluate(y_pred:, y_true:, x_true: nil)
-       AUC.new.evaluate(y_pred: y_pred, y_true: y_true)
+     def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
+       AUC.new.evaluate(y_pred: y_pred, y_true: y_true, dataset: dataset)
      end

      def description
data/lib/easy_ml/core/evaluators/regression_evaluators.rb
CHANGED
@@ -5,7 +5,7 @@ module EasyML
    class MeanAbsoluteError
      include BaseEvaluator

-     def evaluate(y_pred:, y_true:, x_true: nil)
+     def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
        (Numo::DFloat.cast(y_pred) - Numo::DFloat.cast(y_true)).abs.mean
      end

@@ -21,7 +21,7 @@ module EasyML
    class MeanSquaredError
      include BaseEvaluator

-     def evaluate(y_pred:, y_true:, x_true: nil)
+     def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
        ((Numo::DFloat.cast(y_pred) - Numo::DFloat.cast(y_true)) ** 2).mean
      end

@@ -37,7 +37,7 @@ module EasyML
    class RootMeanSquaredError
      include BaseEvaluator

-     def evaluate(y_pred:, y_true:, x_true: nil)
+     def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
        Math.sqrt(((Numo::DFloat.cast(y_pred) - Numo::DFloat.cast(y_true)) ** 2).mean)
      end

@@ -61,7 +61,7 @@ module EasyML
        "maximize"
      end

-     def evaluate(y_pred:, y_true:, x_true: nil)
+     def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
        y_true = Numo::DFloat.cast(y_true)
        y_pred = Numo::DFloat.cast(y_pred)

data/lib/easy_ml/core/model_evaluator.rb
CHANGED
@@ -98,13 +98,21 @@ module EasyML
        end
      end

-     def evaluate(model:, y_pred:, y_true:, x_true: nil, evaluator: nil)
+     def evaluate(model:, y_pred:, y_true:, x_true: nil, evaluator: nil, dataset: nil)
        y_pred = normalize_input(y_pred)
        y_true = normalize_input(y_true)
        check_size(y_pred, y_true)

        metrics_results = {}

+       if x_true.nil?
+         x_true = model.dataset.test
+       end
+
+       if dataset.nil?
+         dataset = model.dataset.test(all_columns: true)
+       end
+
        model.metrics.each do |metric|
          evaluator_class = get(metric.to_sym)
          next unless evaluator_class
@@ -115,6 +123,7 @@ module EasyML
            y_pred: y_pred,
            y_true: y_true,
            x_true: x_true,
+           dataset: dataset,
          )
        end

@@ -124,7 +133,7 @@ module EasyML
        raise "Unknown evaluator: #{evaluator}" unless evaluator_class

        evaluator_instance = evaluator_class.new
-       response = evaluator_instance.evaluate(y_pred: y_pred, y_true: y_true, x_true: x_true)
+       response = evaluator_instance.evaluate(y_pred: y_pred, y_true: y_true, x_true: x_true, dataset: dataset)

        if response.is_a?(Hash)
          metrics_results.merge!(response)
@@ -145,6 +154,9 @@ module EasyML
      def normalize_input(input)
        case input
        when Array
+         if input.first.class == TrueClass || input.first.class == FalseClass
+           input = input.map { |value| value ? 1.0 : 0.0 }
+         end
          Numo::DFloat.cast(input)
        when Polars::DataFrame
          if input.columns.count > 1
@@ -152,7 +164,10 @@ module EasyML
          end

          normalize_input(input[input.columns.first])
-       when Polars::Series
+       when Polars::Series
+         if input.dtype == Polars::Boolean
+           input = input.cast(Polars::Int64)
+         end
          Numo::DFloat.cast(input)
        else
          raise ArgumentError, "Don't know how to evaluate model with y_pred type #{input.class}"
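Because the abstract signature in base_evaluator.rb now includes dataset:, a custom evaluator registered through EasyML::Core::ModelEvaluator.register (the mechanism referenced in model.rb's validation message) would need to accept the extra keyword as well. A hedged, standalone sketch of such a class; the metric name, class, and commented register call are illustrative, not part of the gem:

# Hypothetical custom evaluator following the updated keyword signature.
class MedianAbsoluteError
  # include BaseEvaluator   # as the bundled evaluators do, inside the gem's module nesting

  # dataset: is accepted (and ignored here) so the evaluator matches the new calling convention.
  def evaluate(y_pred:, y_true:, x_true: nil, dataset: nil)
    diffs = y_pred.to_a.zip(y_true.to_a).map { |pred, truth| (pred - truth).abs }.sort
    mid = diffs.length / 2
    diffs.length.odd? ? diffs[mid] : (diffs[mid - 1] + diffs[mid]) / 2.0
  end
end

puts MedianAbsoluteError.new.evaluate(y_pred: [1.0, 2.0, 4.0], y_true: [1.0, 3.0, 2.0]) # => 1.0
# Registration would follow the form hinted at by the validation message (names are hypothetical):
# EasyML::Core::ModelEvaluator.register(:median_absolute_error, MedianAbsoluteError, :regression)
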
data/lib/easy_ml/core/tuner.rb
CHANGED
@@ -8,7 +8,7 @@ module EasyML
      :metrics, :objective, :n_trials, :direction, :evaluator,
      :study, :results, :adapter, :tune_started_at, :x_true, :y_true,
      :project_name, :job, :current_run, :trial_enumerator, :progress_block,
-     :tuner_job
+     :tuner_job, :dataset

    def initialize(options = {})
      @model = options[:model]
@@ -77,6 +77,7 @@ module EasyML
      x_true, y_true = model.dataset.test(split_ys: true)
      self.x_true = x_true
      self.y_true = y_true
+     self.dataset = model.dataset.test(all_columns: true)
      adapter.tune_started_at = tune_started_at
      adapter.y_true = y_true
      adapter.x_true = x_true
@@ -96,14 +97,6 @@ module EasyML
        run_metrics = tune_once
        result = calculate_result(run_metrics)
        @results.push(result)
-
-       params = {
-         hyperparameters: model.hyperparameters.to_h,
-         value: result,
-         status: :success,
-       }.compact
-
-       @tuner_run.update!(params)
        @study.tell(@current_trial, result)
      rescue StandardError => e
        @tuner_run.update!(status: :failed, hyperparameters: {})
@@ -138,14 +131,27 @@ module EasyML
      )
      self.current_run = @tuner_run

-     adapter.run_trial(@current_trial) do |model|
-       model.
-
-
-       metrics = model.evaluate(y_pred: y_pred, y_true: y_true, x_true: x_true)
-       puts metrics
-       metrics
+     model = adapter.run_trial(@current_trial) do |model|
+       model.tap do
+         model.fit(tuning: true, &progress_block)
+       end
      end
+
+     y_pred = model.predict(x_true)
+     model.metrics = metrics
+     metrics = model.evaluate(y_pred: y_pred, y_true: y_true, x_true: x_true, dataset: dataset)
+     metric = metrics.symbolize_keys.dig(model.evaluator[:metric].to_sym)
+
+     puts metrics
+
+     params = {
+       hyperparameters: model.hyperparameters.to_h,
+       value: metric,
+       status: :success,
+     }.compact
+
+     @tuner_run.update!(params)
+     metrics
    end

    private
@@ -167,7 +173,7 @@ module EasyML
      end
      raise ArgumentError, "Objectives required for EasyML::Core::Tuner" unless objective.present?

-     self.metrics = EasyML::Model.new(task: task).
+     self.metrics = EasyML::Model.new(task: task).default_metrics if metrics.nil? || metrics.empty?
    end
  end
end
data/lib/easy_ml/data/preprocessor.rb
CHANGED
@@ -90,46 +90,19 @@ module EasyML::Data
      df
    end

-   def
-     preprocessing_steps ||= {}
-     preprocessing_steps.deep_symbolize_keys!
-
-     allowed_categories = {}
-     (preprocessing_steps[:training] || {}).each_key do |col|
-       next unless [
-         preprocessing_steps.dig(:training, col, :params, :ordinal_encoding),
-         preprocessing_steps.dig(:training, col, :params, :one_hot),
-         preprocessing_steps.dig(:training, col, :method).to_sym == :categorical,
-       ].any?
-
-       cat_min = preprocessing_steps.dig(:training, col, :params, :categorical_min) || 1
-       val_counts = df[col].value_counts
-       allowed_categories[col] = val_counts[val_counts["count"] >= cat_min][col].to_a.compact
-     end
-     allowed_categories
-   end
-
-   def fit(df)
+   def fit(df, precomputed_stats = {})
      return if df.nil?
      return if preprocessing_steps.nil? || preprocessing_steps.keys.none?

      preprocessing_steps.deep_symbolize_keys!
      df = apply_clip(df, preprocessing_steps)
-     allowed_categories = learn_categorical_min(df, preprocessing_steps)
-
-     self.statistics = StatisticsLearner.learn_df(df, dataset: dataset).deep_symbolize_keys

-
-
-
-       statistics[col][:allowed_categories] = categories
-       statistics[col].merge!(
-         fit_categorical(df[col], preprocessing_steps)
-       )
-     end
+     self.statistics = StatisticsLearner.learn_df(df, dataset: dataset, type: :raw).deep_symbolize_keys.merge!(
+       precomputed_stats
+     ).deep_symbolize_keys
    end

-   def postprocess(df, inference: false)
+   def postprocess(df, inference: false, computed: false)
      puts "Postprocessing..." if verbose
      return df if preprocessing_steps.nil? || preprocessing_steps.keys.none?

@@ -139,6 +112,11 @@ module EasyML::Data
        preprocessing_steps[:training]
      end

+     if computed
+       computed_cols = dataset.columns.computed.map(&:name).map(&:to_sym)
+       steps = steps.deep_dup.slice(*computed_cols)
+     end
+
      df = apply_transformations(df, steps)

      puts "Postprocessing complete." if @verbose
@@ -260,27 +238,6 @@ module EasyML::Data
      )
    end

-   def fit_categorical(series, _preprocessing_steps)
-     value_counts = series.value_counts
-     column_names = value_counts.columns
-     value_column = column_names[0]
-     count_column = column_names[1]
-
-     as_hash = value_counts.select([value_column, count_column]).rows.to_a.to_h.transform_keys(&:to_s)
-     label_encoder = as_hash.keys.sort.each.with_index.reduce({}) do |h, (k, i)|
-       h.tap do
-         h[k] = i
-       end
-     end
-     label_decoder = label_encoder.invert
-
-     {
-       value: as_hash,
-       label_encoder: label_encoder,
-       label_decoder: label_decoder,
-     }
-   end
-
    def prepare_for_imputation(df, col)
      df = df.with_column(Polars.col(col).cast(Polars::Float64))
      df.with_column(Polars.when(Polars.col(col).is_null).then(Float::NAN).otherwise(Polars.col(col)).alias(col))
data/lib/easy_ml/data/statistics_learner.rb
CHANGED
@@ -9,15 +9,16 @@ module EasyML::Data
      @verbose = options[:verbose]
    end

-   def self.learn(df, dataset
-     new(df, dataset).learn
+   def self.learn(df, dataset, type)
+     new(df, dataset, type).learn
    end

-   attr_reader :df, :dataset
+   attr_reader :df, :dataset, :type

-   def initialize(df, dataset)
+   def initialize(df, dataset, type)
      @df = df
      @dataset = dataset
+     @type = type.to_sym
    end

    def learn
@@ -27,18 +28,73 @@ module EasyML::Data
    def learn_split(split)
      df = split.read(:all)
      train_df = split.read(:train)
-     all_stats = learn_df(df
-     train_stats = learn_df(train_df
+     all_stats = learn_df(df)
+     train_stats = learn_df(train_df)

      all_stats.reduce({}) do |output, (k, _)|
        output.tap do
          output[k] = all_stats[k].slice(:num_rows, :null_count, :unique_count, :counts).merge!(
-           train_stats[k].slice(:mean, :median, :min, :max, :std,
+           train_stats[k].slice(:mean, :median, :min, :max, :std,
+                                :last_value, :most_frequent_value, :last_known_value,
+                                :allowed_categories, :label_encoder, :label_decoder)
          )
        end
      end
    end

+   def learn_categorical(df)
+     allowed_categories = learn_allowed_categories(df)
+     allowed_categories.reduce({}) do |statistics, (col, categories)|
+       statistics.tap do
+         statistics[col] ||= {}
+         statistics[col][:allowed_categories] = categories
+         statistics[col].merge!(
+           learn_categorical_encoder_decoder(df[col])
+         )
+       end
+     end
+   end
+
+   def learn_categorical_encoder_decoder(series)
+     value_counts = series.value_counts
+     column_names = value_counts.columns
+     value_column = column_names[0]
+     count_column = column_names[1]
+
+     as_hash = value_counts.select([value_column, count_column]).rows.to_a.to_h.transform_keys(&:to_s)
+     label_encoder = as_hash.keys.sort.each.with_index.reduce({}) do |h, (k, i)|
+       h.tap do
+         h[k] = i
+       end
+     end
+     label_decoder = label_encoder.invert
+
+     {
+       value: as_hash,
+       label_encoder: label_encoder,
+       label_decoder: label_decoder,
+     }
+   end
+
+   def learn_allowed_categories(df)
+     preprocessing_steps = dataset.preprocessing_steps || {}
+     preprocessing_steps.deep_symbolize_keys!
+
+     allowed_categories = {}
+     (preprocessing_steps[:training] || {}).each_key do |col|
+       next unless [
+         preprocessing_steps.dig(:training, col, :params, :ordinal_encoding),
+         preprocessing_steps.dig(:training, col, :params, :one_hot),
+         preprocessing_steps.dig(:training, col, :method).to_sym == :categorical,
+       ].any?
+
+       cat_min = preprocessing_steps.dig(:training, col, :params, :categorical_min) || 1
+       val_counts = df[col].value_counts
+       allowed_categories[col] = val_counts[val_counts["count"] >= cat_min][col].to_a.compact
+     end
+     allowed_categories
+   end
+
    def last_known_value(df, col, date_col)
      return nil if df.empty? || !df.columns.include?(date_col)

@@ -53,13 +109,22 @@ module EasyML::Data
      last_value
    end

-   def learn_df(df
-
+   def learn_df(df)
+     return if df.nil?
+
+     stats = learn_base_stats(df, dataset: dataset).stringify_keys
+     if type == :raw
+       categorical = learn_categorical(df).stringify_keys
+       categorical.each { |k, v| stats[k].merge!(v) }
+     end
+     stats
    end

-   def self.learn_df(df, dataset: nil)
-
+   def self.learn_df(df, dataset: nil, type: :raw)
+     new(df, dataset, type).learn_df(df)
+   end

+   def learn_base_stats(df, dataset: nil)
      base_stats = describe_to_h(df).deep_symbolize_keys

      # Add basic column statistics first
@@ -103,16 +168,16 @@ module EasyML::Data
      end
    end

-   def
+   def id_column?(column)
      col = column.to_s.downcase
      col.match?(/^id$/) || col.match?(/.*_id/)
    end

-   def
+   def last_value(df, col, date_col)
      df.filter(Polars.col(col).is_not_null).sort(date_col)[col][-1]
    end

-   def
+   def describe_to_h(df)
      init_h = df.describe.to_h
      rows = init_h.values.map(&:to_a)
      keys = rows.first
data/lib/easy_ml/data/synced_directory.rb
CHANGED
@@ -127,8 +127,10 @@ module EasyML
      )

      Rails.logger.info("Downloaded #{object.key} to #{local_file_path}")
-
-
+     if object.key.end_with?(".gz")
+       ungzipped_file_path = ungzip_file(local_file_path)
+       Rails.logger.info("Ungzipped to #{ungzipped_file_path}")
+     end
    rescue Aws::S3::Errors::ServiceError, Net::OpenTimeout, Net::ReadTimeout, StandardError => e
      Rails.logger.error("Failed to process #{object.key}: #{e.message}")
      raise e
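The encoder/decoder construction that moved from Preprocessor#fit_categorical into StatisticsLearner#learn_categorical_encoder_decoder above assigns consecutive integer codes to the sorted category names and keeps the inverted hash for decoding. A plain-Ruby sketch with a hypothetical counts hash standing in for the Polars value_counts result:

# Hypothetical category counts, as a plain Hash instead of a Polars DataFrame.
as_hash = { "blue" => 12, "green" => 3, "red" => 7 }

# Same construction as learn_categorical_encoder_decoder: sort keys, number them, invert for decoding.
label_encoder = as_hash.keys.sort.each.with_index.reduce({}) do |h, (k, i)|
  h.tap { h[k] = i }
end
label_decoder = label_encoder.invert

p label_encoder # => {"blue"=>0, "green"=>1, "red"=>2}
p label_decoder # => {0=>"blue", 1=>"green", 2=>"red"}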