easy_ml 0.2.0.pre.rc78 → 0.2.0.pre.rc82
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/datasets_controller.rb +3 -3
- data/app/controllers/easy_ml/models_controller.rb +2 -2
- data/app/jobs/easy_ml/training_job.rb +2 -2
- data/app/models/easy_ml/column/imputers/imputer.rb +2 -0
- data/app/models/easy_ml/column_list.rb +2 -3
- data/app/models/easy_ml/dataset.rb +22 -11
- data/app/models/easy_ml/feature.rb +27 -7
- data/app/models/easy_ml/model.rb +11 -6
- data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +21 -5
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +7 -4
- data/app/models/easy_ml/models/xgboost.rb +18 -16
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +16 -3
- data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +0 -17
- data/lib/easy_ml/core/tuner.rb +9 -3
- data/lib/easy_ml/data/dataset_manager/reader/base.rb +12 -0
- data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb +8 -3
- data/lib/easy_ml/data/dataset_manager/reader/file.rb +5 -0
- data/lib/easy_ml/data/dataset_manager/reader.rb +7 -1
- data/lib/easy_ml/data/dataset_manager/writer/base.rb +1 -1
- data/lib/easy_ml/data/dataset_manager.rb +10 -2
- data/lib/easy_ml/data/embeddings/adapters.rb +56 -0
- data/lib/easy_ml/data/embeddings/compression.rb +1 -0
- data/lib/easy_ml/data/embeddings.rb +43 -0
- data/lib/easy_ml/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ce245d6900c4c5c001c0de9982894ccf6b41faef31e8c958dc540ef05fe426e4
|
4
|
+
data.tar.gz: a120f14076a9ff83ca6afb8b0bd651b9ea9ed0185d42f23b82f5de6b2a4de831
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: db2b7292bf07b5122a7949a111c56a25f8848496eaadfc94f101a271054fd4c21d401b4c941f69d7579c8cdc9887e68c4105f11720dba13aa7fd0fcdafb79b81
|
7
|
+
data.tar.gz: f5dc8ffee52fb67b6cda8e4462e540900d528dd9c15ff49faa8ca45f21f9e4968129b996a5018a91801aae431a65b5274d64e69045e5feeeec6f12f09900cda3
|
@@ -23,7 +23,7 @@
|
|
23
23
|
module EasyML
|
24
24
|
class DatasetsController < ApplicationController
|
25
25
|
def index
|
26
|
-
datasets = Dataset.all.order(id: :desc)
|
26
|
+
datasets = Dataset.all.includes(:columns, :datasource).order(id: :desc)
|
27
27
|
|
28
28
|
render inertia: "pages/DatasetsPage", props: {
|
29
29
|
datasets: datasets.map { |dataset| dataset_to_json_small(dataset) },
|
@@ -80,7 +80,7 @@ module EasyML
|
|
80
80
|
if dataset_params[:features_attributes].present?
|
81
81
|
# Clean up any feature IDs that don't exist anymore
|
82
82
|
feature_ids = dataset_params[:features_attributes].map { |attrs| attrs[:id] }.compact
|
83
|
-
existing_feature_ids =
|
83
|
+
existing_feature_ids = dataset.features.where(id: feature_ids).pluck(:id)
|
84
84
|
|
85
85
|
params[:dataset][:features_attributes].each do |attrs|
|
86
86
|
if attrs[:id].present? && !existing_feature_ids.include?(attrs[:id].to_i)
|
@@ -93,7 +93,7 @@ module EasyML
|
|
93
93
|
attrs[:feature_class] if attrs[:id].blank?
|
94
94
|
}.compact
|
95
95
|
|
96
|
-
existing_features =
|
96
|
+
existing_features = dataset.features.where(feature_class: feature_classes)
|
97
97
|
|
98
98
|
# Update params with existing feature IDs
|
99
99
|
existing_features.each do |feature|
|
@@ -41,7 +41,7 @@ module EasyML
|
|
41
41
|
render inertia: "pages/EditModelPage", props: {
|
42
42
|
model: model_to_json(model),
|
43
43
|
datasets: EasyML::Dataset.all.map do |dataset|
|
44
|
-
|
44
|
+
dataset_to_json_small(dataset)
|
45
45
|
end,
|
46
46
|
constants: EasyML::Model.constants,
|
47
47
|
}
|
@@ -167,7 +167,7 @@ module EasyML
|
|
167
167
|
private
|
168
168
|
|
169
169
|
def includes_list
|
170
|
-
[:retraining_runs, :retraining_job, dataset: [:
|
170
|
+
[:retraining_runs, :retraining_job, dataset: [:features, :splitter, columns: [:lineages]]]
|
171
171
|
end
|
172
172
|
|
173
173
|
def model_params
|
@@ -10,13 +10,13 @@ module EasyML
|
|
10
10
|
|
11
11
|
@last_activity = Time.current
|
12
12
|
setup_signal_traps
|
13
|
-
@monitor_thread = start_monitor_thread
|
13
|
+
# @monitor_thread = start_monitor_thread
|
14
14
|
|
15
15
|
@model.actually_train do |iteration_info|
|
16
16
|
@last_activity = Time.current
|
17
17
|
end
|
18
18
|
ensure
|
19
|
-
@monitor_thread&.exit
|
19
|
+
# @monitor_thread&.exit
|
20
20
|
@model.unlock!
|
21
21
|
end
|
22
22
|
|
@@ -28,12 +28,11 @@ module EasyML
|
|
28
28
|
if computed
|
29
29
|
cols = column_list.computed
|
30
30
|
else
|
31
|
-
cols = column_list
|
31
|
+
cols = column_list
|
32
32
|
end
|
33
33
|
|
34
34
|
by_name = cols.index_by(&:name)
|
35
|
-
|
36
|
-
column = by_name[col]
|
35
|
+
cols.each do |column|
|
37
36
|
df = column.transform(df, inference: inference, computed: computed) if column
|
38
37
|
end
|
39
38
|
|
@@ -232,20 +232,20 @@ module EasyML
|
|
232
232
|
cleanup
|
233
233
|
refresh_datasource!
|
234
234
|
split_data
|
235
|
-
|
235
|
+
fit
|
236
236
|
end
|
237
237
|
|
238
238
|
def prepare
|
239
239
|
prepare_features
|
240
240
|
refresh_datasource
|
241
241
|
split_data
|
242
|
-
|
242
|
+
fit
|
243
243
|
end
|
244
244
|
|
245
245
|
def actually_refresh
|
246
246
|
refreshing do
|
247
|
-
|
248
|
-
|
247
|
+
fit
|
248
|
+
normalize_all
|
249
249
|
fully_reload
|
250
250
|
learn
|
251
251
|
learn_statistics(type: :processed) # After processing data, we learn any new statistics
|
@@ -385,6 +385,7 @@ module EasyML
|
|
385
385
|
def unlock!
|
386
386
|
Support::Lockable.unlock!(lock_key)
|
387
387
|
features.each(&:unlock!)
|
388
|
+
true
|
388
389
|
end
|
389
390
|
|
390
391
|
def locked?
|
@@ -427,12 +428,6 @@ module EasyML
|
|
427
428
|
(read_attribute(:statistics) || {}).with_indifferent_access
|
428
429
|
end
|
429
430
|
|
430
|
-
def process_data
|
431
|
-
learn(delete: false)
|
432
|
-
fit
|
433
|
-
normalize_all
|
434
|
-
end
|
435
|
-
|
436
431
|
def needs_learn?
|
437
432
|
return true if columns_need_refresh?
|
438
433
|
|
@@ -483,7 +478,7 @@ module EasyML
|
|
483
478
|
df = apply_missing_columns(df, inference: inference)
|
484
479
|
df = columns.transform(df, inference: inference)
|
485
480
|
df = apply_features(df, features)
|
486
|
-
df = columns.transform(df, inference: inference
|
481
|
+
df = columns.transform(df, inference: inference)
|
487
482
|
df = apply_column_mask(df, inference: inference) unless all_columns
|
488
483
|
df = drop_nulls(df) unless inference
|
489
484
|
df, = processed.split_features_targets(df, true, target) if split_ys
|
@@ -722,6 +717,20 @@ module EasyML
|
|
722
717
|
reload
|
723
718
|
end
|
724
719
|
|
720
|
+
def list_nulls(input = nil, list_raw = false)
|
721
|
+
input = data(lazy: true) if input.nil?
|
722
|
+
|
723
|
+
case input
|
724
|
+
when Polars::DataFrame
|
725
|
+
input = input.lazy
|
726
|
+
when String, Symbol
|
727
|
+
input = input.to_sym
|
728
|
+
input = send(input).data(lazy: true)
|
729
|
+
end
|
730
|
+
col_list = EasyML::Data::DatasetManager.list_nulls(input)
|
731
|
+
list_raw ? col_list : regular_columns(col_list)
|
732
|
+
end
|
733
|
+
|
725
734
|
private
|
726
735
|
|
727
736
|
def apply_date_splitter_config
|
@@ -798,6 +807,7 @@ module EasyML
|
|
798
807
|
processed_df = normalize(df, all_columns: true)
|
799
808
|
processed.save(segment, processed_df)
|
800
809
|
end
|
810
|
+
features.select { |f| !f.fittable? }.each(&:after_transform)
|
801
811
|
@normalized = true
|
802
812
|
end
|
803
813
|
|
@@ -840,6 +850,7 @@ module EasyML
|
|
840
850
|
end
|
841
851
|
|
842
852
|
def fit
|
853
|
+
learn(delete: false)
|
843
854
|
learn_statistics(type: :raw)
|
844
855
|
end
|
845
856
|
|
@@ -78,11 +78,21 @@ module EasyML
|
|
78
78
|
scope :never_applied, -> { where(applied_at: nil) }
|
79
79
|
scope :never_fit, -> do
|
80
80
|
fittable = where(fit_at: nil)
|
81
|
-
fittable = fittable.select
|
81
|
+
fittable = fittable.select(&:fittable?)
|
82
82
|
where(id: fittable.map(&:id))
|
83
83
|
end
|
84
|
-
scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
|
85
|
-
scope :
|
84
|
+
scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit).or(datasource_was_refreshed) }
|
85
|
+
scope :datasource_was_refreshed, -> do
|
86
|
+
where(id: all.select(&:datasource_was_refreshed?).map(&:id))
|
87
|
+
end
|
88
|
+
scope :ready_to_apply, -> do
|
89
|
+
base = where(needs_fit: false).where.not(id: has_changes.map(&:id))
|
90
|
+
doesnt_fit = where_no_fit
|
91
|
+
where(id: base.map(&:id).concat(doesnt_fit.map(&:id)))
|
92
|
+
end
|
93
|
+
|
94
|
+
scope :fittable, -> { all.select(&:fittable?) }
|
95
|
+
scope :where_no_fit, -> { all.reject(&:fittable?) }
|
86
96
|
|
87
97
|
before_save :apply_defaults, if: :new_record?
|
88
98
|
before_save :update_sha
|
@@ -100,6 +110,10 @@ module EasyML
|
|
100
110
|
feature_klass.present?
|
101
111
|
end
|
102
112
|
|
113
|
+
def fittable?
|
114
|
+
adapter.respond_to?(:fit)
|
115
|
+
end
|
116
|
+
|
103
117
|
def adapter
|
104
118
|
@adapter ||= feature_klass.new
|
105
119
|
end
|
@@ -133,6 +147,7 @@ module EasyML
|
|
133
147
|
end
|
134
148
|
|
135
149
|
def datasource_was_refreshed?
|
150
|
+
return false unless fittable?
|
136
151
|
return true if fit_at.nil?
|
137
152
|
return false if dataset.datasource.refreshed_at.nil?
|
138
153
|
|
@@ -213,13 +228,14 @@ module EasyML
|
|
213
228
|
end
|
214
229
|
|
215
230
|
def wipe
|
231
|
+
update(needs_fit: true) if fittable?
|
216
232
|
feature_store.wipe
|
217
233
|
end
|
218
234
|
|
219
235
|
def fit(features: [self], async: false)
|
220
236
|
ordered_features = features.sort_by(&:feature_position)
|
221
237
|
parent_batch_id = Random.uuid
|
222
|
-
jobs = ordered_features.map do |feature|
|
238
|
+
jobs = ordered_features.select(&:fittable?).map do |feature|
|
223
239
|
feature.build_batches.map do |batch_args|
|
224
240
|
batch_args.merge(parent_batch_id: parent_batch_id)
|
225
241
|
end
|
@@ -450,7 +466,7 @@ module EasyML
|
|
450
466
|
def after_fit
|
451
467
|
update_sha
|
452
468
|
|
453
|
-
feature_store.compact
|
469
|
+
feature_store.compact if fittable?
|
454
470
|
updates = {
|
455
471
|
fit_at: Time.current,
|
456
472
|
needs_fit: false,
|
@@ -459,6 +475,10 @@ module EasyML
|
|
459
475
|
update!(updates)
|
460
476
|
end
|
461
477
|
|
478
|
+
def after_transform
|
479
|
+
feature_store.compact if !fittable?
|
480
|
+
end
|
481
|
+
|
462
482
|
def unlock!
|
463
483
|
feature_store.unlock!
|
464
484
|
end
|
@@ -517,14 +537,14 @@ module EasyML
|
|
517
537
|
new_sha = compute_sha
|
518
538
|
if new_sha != self.sha
|
519
539
|
self.sha = new_sha
|
520
|
-
self.needs_fit =
|
540
|
+
self.needs_fit = fittable?
|
521
541
|
end
|
522
542
|
end
|
523
543
|
|
524
544
|
def update_from_feature_class
|
525
545
|
if read_attribute(:batch_size) != config.dig(:batch_size)
|
526
546
|
write_attribute(:batch_size, config.dig(:batch_size))
|
527
|
-
self.needs_fit =
|
547
|
+
self.needs_fit = fittable?
|
528
548
|
end
|
529
549
|
|
530
550
|
if self.primary_key != config.dig(:primary_key)
|
data/app/models/easy_ml/model.rb
CHANGED
@@ -179,17 +179,18 @@ module EasyML
|
|
179
179
|
end
|
180
180
|
|
181
181
|
def actually_train(&progress_block)
|
182
|
-
raise untrainable_error unless trainable?
|
183
|
-
|
184
182
|
lock_model do
|
185
183
|
run = pending_run
|
186
184
|
run.wrap_training do
|
185
|
+
raise untrainable_error unless trainable?
|
186
|
+
|
187
187
|
best_params = nil
|
188
188
|
if run.should_tune?
|
189
189
|
best_params = hyperparameter_search(&progress_block)
|
190
|
+
else
|
191
|
+
fit(&progress_block)
|
192
|
+
save
|
190
193
|
end
|
191
|
-
fit(&progress_block)
|
192
|
-
save
|
193
194
|
[self, best_params]
|
194
195
|
end
|
195
196
|
update(is_training: false)
|
@@ -393,6 +394,10 @@ module EasyML
|
|
393
394
|
adapter.after_tuning
|
394
395
|
end
|
395
396
|
|
397
|
+
def cleanup
|
398
|
+
adapter.cleanup
|
399
|
+
end
|
400
|
+
|
396
401
|
def fit_in_batches(tuning: false, batch_size: nil, batch_overlap: nil, batch_key: nil, checkpoint_dir: Rails.root.join("tmp", "xgboost_checkpoints"), &progress_block)
|
397
402
|
adapter.fit_in_batches(tuning: tuning, batch_size: batch_size, batch_overlap: batch_overlap, batch_key: batch_key, checkpoint_dir: checkpoint_dir, &progress_block)
|
398
403
|
end
|
@@ -619,8 +624,8 @@ module EasyML
|
|
619
624
|
private
|
620
625
|
|
621
626
|
def default_evaluation_inputs
|
622
|
-
x_true, y_true = dataset.test(split_ys: true)
|
623
|
-
ds = dataset.test(all_columns: true)
|
627
|
+
x_true, y_true = dataset.processed.test(split_ys: true, all_columns: true)
|
628
|
+
ds = dataset.processed.test(all_columns: true)
|
624
629
|
y_pred = predict(x_true)
|
625
630
|
{
|
626
631
|
x_true: x_true,
|
@@ -37,6 +37,20 @@ module EasyML
|
|
37
37
|
max: 10,
|
38
38
|
step: 0.1,
|
39
39
|
},
|
40
|
+
scale_pos_weight: {
|
41
|
+
label: "Scale Pos Weight",
|
42
|
+
description: "Balance of positive and negative weights",
|
43
|
+
min: 0,
|
44
|
+
max: 200,
|
45
|
+
step: 1,
|
46
|
+
},
|
47
|
+
max_delta_step: {
|
48
|
+
label: "Max Delta Step",
|
49
|
+
description: "Maximum delta step",
|
50
|
+
min: 0,
|
51
|
+
max: 10,
|
52
|
+
step: 1,
|
53
|
+
},
|
40
54
|
gamma: {
|
41
55
|
label: "Gamma",
|
42
56
|
description: "Minimum loss reduction required to make a further partition",
|
@@ -81,11 +95,13 @@ module EasyML
|
|
81
95
|
label: "Histogram",
|
82
96
|
description: "Fast histogram optimized approximate greedy algorithm",
|
83
97
|
},
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
98
|
+
# Only when compiled wih GPU support...
|
99
|
+
# How to make this not a default optoin
|
100
|
+
# {
|
101
|
+
# value: "gpu_hist",
|
102
|
+
# label: "GPU Histogram",
|
103
|
+
# description: "GPU implementation of hist algorithm",
|
104
|
+
# },
|
89
105
|
],
|
90
106
|
},
|
91
107
|
)
|
@@ -50,7 +50,7 @@ module EasyML
|
|
50
50
|
x_valid = x_valid.select(model.dataset.col_order(inference: true))
|
51
51
|
@preprocessed ||= model.preprocess(x_valid, y_valid)
|
52
52
|
y_pred = model.predict(@preprocessed)
|
53
|
-
dataset = model.dataset.valid(all_columns: true)
|
53
|
+
dataset = model.dataset.processed.valid(all_columns: true)
|
54
54
|
|
55
55
|
metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
|
56
56
|
Wandb.log(metrics)
|
@@ -103,7 +103,7 @@ module EasyML
|
|
103
103
|
model.callbacks.detect { |cb| cb.class == Wandb::XGBoostCallback }
|
104
104
|
end
|
105
105
|
|
106
|
-
def track_cumulative_feature_importance
|
106
|
+
def track_cumulative_feature_importance
|
107
107
|
return unless @feature_importances
|
108
108
|
|
109
109
|
project_name = model.adapter.get_wandb_project
|
@@ -127,13 +127,16 @@ module EasyML
|
|
127
127
|
"feature_importance" => bar_plot.__pyptr__,
|
128
128
|
}
|
129
129
|
Wandb.log(log_data)
|
130
|
-
model.adapter.delete_wandb_project if finish
|
131
|
-
Wandb.finish if finish
|
132
130
|
end
|
133
131
|
|
134
132
|
def after_tuning
|
135
133
|
track_cumulative_feature_importance
|
136
134
|
end
|
135
|
+
|
136
|
+
def cleanup
|
137
|
+
model.adapter.delete_wandb_project
|
138
|
+
Wandb.finish
|
139
|
+
end
|
137
140
|
end
|
138
141
|
end
|
139
142
|
end
|
@@ -135,6 +135,12 @@ module EasyML
|
|
135
135
|
end
|
136
136
|
end
|
137
137
|
|
138
|
+
def cleanup
|
139
|
+
model.callbacks.each do |callback|
|
140
|
+
callback.cleanup if callback.respond_to?(:cleanup)
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
138
144
|
def prepare_callbacks(tuner)
|
139
145
|
set_wandb_project(tuner.project_name)
|
140
146
|
|
@@ -421,11 +427,11 @@ module EasyML
|
|
421
427
|
def prepare_data
|
422
428
|
if @d_train.nil?
|
423
429
|
col_order = dataset.col_order
|
424
|
-
x_sample, y_sample = dataset.train(split_ys: true, limit: 5, select: col_order, lazy: true)
|
430
|
+
x_sample, y_sample = dataset.processed.train(split_ys: true, limit: 5, select: col_order, lazy: true)
|
425
431
|
preprocess(x_sample, y_sample) # Ensure we fail fast if the dataset is misconfigured
|
426
|
-
x_train, y_train = dataset.train(split_ys: true, select: col_order, lazy: true)
|
427
|
-
x_valid, y_valid = dataset.valid(split_ys: true, select: col_order, lazy: true)
|
428
|
-
x_test, y_test = dataset.test(split_ys: true, select: col_order, lazy: true)
|
432
|
+
x_train, y_train = dataset.processed.train(split_ys: true, select: col_order, lazy: true)
|
433
|
+
x_valid, y_valid = dataset.processed.valid(split_ys: true, select: col_order, lazy: true)
|
434
|
+
x_test, y_test = dataset.processed.test(split_ys: true, select: col_order, lazy: true)
|
429
435
|
@d_train = preprocess(x_train, y_train)
|
430
436
|
@d_valid = preprocess(x_valid, y_valid)
|
431
437
|
@d_test = preprocess(x_test, y_test)
|
@@ -439,22 +445,19 @@ module EasyML
|
|
439
445
|
end
|
440
446
|
|
441
447
|
def untrainable_columns
|
442
|
-
|
448
|
+
model.dataset.refresh if model.dataset.processed.nil?
|
443
449
|
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
end
|
448
|
-
null_info = df.select(selects).collect
|
449
|
-
null_info.to_hashes.first.compact
|
450
|
-
col_list = null_info.to_hashes.first.transform_values { |v| v > 0 ? v : nil }.compact.keys
|
451
|
-
|
452
|
-
model.dataset.regular_columns(col_list)
|
450
|
+
model.dataset.list_nulls(
|
451
|
+
model.dataset.processed.data(lazy: true)
|
452
|
+
)
|
453
453
|
end
|
454
454
|
|
455
455
|
def preprocess(xs, ys = nil)
|
456
456
|
return xs if xs.is_a?(::XGBoost::DMatrix)
|
457
|
-
|
457
|
+
lazy = xs.is_a?(Polars::LazyFrame)
|
458
|
+
return xs if (lazy ? xs.limit(1).collect : xs).shape[0] == 0
|
459
|
+
|
460
|
+
weights_col = (model.weights_column.nil? || model.weights_column.blank?) ? nil : model.weights_column
|
458
461
|
|
459
462
|
if weights_col == model.dataset.target
|
460
463
|
raise ArgumentError, "Weight column cannot be the target column"
|
@@ -463,7 +466,6 @@ module EasyML
|
|
463
466
|
# Extract feature columns (all columns except label and weight)
|
464
467
|
feature_cols = xs.columns
|
465
468
|
feature_cols -= [weights_col] if weights_col
|
466
|
-
lazy = xs.is_a?(Polars::LazyFrame)
|
467
469
|
|
468
470
|
# Get features, labels and weights
|
469
471
|
features = lazy ? xs.select(feature_cols).collect.to_numo : xs.select(feature_cols).to_numo
|
@@ -18,12 +18,22 @@ module EasyML
|
|
18
18
|
end
|
19
19
|
|
20
20
|
def defaults
|
21
|
-
|
21
|
+
model.adapter.hyperparameters.class.hyperparameter_constants.transform_values do |constant|
|
22
|
+
values = constant.slice(:min, :max, :step, :options)
|
23
|
+
if values.key?(:options)
|
24
|
+
values[:options] = values[:options].map { |option| option[:value] }
|
25
|
+
end
|
26
|
+
values
|
27
|
+
end
|
22
28
|
end
|
23
29
|
|
24
30
|
def run_trial(trial)
|
25
31
|
config = deep_merge_defaults(self.config.clone.deep_symbolize_keys)
|
26
|
-
|
32
|
+
# For first trial, re-use the original hyperparameters, so they
|
33
|
+
# serve as our starting point/imputers
|
34
|
+
unless trial == 1
|
35
|
+
suggest_parameters(trial, config)
|
36
|
+
end
|
27
37
|
yield model
|
28
38
|
end
|
29
39
|
|
@@ -57,8 +67,11 @@ module EasyML
|
|
57
67
|
min = param_config[:min]
|
58
68
|
max = param_config[:max]
|
59
69
|
log = param_config[:log]
|
70
|
+
options = param_config[:options]
|
60
71
|
|
61
|
-
if
|
72
|
+
if options
|
73
|
+
trial.suggest_categorical(param_name.to_s, options)
|
74
|
+
elsif log
|
62
75
|
trial.suggest_loguniform(param_name.to_s, min, max)
|
63
76
|
elsif max.is_a?(Integer) && min.is_a?(Integer)
|
64
77
|
trial.suggest_int(param_name.to_s, min, max)
|
@@ -5,23 +5,6 @@ module EasyML
|
|
5
5
|
class Tuner
|
6
6
|
module Adapters
|
7
7
|
class XGBoostAdapter < BaseAdapter
|
8
|
-
def defaults
|
9
|
-
{
|
10
|
-
learning_rate: {
|
11
|
-
min: 0.001,
|
12
|
-
max: 0.1,
|
13
|
-
log: true,
|
14
|
-
},
|
15
|
-
n_estimators: {
|
16
|
-
min: 100,
|
17
|
-
max: 1_000,
|
18
|
-
},
|
19
|
-
max_depth: {
|
20
|
-
min: 2,
|
21
|
-
max: 20,
|
22
|
-
},
|
23
|
-
}
|
24
|
-
end
|
25
8
|
end
|
26
9
|
end
|
27
10
|
end
|
data/lib/easy_ml/core/tuner.rb
CHANGED
@@ -73,13 +73,13 @@ module EasyML
|
|
73
73
|
model.task = task
|
74
74
|
|
75
75
|
model.dataset.refresh if model.dataset.needs_refresh?
|
76
|
-
x_valid, y_valid = model.dataset.valid(split_ys: true, all_columns: true)
|
76
|
+
x_valid, y_valid = model.dataset.processed.valid(split_ys: true, all_columns: true)
|
77
77
|
x_normalized = model.dataset.normalize(x_valid, inference: true)
|
78
78
|
x_normalized = model.preprocess(x_normalized)
|
79
79
|
self.x_valid = x_valid
|
80
80
|
self.y_valid = y_valid
|
81
81
|
self.x_normalized = x_normalized
|
82
|
-
self.dataset = model.dataset.valid(all_columns: true)
|
82
|
+
self.dataset = model.dataset.processed.valid(all_columns: true)
|
83
83
|
adapter.tune_started_at = tune_started_at
|
84
84
|
adapter.x_valid = x_valid
|
85
85
|
adapter.y_valid = y_valid
|
@@ -108,7 +108,6 @@ module EasyML
|
|
108
108
|
end
|
109
109
|
end
|
110
110
|
|
111
|
-
model.after_tuning
|
112
111
|
return nil if tuner_job.tuner_runs.all?(&:failed?)
|
113
112
|
|
114
113
|
best_run = tuner_job.best_run
|
@@ -118,6 +117,13 @@ module EasyML
|
|
118
117
|
status: :success,
|
119
118
|
completed_at: Time.current,
|
120
119
|
)
|
120
|
+
model.after_tuning
|
121
|
+
if best_run&.hyperparameters.present?
|
122
|
+
model.hyperparameters = best_run.hyperparameters
|
123
|
+
model.fit
|
124
|
+
model.save
|
125
|
+
end
|
126
|
+
model.cleanup
|
121
127
|
|
122
128
|
best_run&.hyperparameters
|
123
129
|
rescue StandardError => e
|
@@ -35,6 +35,18 @@ module EasyML
|
|
35
35
|
|
36
36
|
private
|
37
37
|
|
38
|
+
def list_df_nulls(df)
|
39
|
+
df = df.lazy
|
40
|
+
|
41
|
+
columns = df.columns
|
42
|
+
selects = columns.map do |col|
|
43
|
+
Polars.col(col).null_count.alias(col)
|
44
|
+
end
|
45
|
+
null_info = df.select(selects).collect
|
46
|
+
null_info.to_hashes.first.compact
|
47
|
+
null_info.to_hashes.first.transform_values { |v| v > 0 ? v : nil }.compact.keys
|
48
|
+
end
|
49
|
+
|
38
50
|
def apply_defaults(kwargs)
|
39
51
|
options = kwargs.dup
|
40
52
|
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
module EasyML
|
3
2
|
module Data
|
4
3
|
class DatasetManager
|
@@ -8,11 +7,17 @@ module EasyML
|
|
8
7
|
return query_dataframes(lazy_frames, schema)
|
9
8
|
end
|
10
9
|
|
10
|
+
def list_nulls
|
11
|
+
df = lazy_frames
|
12
|
+
list_df_nulls(df)
|
13
|
+
end
|
14
|
+
|
11
15
|
def schema
|
12
16
|
input.schema
|
13
17
|
end
|
14
18
|
|
15
|
-
|
19
|
+
private
|
20
|
+
|
16
21
|
def lazy_frames
|
17
22
|
input.lazy
|
18
23
|
end
|
@@ -20,4 +25,4 @@ module EasyML
|
|
20
25
|
end
|
21
26
|
end
|
22
27
|
end
|
23
|
-
end
|
28
|
+
end
|
@@ -17,12 +17,18 @@ module EasyML
|
|
17
17
|
).query
|
18
18
|
end
|
19
19
|
|
20
|
-
def self.schema(input, **kwargs, &block)
|
20
|
+
def self.schema(input = nil, **kwargs, &block)
|
21
21
|
adapter(input).new(
|
22
22
|
kwargs.merge!(input: input), &block
|
23
23
|
).schema
|
24
24
|
end
|
25
25
|
|
26
|
+
def self.list_nulls(input = nil, **kwargs, &block)
|
27
|
+
adapter(input).new(
|
28
|
+
kwargs.merge!(input: input), &block
|
29
|
+
).list_nulls
|
30
|
+
end
|
31
|
+
|
26
32
|
def self.files(dir)
|
27
33
|
Dir.glob(::File.join(dir, "**/*.{parquet}"))
|
28
34
|
end
|
@@ -44,13 +44,21 @@ module EasyML
|
|
44
44
|
Reader.schema(input, **kwargs, &block)
|
45
45
|
end
|
46
46
|
|
47
|
+
def list_nulls(input = nil, **kwargs, &block)
|
48
|
+
Reader.list_nulls(input, **kwargs, &block)
|
49
|
+
end
|
50
|
+
|
47
51
|
def num_rows
|
48
52
|
Reader.num_rows
|
49
53
|
end
|
50
54
|
end
|
51
55
|
|
52
|
-
def
|
53
|
-
Reader.
|
56
|
+
def list_nulls(input = nil, **kwargs, &block)
|
57
|
+
Reader.list_nulls(input, **kwargs, &block)
|
58
|
+
end
|
59
|
+
|
60
|
+
def num_rows(input = nil, **kwargs, &block)
|
61
|
+
Reader.num_rows(input, **kwargs, &block)
|
54
62
|
end
|
55
63
|
|
56
64
|
def query(input = nil, **kwargs, &block)
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module EasyML
|
2
|
+
module Data
|
3
|
+
class Embeddings
|
4
|
+
class Adapters
|
5
|
+
attr_accessor :model, :config
|
6
|
+
|
7
|
+
ADAPTERS = {
|
8
|
+
anthropic: Langchain::LLM::Anthropic,
|
9
|
+
gemini: Langchain::LLM::GoogleGemini,
|
10
|
+
openai: Langchain::LLM::OpenAI,
|
11
|
+
ollama: Langchain::LLM::Ollama,
|
12
|
+
}
|
13
|
+
|
14
|
+
DEFAULTS = {
|
15
|
+
api_key: {
|
16
|
+
anthropic: ENV["ANTHROPIC_API_KEY"],
|
17
|
+
gemini: ENV["GEMINI_API_KEY"],
|
18
|
+
openai: ENV["OPENAI_API_KEY"],
|
19
|
+
ollama: ENV["OLLAMA_API_KEY"],
|
20
|
+
},
|
21
|
+
}
|
22
|
+
|
23
|
+
def initialize(model, config = {})
|
24
|
+
@model = model.to_sym
|
25
|
+
@config = config.symbolize_keys
|
26
|
+
apply_defaults
|
27
|
+
end
|
28
|
+
|
29
|
+
def embed(df, col)
|
30
|
+
pick
|
31
|
+
texts = df[col].to_a
|
32
|
+
df = df.with_column(
|
33
|
+
embeddings: adapter.embed(text: texts),
|
34
|
+
)
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
def pick
|
40
|
+
@adapter ||= ADAPTERS[@model].new(config)
|
41
|
+
self
|
42
|
+
end
|
43
|
+
|
44
|
+
def apply_defaults
|
45
|
+
@config = @config.deep_symbolize_keys
|
46
|
+
|
47
|
+
DEFAULTS.each do |k, v|
|
48
|
+
unless @config.key?(k)
|
49
|
+
@config[k] = v[@model]
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module EasyML
|
2
|
+
module Data
|
3
|
+
class Embeddings
|
4
|
+
COMPRESSION_DEFAULT = {
|
5
|
+
present: :balanced,
|
6
|
+
}
|
7
|
+
|
8
|
+
attr_reader :df, :column, :model, :adapter, :compression,
|
9
|
+
:embeddings, :compressed_embeddings
|
10
|
+
|
11
|
+
def initialize(options = {})
|
12
|
+
@df = options[:df]
|
13
|
+
@column = options[:column]
|
14
|
+
@model = options[:model]
|
15
|
+
@config = options[:config] || {}
|
16
|
+
@compression = options[:compression] || COMPRESSION_DEFAULT
|
17
|
+
end
|
18
|
+
|
19
|
+
def create
|
20
|
+
embed
|
21
|
+
compress
|
22
|
+
end
|
23
|
+
|
24
|
+
def embed
|
25
|
+
@embeddings ||= adapter.embed(df, column)
|
26
|
+
end
|
27
|
+
|
28
|
+
def compress
|
29
|
+
@compressed_embeddings ||= compression_adapter.compress(embeddings)
|
30
|
+
end
|
31
|
+
|
32
|
+
private
|
33
|
+
|
34
|
+
def adapter
|
35
|
+
@adapter ||= EasyML::Data::Embeddings::Adapters.new(model, config)
|
36
|
+
end
|
37
|
+
|
38
|
+
def compression_adapter
|
39
|
+
@compression_adapter ||= EasyML::Data::Embeddings::Compression.new(compression)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
data/lib/easy_ml/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: easy_ml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0.pre.
|
4
|
+
version: 0.2.0.pre.rc82
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brett Shollenberger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-02-
|
11
|
+
date: 2025-02-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|
@@ -719,6 +719,9 @@ files:
|
|
719
719
|
- lib/easy_ml/data/dataset_manager/writer/partitioned.rb
|
720
720
|
- lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb
|
721
721
|
- lib/easy_ml/data/date_converter.rb
|
722
|
+
- lib/easy_ml/data/embeddings.rb
|
723
|
+
- lib/easy_ml/data/embeddings/adapters.rb
|
724
|
+
- lib/easy_ml/data/embeddings/compression.rb
|
722
725
|
- lib/easy_ml/data/partition.rb
|
723
726
|
- lib/easy_ml/data/partition/boundaries.rb
|
724
727
|
- lib/easy_ml/data/polars_column.rb
|