easy_ml 0.2.0.pre.rc78 → 0.2.0.pre.rc81
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/datasets_controller.rb +3 -3
- data/app/controllers/easy_ml/models_controller.rb +2 -2
- data/app/models/easy_ml/column/imputers/imputer.rb +2 -0
- data/app/models/easy_ml/column_list.rb +2 -3
- data/app/models/easy_ml/dataset.rb +22 -11
- data/app/models/easy_ml/feature.rb +22 -6
- data/app/models/easy_ml/model.rb +9 -4
- data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +21 -5
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +6 -3
- data/app/models/easy_ml/models/xgboost.rb +18 -12
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +16 -3
- data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +0 -17
- data/lib/easy_ml/core/tuner.rb +7 -1
- data/lib/easy_ml/data/dataset_manager/reader/base.rb +12 -0
- data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb +8 -3
- data/lib/easy_ml/data/dataset_manager/reader/file.rb +5 -0
- data/lib/easy_ml/data/dataset_manager/reader.rb +7 -1
- data/lib/easy_ml/data/dataset_manager.rb +10 -2
- data/lib/easy_ml/data/embeddings/adapters.rb +56 -0
- data/lib/easy_ml/data/embeddings/compression.rb +0 -0
- data/lib/easy_ml/data/embeddings.rb +43 -0
- data/lib/easy_ml/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 169873e9ea5e1b00f7a4e499a2aeffc377615757cdd4b5fe6d70c8c454b9d426
|
4
|
+
data.tar.gz: fc1d4509606f011bd3adbdf367e767a3e9dfc4fdbb6b5cd91bb413f72da364b2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f4ea106a66d3185f612e481b607cef21f5453a00ef6eb12558e53f43aa1d68b8f20d1e950c1840d97180ba6271db9b9b42c8a2d480c22ca8c1e33d1b768590d2
|
7
|
+
data.tar.gz: 24885cdecd46d612be8b8ce7d4bd9889bdf60420bc82c01993cfe0168e454ebdaaa70899f5cf9cc879ff89a895fd00b1761d01c945dafffc784f7bc1e4a2fb8e
|
@@ -23,7 +23,7 @@
|
|
23
23
|
module EasyML
|
24
24
|
class DatasetsController < ApplicationController
|
25
25
|
def index
|
26
|
-
datasets = Dataset.all.order(id: :desc)
|
26
|
+
datasets = Dataset.all.includes(:columns, :datasource).order(id: :desc)
|
27
27
|
|
28
28
|
render inertia: "pages/DatasetsPage", props: {
|
29
29
|
datasets: datasets.map { |dataset| dataset_to_json_small(dataset) },
|
@@ -80,7 +80,7 @@ module EasyML
|
|
80
80
|
if dataset_params[:features_attributes].present?
|
81
81
|
# Clean up any feature IDs that don't exist anymore
|
82
82
|
feature_ids = dataset_params[:features_attributes].map { |attrs| attrs[:id] }.compact
|
83
|
-
existing_feature_ids =
|
83
|
+
existing_feature_ids = dataset.features.where(id: feature_ids).pluck(:id)
|
84
84
|
|
85
85
|
params[:dataset][:features_attributes].each do |attrs|
|
86
86
|
if attrs[:id].present? && !existing_feature_ids.include?(attrs[:id].to_i)
|
@@ -93,7 +93,7 @@ module EasyML
|
|
93
93
|
attrs[:feature_class] if attrs[:id].blank?
|
94
94
|
}.compact
|
95
95
|
|
96
|
-
existing_features =
|
96
|
+
existing_features = dataset.features.where(feature_class: feature_classes)
|
97
97
|
|
98
98
|
# Update params with existing feature IDs
|
99
99
|
existing_features.each do |feature|
|
@@ -41,7 +41,7 @@ module EasyML
|
|
41
41
|
render inertia: "pages/EditModelPage", props: {
|
42
42
|
model: model_to_json(model),
|
43
43
|
datasets: EasyML::Dataset.all.map do |dataset|
|
44
|
-
|
44
|
+
dataset_to_json_small(dataset)
|
45
45
|
end,
|
46
46
|
constants: EasyML::Model.constants,
|
47
47
|
}
|
@@ -167,7 +167,7 @@ module EasyML
|
|
167
167
|
private
|
168
168
|
|
169
169
|
def includes_list
|
170
|
-
[:retraining_runs, :retraining_job, dataset: [:
|
170
|
+
[:retraining_runs, :retraining_job, dataset: [:features, :splitter, columns: [:lineages]]]
|
171
171
|
end
|
172
172
|
|
173
173
|
def model_params
|
@@ -28,12 +28,11 @@ module EasyML
|
|
28
28
|
if computed
|
29
29
|
cols = column_list.computed
|
30
30
|
else
|
31
|
-
cols = column_list
|
31
|
+
cols = column_list
|
32
32
|
end
|
33
33
|
|
34
34
|
by_name = cols.index_by(&:name)
|
35
|
-
|
36
|
-
column = by_name[col]
|
35
|
+
cols.each do |column|
|
37
36
|
df = column.transform(df, inference: inference, computed: computed) if column
|
38
37
|
end
|
39
38
|
|
@@ -232,20 +232,20 @@ module EasyML
|
|
232
232
|
cleanup
|
233
233
|
refresh_datasource!
|
234
234
|
split_data
|
235
|
-
|
235
|
+
fit
|
236
236
|
end
|
237
237
|
|
238
238
|
def prepare
|
239
239
|
prepare_features
|
240
240
|
refresh_datasource
|
241
241
|
split_data
|
242
|
-
|
242
|
+
fit
|
243
243
|
end
|
244
244
|
|
245
245
|
def actually_refresh
|
246
246
|
refreshing do
|
247
|
-
|
248
|
-
|
247
|
+
fit
|
248
|
+
normalize_all
|
249
249
|
fully_reload
|
250
250
|
learn
|
251
251
|
learn_statistics(type: :processed) # After processing data, we learn any new statistics
|
@@ -385,6 +385,7 @@ module EasyML
|
|
385
385
|
def unlock!
|
386
386
|
Support::Lockable.unlock!(lock_key)
|
387
387
|
features.each(&:unlock!)
|
388
|
+
true
|
388
389
|
end
|
389
390
|
|
390
391
|
def locked?
|
@@ -427,12 +428,6 @@ module EasyML
|
|
427
428
|
(read_attribute(:statistics) || {}).with_indifferent_access
|
428
429
|
end
|
429
430
|
|
430
|
-
def process_data
|
431
|
-
learn(delete: false)
|
432
|
-
fit
|
433
|
-
normalize_all
|
434
|
-
end
|
435
|
-
|
436
431
|
def needs_learn?
|
437
432
|
return true if columns_need_refresh?
|
438
433
|
|
@@ -483,7 +478,7 @@ module EasyML
|
|
483
478
|
df = apply_missing_columns(df, inference: inference)
|
484
479
|
df = columns.transform(df, inference: inference)
|
485
480
|
df = apply_features(df, features)
|
486
|
-
df = columns.transform(df, inference: inference
|
481
|
+
df = columns.transform(df, inference: inference)
|
487
482
|
df = apply_column_mask(df, inference: inference) unless all_columns
|
488
483
|
df = drop_nulls(df) unless inference
|
489
484
|
df, = processed.split_features_targets(df, true, target) if split_ys
|
@@ -722,6 +717,20 @@ module EasyML
|
|
722
717
|
reload
|
723
718
|
end
|
724
719
|
|
720
|
+
def list_nulls(input = nil, list_raw = false)
|
721
|
+
input = data(lazy: true) if input.nil?
|
722
|
+
|
723
|
+
case input
|
724
|
+
when Polars::DataFrame
|
725
|
+
input = input.lazy
|
726
|
+
when String, Symbol
|
727
|
+
input = input.to_sym
|
728
|
+
input = send(input).data(lazy: true)
|
729
|
+
end
|
730
|
+
col_list = EasyML::Data::DatasetManager.list_nulls(input)
|
731
|
+
list_raw ? col_list : regular_columns(col_list)
|
732
|
+
end
|
733
|
+
|
725
734
|
private
|
726
735
|
|
727
736
|
def apply_date_splitter_config
|
@@ -798,6 +807,7 @@ module EasyML
|
|
798
807
|
processed_df = normalize(df, all_columns: true)
|
799
808
|
processed.save(segment, processed_df)
|
800
809
|
end
|
810
|
+
features.select { |f| !f.fittable? }.each(&:after_transform)
|
801
811
|
@normalized = true
|
802
812
|
end
|
803
813
|
|
@@ -840,6 +850,7 @@ module EasyML
|
|
840
850
|
end
|
841
851
|
|
842
852
|
def fit
|
853
|
+
learn(delete: false)
|
843
854
|
learn_statistics(type: :raw)
|
844
855
|
end
|
845
856
|
|
@@ -78,11 +78,18 @@ module EasyML
|
|
78
78
|
scope :never_applied, -> { where(applied_at: nil) }
|
79
79
|
scope :never_fit, -> do
|
80
80
|
fittable = where(fit_at: nil)
|
81
|
-
fittable = fittable.select
|
81
|
+
fittable = fittable.select(&:fittable?)
|
82
82
|
where(id: fittable.map(&:id))
|
83
83
|
end
|
84
84
|
scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
|
85
|
-
scope :ready_to_apply, ->
|
85
|
+
scope :ready_to_apply, -> do
|
86
|
+
base = where(needs_fit: false).where.not(id: has_changes.map(&:id))
|
87
|
+
doesnt_fit = where_no_fit
|
88
|
+
where(id: base.map(&:id).concat(doesnt_fit.map(&:id)))
|
89
|
+
end
|
90
|
+
|
91
|
+
scope :fittable, -> { all.select(&:fittable?) }
|
92
|
+
scope :where_no_fit, -> { all.reject(&:fittable?) }
|
86
93
|
|
87
94
|
before_save :apply_defaults, if: :new_record?
|
88
95
|
before_save :update_sha
|
@@ -100,6 +107,10 @@ module EasyML
|
|
100
107
|
feature_klass.present?
|
101
108
|
end
|
102
109
|
|
110
|
+
def fittable?
|
111
|
+
adapter.respond_to?(:fit)
|
112
|
+
end
|
113
|
+
|
103
114
|
def adapter
|
104
115
|
@adapter ||= feature_klass.new
|
105
116
|
end
|
@@ -213,13 +224,14 @@ module EasyML
|
|
213
224
|
end
|
214
225
|
|
215
226
|
def wipe
|
227
|
+
update(needs_fit: true) if fittable?
|
216
228
|
feature_store.wipe
|
217
229
|
end
|
218
230
|
|
219
231
|
def fit(features: [self], async: false)
|
220
232
|
ordered_features = features.sort_by(&:feature_position)
|
221
233
|
parent_batch_id = Random.uuid
|
222
|
-
jobs = ordered_features.map do |feature|
|
234
|
+
jobs = ordered_features.select(&:fittable?).map do |feature|
|
223
235
|
feature.build_batches.map do |batch_args|
|
224
236
|
batch_args.merge(parent_batch_id: parent_batch_id)
|
225
237
|
end
|
@@ -450,7 +462,7 @@ module EasyML
|
|
450
462
|
def after_fit
|
451
463
|
update_sha
|
452
464
|
|
453
|
-
feature_store.compact
|
465
|
+
feature_store.compact if fittable?
|
454
466
|
updates = {
|
455
467
|
fit_at: Time.current,
|
456
468
|
needs_fit: false,
|
@@ -459,6 +471,10 @@ module EasyML
|
|
459
471
|
update!(updates)
|
460
472
|
end
|
461
473
|
|
474
|
+
def after_transform
|
475
|
+
feature_store.compact if !fittable?
|
476
|
+
end
|
477
|
+
|
462
478
|
def unlock!
|
463
479
|
feature_store.unlock!
|
464
480
|
end
|
@@ -517,14 +533,14 @@ module EasyML
|
|
517
533
|
new_sha = compute_sha
|
518
534
|
if new_sha != self.sha
|
519
535
|
self.sha = new_sha
|
520
|
-
self.needs_fit =
|
536
|
+
self.needs_fit = fittable?
|
521
537
|
end
|
522
538
|
end
|
523
539
|
|
524
540
|
def update_from_feature_class
|
525
541
|
if read_attribute(:batch_size) != config.dig(:batch_size)
|
526
542
|
write_attribute(:batch_size, config.dig(:batch_size))
|
527
|
-
self.needs_fit =
|
543
|
+
self.needs_fit = fittable?
|
528
544
|
end
|
529
545
|
|
530
546
|
if self.primary_key != config.dig(:primary_key)
|
data/app/models/easy_ml/model.rb
CHANGED
@@ -179,17 +179,18 @@ module EasyML
|
|
179
179
|
end
|
180
180
|
|
181
181
|
def actually_train(&progress_block)
|
182
|
-
raise untrainable_error unless trainable?
|
183
|
-
|
184
182
|
lock_model do
|
185
183
|
run = pending_run
|
186
184
|
run.wrap_training do
|
185
|
+
raise untrainable_error unless trainable?
|
186
|
+
|
187
187
|
best_params = nil
|
188
188
|
if run.should_tune?
|
189
189
|
best_params = hyperparameter_search(&progress_block)
|
190
|
+
else
|
191
|
+
fit(&progress_block)
|
192
|
+
save
|
190
193
|
end
|
191
|
-
fit(&progress_block)
|
192
|
-
save
|
193
194
|
[self, best_params]
|
194
195
|
end
|
195
196
|
update(is_training: false)
|
@@ -393,6 +394,10 @@ module EasyML
|
|
393
394
|
adapter.after_tuning
|
394
395
|
end
|
395
396
|
|
397
|
+
def cleanup
|
398
|
+
adapter.cleanup
|
399
|
+
end
|
400
|
+
|
396
401
|
def fit_in_batches(tuning: false, batch_size: nil, batch_overlap: nil, batch_key: nil, checkpoint_dir: Rails.root.join("tmp", "xgboost_checkpoints"), &progress_block)
|
397
402
|
adapter.fit_in_batches(tuning: tuning, batch_size: batch_size, batch_overlap: batch_overlap, batch_key: batch_key, checkpoint_dir: checkpoint_dir, &progress_block)
|
398
403
|
end
|
@@ -37,6 +37,20 @@ module EasyML
|
|
37
37
|
max: 10,
|
38
38
|
step: 0.1,
|
39
39
|
},
|
40
|
+
scale_pos_weight: {
|
41
|
+
label: "Scale Pos Weight",
|
42
|
+
description: "Balance of positive and negative weights",
|
43
|
+
min: 0,
|
44
|
+
max: 200,
|
45
|
+
step: 1,
|
46
|
+
},
|
47
|
+
max_delta_step: {
|
48
|
+
label: "Max Delta Step",
|
49
|
+
description: "Maximum delta step",
|
50
|
+
min: 0,
|
51
|
+
max: 10,
|
52
|
+
step: 1,
|
53
|
+
},
|
40
54
|
gamma: {
|
41
55
|
label: "Gamma",
|
42
56
|
description: "Minimum loss reduction required to make a further partition",
|
@@ -81,11 +95,13 @@ module EasyML
|
|
81
95
|
label: "Histogram",
|
82
96
|
description: "Fast histogram optimized approximate greedy algorithm",
|
83
97
|
},
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
98
|
+
# Only when compiled with GPU support...
|
99
|
+
# How to make this not a default option
|
100
|
+
# {
|
101
|
+
# value: "gpu_hist",
|
102
|
+
# label: "GPU Histogram",
|
103
|
+
# description: "GPU implementation of hist algorithm",
|
104
|
+
# },
|
89
105
|
],
|
90
106
|
},
|
91
107
|
)
|
@@ -103,7 +103,7 @@ module EasyML
|
|
103
103
|
model.callbacks.detect { |cb| cb.class == Wandb::XGBoostCallback }
|
104
104
|
end
|
105
105
|
|
106
|
-
def track_cumulative_feature_importance
|
106
|
+
def track_cumulative_feature_importance
|
107
107
|
return unless @feature_importances
|
108
108
|
|
109
109
|
project_name = model.adapter.get_wandb_project
|
@@ -127,13 +127,16 @@ module EasyML
|
|
127
127
|
"feature_importance" => bar_plot.__pyptr__,
|
128
128
|
}
|
129
129
|
Wandb.log(log_data)
|
130
|
-
model.adapter.delete_wandb_project if finish
|
131
|
-
Wandb.finish if finish
|
132
130
|
end
|
133
131
|
|
134
132
|
def after_tuning
|
135
133
|
track_cumulative_feature_importance
|
136
134
|
end
|
135
|
+
|
136
|
+
def cleanup
|
137
|
+
model.adapter.delete_wandb_project
|
138
|
+
Wandb.finish
|
139
|
+
end
|
137
140
|
end
|
138
141
|
end
|
139
142
|
end
|
@@ -135,6 +135,12 @@ module EasyML
|
|
135
135
|
end
|
136
136
|
end
|
137
137
|
|
138
|
+
def cleanup
|
139
|
+
model.callbacks.each do |callback|
|
140
|
+
callback.cleanup if callback.respond_to?(:cleanup)
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
138
144
|
def prepare_callbacks(tuner)
|
139
145
|
set_wandb_project(tuner.project_name)
|
140
146
|
|
@@ -439,21 +445,18 @@ module EasyML
|
|
439
445
|
end
|
440
446
|
|
441
447
|
def untrainable_columns
|
442
|
-
|
443
|
-
|
444
|
-
columns = df.columns
|
445
|
-
selects = columns.map do |col|
|
446
|
-
Polars.col(col).null_count.alias(col)
|
447
|
-
end
|
448
|
-
null_info = df.select(selects).collect
|
449
|
-
null_info.to_hashes.first.compact
|
450
|
-
col_list = null_info.to_hashes.first.transform_values { |v| v > 0 ? v : nil }.compact.keys
|
448
|
+
model.dataset.refresh if model.dataset.processed.nil?
|
451
449
|
|
452
|
-
model.dataset.
|
450
|
+
model.dataset.list_nulls(
|
451
|
+
model.dataset.processed.data(lazy: true)
|
452
|
+
)
|
453
453
|
end
|
454
454
|
|
455
455
|
def preprocess(xs, ys = nil)
|
456
456
|
return xs if xs.is_a?(::XGBoost::DMatrix)
|
457
|
+
lazy = xs.is_a?(Polars::LazyFrame)
|
458
|
+
return xs if (lazy ? xs.limit(1).collect : xs).shape[0] == 0
|
459
|
+
|
457
460
|
weights_col = model.weights_column || nil
|
458
461
|
|
459
462
|
if weights_col == model.dataset.target
|
@@ -463,10 +466,13 @@ module EasyML
|
|
463
466
|
# Extract feature columns (all columns except label and weight)
|
464
467
|
feature_cols = xs.columns
|
465
468
|
feature_cols -= [weights_col] if weights_col
|
466
|
-
lazy = xs.is_a?(Polars::LazyFrame)
|
467
469
|
|
468
470
|
# Get features, labels and weights
|
469
|
-
|
471
|
+
begin
|
472
|
+
features = lazy ? xs.select(feature_cols).collect.to_numo : xs.select(feature_cols).to_numo
|
473
|
+
rescue => e
|
474
|
+
binding.pry
|
475
|
+
end
|
470
476
|
weights = weights_col ? (lazy ? xs.select(weights_col).collect.to_numo : xs.select(weights_col).to_numo) : nil
|
471
477
|
weights = weights.flatten if weights
|
472
478
|
if ys.present?
|
@@ -18,12 +18,22 @@ module EasyML
|
|
18
18
|
end
|
19
19
|
|
20
20
|
def defaults
|
21
|
-
|
21
|
+
model.adapter.hyperparameters.class.hyperparameter_constants.transform_values do |constant|
|
22
|
+
values = constant.slice(:min, :max, :step, :options)
|
23
|
+
if values.key?(:options)
|
24
|
+
values[:options] = values[:options].map { |option| option[:value] }
|
25
|
+
end
|
26
|
+
values
|
27
|
+
end
|
22
28
|
end
|
23
29
|
|
24
30
|
def run_trial(trial)
|
25
31
|
config = deep_merge_defaults(self.config.clone.deep_symbolize_keys)
|
26
|
-
|
32
|
+
# For first trial, re-use the original hyperparameters, so they
|
33
|
+
# serve as our starting point/imputers
|
34
|
+
unless trial == 1
|
35
|
+
suggest_parameters(trial, config)
|
36
|
+
end
|
27
37
|
yield model
|
28
38
|
end
|
29
39
|
|
@@ -57,8 +67,11 @@ module EasyML
|
|
57
67
|
min = param_config[:min]
|
58
68
|
max = param_config[:max]
|
59
69
|
log = param_config[:log]
|
70
|
+
options = param_config[:options]
|
60
71
|
|
61
|
-
if
|
72
|
+
if options
|
73
|
+
trial.suggest_categorical(param_name.to_s, options)
|
74
|
+
elsif log
|
62
75
|
trial.suggest_loguniform(param_name.to_s, min, max)
|
63
76
|
elsif max.is_a?(Integer) && min.is_a?(Integer)
|
64
77
|
trial.suggest_int(param_name.to_s, min, max)
|
@@ -5,23 +5,6 @@ module EasyML
|
|
5
5
|
class Tuner
|
6
6
|
module Adapters
|
7
7
|
class XGBoostAdapter < BaseAdapter
|
8
|
-
def defaults
|
9
|
-
{
|
10
|
-
learning_rate: {
|
11
|
-
min: 0.001,
|
12
|
-
max: 0.1,
|
13
|
-
log: true,
|
14
|
-
},
|
15
|
-
n_estimators: {
|
16
|
-
min: 100,
|
17
|
-
max: 1_000,
|
18
|
-
},
|
19
|
-
max_depth: {
|
20
|
-
min: 2,
|
21
|
-
max: 20,
|
22
|
-
},
|
23
|
-
}
|
24
|
-
end
|
25
8
|
end
|
26
9
|
end
|
27
10
|
end
|
data/lib/easy_ml/core/tuner.rb
CHANGED
@@ -108,7 +108,6 @@ module EasyML
|
|
108
108
|
end
|
109
109
|
end
|
110
110
|
|
111
|
-
model.after_tuning
|
112
111
|
return nil if tuner_job.tuner_runs.all?(&:failed?)
|
113
112
|
|
114
113
|
best_run = tuner_job.best_run
|
@@ -118,6 +117,13 @@ module EasyML
|
|
118
117
|
status: :success,
|
119
118
|
completed_at: Time.current,
|
120
119
|
)
|
120
|
+
model.after_tuning
|
121
|
+
if best_run&.hyperparameters.present?
|
122
|
+
model.hyperparameters = best_run.hyperparameters
|
123
|
+
model.fit
|
124
|
+
model.save
|
125
|
+
end
|
126
|
+
model.cleanup
|
121
127
|
|
122
128
|
best_run&.hyperparameters
|
123
129
|
rescue StandardError => e
|
@@ -35,6 +35,18 @@ module EasyML
|
|
35
35
|
|
36
36
|
private
|
37
37
|
|
38
|
+
def list_df_nulls(df)
|
39
|
+
df = df.lazy
|
40
|
+
|
41
|
+
columns = df.columns
|
42
|
+
selects = columns.map do |col|
|
43
|
+
Polars.col(col).null_count.alias(col)
|
44
|
+
end
|
45
|
+
null_info = df.select(selects).collect
|
46
|
+
null_info.to_hashes.first.compact
|
47
|
+
null_info.to_hashes.first.transform_values { |v| v > 0 ? v : nil }.compact.keys
|
48
|
+
end
|
49
|
+
|
38
50
|
def apply_defaults(kwargs)
|
39
51
|
options = kwargs.dup
|
40
52
|
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
module EasyML
|
3
2
|
module Data
|
4
3
|
class DatasetManager
|
@@ -8,11 +7,17 @@ module EasyML
|
|
8
7
|
return query_dataframes(lazy_frames, schema)
|
9
8
|
end
|
10
9
|
|
10
|
+
def list_nulls
|
11
|
+
df = lazy_frames
|
12
|
+
list_df_nulls(df)
|
13
|
+
end
|
14
|
+
|
11
15
|
def schema
|
12
16
|
input.schema
|
13
17
|
end
|
14
18
|
|
15
|
-
|
19
|
+
private
|
20
|
+
|
16
21
|
def lazy_frames
|
17
22
|
input.lazy
|
18
23
|
end
|
@@ -20,4 +25,4 @@ module EasyML
|
|
20
25
|
end
|
21
26
|
end
|
22
27
|
end
|
23
|
-
end
|
28
|
+
end
|
@@ -17,12 +17,18 @@ module EasyML
|
|
17
17
|
).query
|
18
18
|
end
|
19
19
|
|
20
|
-
def self.schema(input, **kwargs, &block)
|
20
|
+
def self.schema(input = nil, **kwargs, &block)
|
21
21
|
adapter(input).new(
|
22
22
|
kwargs.merge!(input: input), &block
|
23
23
|
).schema
|
24
24
|
end
|
25
25
|
|
26
|
+
def self.list_nulls(input = nil, **kwargs, &block)
|
27
|
+
adapter(input).new(
|
28
|
+
kwargs.merge!(input: input), &block
|
29
|
+
).list_nulls
|
30
|
+
end
|
31
|
+
|
26
32
|
def self.files(dir)
|
27
33
|
Dir.glob(::File.join(dir, "**/*.{parquet}"))
|
28
34
|
end
|
@@ -44,13 +44,21 @@ module EasyML
|
|
44
44
|
Reader.schema(input, **kwargs, &block)
|
45
45
|
end
|
46
46
|
|
47
|
+
def list_nulls(input = nil, **kwargs, &block)
|
48
|
+
Reader.list_nulls(input, **kwargs, &block)
|
49
|
+
end
|
50
|
+
|
47
51
|
def num_rows
|
48
52
|
Reader.num_rows
|
49
53
|
end
|
50
54
|
end
|
51
55
|
|
52
|
-
def
|
53
|
-
Reader.
|
56
|
+
def list_nulls(input = nil, **kwargs, &block)
|
57
|
+
Reader.list_nulls(input, **kwargs, &block)
|
58
|
+
end
|
59
|
+
|
60
|
+
def num_rows(input = nil, **kwargs, &block)
|
61
|
+
Reader.num_rows(input, **kwargs, &block)
|
54
62
|
end
|
55
63
|
|
56
64
|
def query(input = nil, **kwargs, &block)
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module EasyML
|
2
|
+
module Data
|
3
|
+
class Embeddings
|
4
|
+
class Adapters
|
5
|
+
attr_accessor :model, :config
|
6
|
+
|
7
|
+
ADAPTERS = {
|
8
|
+
anthropic: Langchain::LLM::Anthropic,
|
9
|
+
gemini: Langchain::LLM::GoogleGemini,
|
10
|
+
openai: Langchain::LLM::OpenAI,
|
11
|
+
ollama: Langchain::LLM::Ollama,
|
12
|
+
}
|
13
|
+
|
14
|
+
DEFAULTS = {
|
15
|
+
api_key: {
|
16
|
+
anthropic: ENV["ANTHROPIC_API_KEY"],
|
17
|
+
gemini: ENV["GEMINI_API_KEY"],
|
18
|
+
openai: ENV["OPENAI_API_KEY"],
|
19
|
+
ollama: ENV["OLLAMA_API_KEY"],
|
20
|
+
},
|
21
|
+
}
|
22
|
+
|
23
|
+
def initialize(model, config = {})
|
24
|
+
@model = model.to_sym
|
25
|
+
@config = config.symbolize_keys
|
26
|
+
apply_defaults
|
27
|
+
end
|
28
|
+
|
29
|
+
def embed(df, col)
|
30
|
+
pick
|
31
|
+
texts = df[col].to_a
|
32
|
+
df = df.with_column(
|
33
|
+
embeddings: adapter.embed(text: texts),
|
34
|
+
)
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
def pick
|
40
|
+
@adapter ||= ADAPTERS[@model].new(config)
|
41
|
+
self
|
42
|
+
end
|
43
|
+
|
44
|
+
def apply_defaults
|
45
|
+
@config = @config.deep_symbolize_keys
|
46
|
+
|
47
|
+
DEFAULTS.each do |k, v|
|
48
|
+
unless @config.key?(k)
|
49
|
+
@config[k] = v[@model]
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
File without changes
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module EasyML
|
2
|
+
module Data
|
3
|
+
class Embeddings
|
4
|
+
COMPRESSION_DEFAULT = {
|
5
|
+
present: :balanced,
|
6
|
+
}
|
7
|
+
|
8
|
+
attr_reader :df, :column, :model, :adapter, :compression,
|
9
|
+
:embeddings, :compressed_embeddings
|
10
|
+
|
11
|
+
def initialize(options = {})
|
12
|
+
@df = options[:df]
|
13
|
+
@column = options[:column]
|
14
|
+
@model = options[:model]
|
15
|
+
@config = options[:config] || {}
|
16
|
+
@compression = options[:compression] || COMPRESSION_DEFAULT
|
17
|
+
end
|
18
|
+
|
19
|
+
def create
|
20
|
+
embed
|
21
|
+
compress
|
22
|
+
end
|
23
|
+
|
24
|
+
def embed
|
25
|
+
@embeddings ||= adapter.embed(df, column)
|
26
|
+
end
|
27
|
+
|
28
|
+
def compress
|
29
|
+
@compressed_embeddings ||= compression_adapter.compress(embeddings)
|
30
|
+
end
|
31
|
+
|
32
|
+
private
|
33
|
+
|
34
|
+
def adapter
|
35
|
+
@adapter ||= EasyML::Data::Embeddings::Adapters.new(model, config)
|
36
|
+
end
|
37
|
+
|
38
|
+
def compression_adapter
|
39
|
+
@compression_adapter ||= EasyML::Data::Embeddings::Compression.new(compression)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
data/lib/easy_ml/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: easy_ml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0.pre.
|
4
|
+
version: 0.2.0.pre.rc81
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brett Shollenberger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-02-
|
11
|
+
date: 2025-02-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|
@@ -719,6 +719,9 @@ files:
|
|
719
719
|
- lib/easy_ml/data/dataset_manager/writer/partitioned.rb
|
720
720
|
- lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb
|
721
721
|
- lib/easy_ml/data/date_converter.rb
|
722
|
+
- lib/easy_ml/data/embeddings.rb
|
723
|
+
- lib/easy_ml/data/embeddings/adapters.rb
|
724
|
+
- lib/easy_ml/data/embeddings/compression.rb
|
722
725
|
- lib/easy_ml/data/partition.rb
|
723
726
|
- lib/easy_ml/data/partition/boundaries.rb
|
724
727
|
- lib/easy_ml/data/polars_column.rb
|