easy_ml 0.2.0.pre.rc78 → 0.2.0.pre.rc82

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 13858267adb9445f665a01214f2109bc23dd63a76d5ab0ae502c60ac94a6d2d4
4
- data.tar.gz: bc1b37afabf4757ce1e7e311699d6e8ac0bea2230025d8e696ada4071b0b3563
3
+ metadata.gz: ce245d6900c4c5c001c0de9982894ccf6b41faef31e8c958dc540ef05fe426e4
4
+ data.tar.gz: a120f14076a9ff83ca6afb8b0bd651b9ea9ed0185d42f23b82f5de6b2a4de831
5
5
  SHA512:
6
- metadata.gz: ccd5fc9e0b9529da07012a1745f826cf8e88391b24e3df20ba636c9e6ccf853172d18916cccc3087692873971a9dd2b72aa7151e286824df5cb500255610d603
7
- data.tar.gz: 6034abbae5e25a00f204a649c62b568a90a76481c6ff91aaadd766fe515fe76dbf6692bebabe905c5a4bc1b9642717c77f4cbfda6b43684624a5e32517f73d99
6
+ metadata.gz: db2b7292bf07b5122a7949a111c56a25f8848496eaadfc94f101a271054fd4c21d401b4c941f69d7579c8cdc9887e68c4105f11720dba13aa7fd0fcdafb79b81
7
+ data.tar.gz: f5dc8ffee52fb67b6cda8e4462e540900d528dd9c15ff49faa8ca45f21f9e4968129b996a5018a91801aae431a65b5274d64e69045e5feeeec6f12f09900cda3
@@ -23,7 +23,7 @@
23
23
  module EasyML
24
24
  class DatasetsController < ApplicationController
25
25
  def index
26
- datasets = Dataset.all.order(id: :desc)
26
+ datasets = Dataset.all.includes(:columns, :datasource).order(id: :desc)
27
27
 
28
28
  render inertia: "pages/DatasetsPage", props: {
29
29
  datasets: datasets.map { |dataset| dataset_to_json_small(dataset) },
@@ -80,7 +80,7 @@ module EasyML
80
80
  if dataset_params[:features_attributes].present?
81
81
  # Clean up any feature IDs that don't exist anymore
82
82
  feature_ids = dataset_params[:features_attributes].map { |attrs| attrs[:id] }.compact
83
- existing_feature_ids = Feature.where(id: feature_ids).pluck(:id)
83
+ existing_feature_ids = dataset.features.where(id: feature_ids).pluck(:id)
84
84
 
85
85
  params[:dataset][:features_attributes].each do |attrs|
86
86
  if attrs[:id].present? && !existing_feature_ids.include?(attrs[:id].to_i)
@@ -93,7 +93,7 @@ module EasyML
93
93
  attrs[:feature_class] if attrs[:id].blank?
94
94
  }.compact
95
95
 
96
- existing_features = Feature.where(feature_class: feature_classes)
96
+ existing_features = dataset.features.where(feature_class: feature_classes)
97
97
 
98
98
  # Update params with existing feature IDs
99
99
  existing_features.each do |feature|
@@ -41,7 +41,7 @@ module EasyML
41
41
  render inertia: "pages/EditModelPage", props: {
42
42
  model: model_to_json(model),
43
43
  datasets: EasyML::Dataset.all.map do |dataset|
44
- dataset_to_json(dataset)
44
+ dataset_to_json_small(dataset)
45
45
  end,
46
46
  constants: EasyML::Model.constants,
47
47
  }
@@ -167,7 +167,7 @@ module EasyML
167
167
  private
168
168
 
169
169
  def includes_list
170
- [:retraining_runs, :retraining_job, dataset: [:columns, :features, :splitter]]
170
+ [:retraining_runs, :retraining_job, dataset: [:features, :splitter, columns: [:lineages]]]
171
171
  end
172
172
 
173
173
  def model_params
@@ -10,13 +10,13 @@ module EasyML
10
10
 
11
11
  @last_activity = Time.current
12
12
  setup_signal_traps
13
- @monitor_thread = start_monitor_thread
13
+ # @monitor_thread = start_monitor_thread
14
14
 
15
15
  @model.actually_train do |iteration_info|
16
16
  @last_activity = Time.current
17
17
  end
18
18
  ensure
19
- @monitor_thread&.exit
19
+ # @monitor_thread&.exit
20
20
  @model.unlock!
21
21
  end
22
22
 
@@ -54,6 +54,8 @@ module EasyML
54
54
  return df unless anything?
55
55
 
56
56
  adapters.reduce(df) do |df, adapter|
57
+ next df if df.columns.exclude?(column.name)
58
+
57
59
  adapter.transform(df)
58
60
  end
59
61
  end
@@ -28,12 +28,11 @@ module EasyML
28
28
  if computed
29
29
  cols = column_list.computed
30
30
  else
31
- cols = column_list.raw
31
+ cols = column_list
32
32
  end
33
33
 
34
34
  by_name = cols.index_by(&:name)
35
- df.columns.each do |col|
36
- column = by_name[col]
35
+ cols.each do |column|
37
36
  df = column.transform(df, inference: inference, computed: computed) if column
38
37
  end
39
38
 
@@ -232,20 +232,20 @@ module EasyML
232
232
  cleanup
233
233
  refresh_datasource!
234
234
  split_data
235
- process_data
235
+ fit
236
236
  end
237
237
 
238
238
  def prepare
239
239
  prepare_features
240
240
  refresh_datasource
241
241
  split_data
242
- process_data
242
+ fit
243
243
  end
244
244
 
245
245
  def actually_refresh
246
246
  refreshing do
247
- learn(delete: false) # After syncing datasource, learn new statistics + sync columns
248
- process_data
247
+ fit
248
+ normalize_all
249
249
  fully_reload
250
250
  learn
251
251
  learn_statistics(type: :processed) # After processing data, we learn any new statistics
@@ -385,6 +385,7 @@ module EasyML
385
385
  def unlock!
386
386
  Support::Lockable.unlock!(lock_key)
387
387
  features.each(&:unlock!)
388
+ true
388
389
  end
389
390
 
390
391
  def locked?
@@ -427,12 +428,6 @@ module EasyML
427
428
  (read_attribute(:statistics) || {}).with_indifferent_access
428
429
  end
429
430
 
430
- def process_data
431
- learn(delete: false)
432
- fit
433
- normalize_all
434
- end
435
-
436
431
  def needs_learn?
437
432
  return true if columns_need_refresh?
438
433
 
@@ -483,7 +478,7 @@ module EasyML
483
478
  df = apply_missing_columns(df, inference: inference)
484
479
  df = columns.transform(df, inference: inference)
485
480
  df = apply_features(df, features)
486
- df = columns.transform(df, inference: inference, computed: true)
481
+ df = columns.transform(df, inference: inference)
487
482
  df = apply_column_mask(df, inference: inference) unless all_columns
488
483
  df = drop_nulls(df) unless inference
489
484
  df, = processed.split_features_targets(df, true, target) if split_ys
@@ -722,6 +717,20 @@ module EasyML
722
717
  reload
723
718
  end
724
719
 
720
+ def list_nulls(input = nil, list_raw = false)
721
+ input = data(lazy: true) if input.nil?
722
+
723
+ case input
724
+ when Polars::DataFrame
725
+ input = input.lazy
726
+ when String, Symbol
727
+ input = input.to_sym
728
+ input = send(input).data(lazy: true)
729
+ end
730
+ col_list = EasyML::Data::DatasetManager.list_nulls(input)
731
+ list_raw ? col_list : regular_columns(col_list)
732
+ end
733
+
725
734
  private
726
735
 
727
736
  def apply_date_splitter_config
@@ -798,6 +807,7 @@ module EasyML
798
807
  processed_df = normalize(df, all_columns: true)
799
808
  processed.save(segment, processed_df)
800
809
  end
810
+ features.select { |f| !f.fittable? }.each(&:after_transform)
801
811
  @normalized = true
802
812
  end
803
813
 
@@ -840,6 +850,7 @@ module EasyML
840
850
  end
841
851
 
842
852
  def fit
853
+ learn(delete: false)
843
854
  learn_statistics(type: :raw)
844
855
  end
845
856
 
@@ -78,11 +78,21 @@ module EasyML
78
78
  scope :never_applied, -> { where(applied_at: nil) }
79
79
  scope :never_fit, -> do
80
80
  fittable = where(fit_at: nil)
81
- fittable = fittable.select { |f| f.adapter.respond_to?(:fit) }
81
+ fittable = fittable.select(&:fittable?)
82
82
  where(id: fittable.map(&:id))
83
83
  end
84
- scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
85
- scope :ready_to_apply, -> { where(needs_fit: false).where.not(id: has_changes.map(&:id)) }
84
+ scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit).or(datasource_was_refreshed) }
85
+ scope :datasource_was_refreshed, -> do
86
+ where(id: all.select(&:datasource_was_refreshed?).map(&:id))
87
+ end
88
+ scope :ready_to_apply, -> do
89
+ base = where(needs_fit: false).where.not(id: has_changes.map(&:id))
90
+ doesnt_fit = where_no_fit
91
+ where(id: base.map(&:id).concat(doesnt_fit.map(&:id)))
92
+ end
93
+
94
+ scope :fittable, -> { all.select(&:fittable?) }
95
+ scope :where_no_fit, -> { all.reject(&:fittable?) }
86
96
 
87
97
  before_save :apply_defaults, if: :new_record?
88
98
  before_save :update_sha
@@ -100,6 +110,10 @@ module EasyML
100
110
  feature_klass.present?
101
111
  end
102
112
 
113
+ def fittable?
114
+ adapter.respond_to?(:fit)
115
+ end
116
+
103
117
  def adapter
104
118
  @adapter ||= feature_klass.new
105
119
  end
@@ -133,6 +147,7 @@ module EasyML
133
147
  end
134
148
 
135
149
  def datasource_was_refreshed?
150
+ return false unless fittable?
136
151
  return true if fit_at.nil?
137
152
  return false if dataset.datasource.refreshed_at.nil?
138
153
 
@@ -213,13 +228,14 @@ module EasyML
213
228
  end
214
229
 
215
230
  def wipe
231
+ update(needs_fit: true) if fittable?
216
232
  feature_store.wipe
217
233
  end
218
234
 
219
235
  def fit(features: [self], async: false)
220
236
  ordered_features = features.sort_by(&:feature_position)
221
237
  parent_batch_id = Random.uuid
222
- jobs = ordered_features.map do |feature|
238
+ jobs = ordered_features.select(&:fittable?).map do |feature|
223
239
  feature.build_batches.map do |batch_args|
224
240
  batch_args.merge(parent_batch_id: parent_batch_id)
225
241
  end
@@ -450,7 +466,7 @@ module EasyML
450
466
  def after_fit
451
467
  update_sha
452
468
 
453
- feature_store.compact
469
+ feature_store.compact if fittable?
454
470
  updates = {
455
471
  fit_at: Time.current,
456
472
  needs_fit: false,
@@ -459,6 +475,10 @@ module EasyML
459
475
  update!(updates)
460
476
  end
461
477
 
478
+ def after_transform
479
+ feature_store.compact if !fittable?
480
+ end
481
+
462
482
  def unlock!
463
483
  feature_store.unlock!
464
484
  end
@@ -517,14 +537,14 @@ module EasyML
517
537
  new_sha = compute_sha
518
538
  if new_sha != self.sha
519
539
  self.sha = new_sha
520
- self.needs_fit = true
540
+ self.needs_fit = fittable?
521
541
  end
522
542
  end
523
543
 
524
544
  def update_from_feature_class
525
545
  if read_attribute(:batch_size) != config.dig(:batch_size)
526
546
  write_attribute(:batch_size, config.dig(:batch_size))
527
- self.needs_fit = true
547
+ self.needs_fit = fittable?
528
548
  end
529
549
 
530
550
  if self.primary_key != config.dig(:primary_key)
@@ -179,17 +179,18 @@ module EasyML
179
179
  end
180
180
 
181
181
  def actually_train(&progress_block)
182
- raise untrainable_error unless trainable?
183
-
184
182
  lock_model do
185
183
  run = pending_run
186
184
  run.wrap_training do
185
+ raise untrainable_error unless trainable?
186
+
187
187
  best_params = nil
188
188
  if run.should_tune?
189
189
  best_params = hyperparameter_search(&progress_block)
190
+ else
191
+ fit(&progress_block)
192
+ save
190
193
  end
191
- fit(&progress_block)
192
- save
193
194
  [self, best_params]
194
195
  end
195
196
  update(is_training: false)
@@ -393,6 +394,10 @@ module EasyML
393
394
  adapter.after_tuning
394
395
  end
395
396
 
397
+ def cleanup
398
+ adapter.cleanup
399
+ end
400
+
396
401
  def fit_in_batches(tuning: false, batch_size: nil, batch_overlap: nil, batch_key: nil, checkpoint_dir: Rails.root.join("tmp", "xgboost_checkpoints"), &progress_block)
397
402
  adapter.fit_in_batches(tuning: tuning, batch_size: batch_size, batch_overlap: batch_overlap, batch_key: batch_key, checkpoint_dir: checkpoint_dir, &progress_block)
398
403
  end
@@ -619,8 +624,8 @@ module EasyML
619
624
  private
620
625
 
621
626
  def default_evaluation_inputs
622
- x_true, y_true = dataset.test(split_ys: true)
623
- ds = dataset.test(all_columns: true)
627
+ x_true, y_true = dataset.processed.test(split_ys: true, all_columns: true)
628
+ ds = dataset.processed.test(all_columns: true)
624
629
  y_pred = predict(x_true)
625
630
  {
626
631
  x_true: x_true,
@@ -37,6 +37,20 @@ module EasyML
37
37
  max: 10,
38
38
  step: 0.1,
39
39
  },
40
+ scale_pos_weight: {
41
+ label: "Scale Pos Weight",
42
+ description: "Balance of positive and negative weights",
43
+ min: 0,
44
+ max: 200,
45
+ step: 1,
46
+ },
47
+ max_delta_step: {
48
+ label: "Max Delta Step",
49
+ description: "Maximum delta step",
50
+ min: 0,
51
+ max: 10,
52
+ step: 1,
53
+ },
40
54
  gamma: {
41
55
  label: "Gamma",
42
56
  description: "Minimum loss reduction required to make a further partition",
@@ -81,11 +95,13 @@ module EasyML
81
95
  label: "Histogram",
82
96
  description: "Fast histogram optimized approximate greedy algorithm",
83
97
  },
84
- {
85
- value: "gpu_hist",
86
- label: "GPU Histogram",
87
- description: "GPU implementation of hist algorithm",
88
- },
98
+ # Only when compiled wih GPU support...
99
+ # How to make this not a default optoin
100
+ # {
101
+ # value: "gpu_hist",
102
+ # label: "GPU Histogram",
103
+ # description: "GPU implementation of hist algorithm",
104
+ # },
89
105
  ],
90
106
  },
91
107
  )
@@ -50,7 +50,7 @@ module EasyML
50
50
  x_valid = x_valid.select(model.dataset.col_order(inference: true))
51
51
  @preprocessed ||= model.preprocess(x_valid, y_valid)
52
52
  y_pred = model.predict(@preprocessed)
53
- dataset = model.dataset.valid(all_columns: true)
53
+ dataset = model.dataset.processed.valid(all_columns: true)
54
54
 
55
55
  metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
56
56
  Wandb.log(metrics)
@@ -103,7 +103,7 @@ module EasyML
103
103
  model.callbacks.detect { |cb| cb.class == Wandb::XGBoostCallback }
104
104
  end
105
105
 
106
- def track_cumulative_feature_importance(finish = true)
106
+ def track_cumulative_feature_importance
107
107
  return unless @feature_importances
108
108
 
109
109
  project_name = model.adapter.get_wandb_project
@@ -127,13 +127,16 @@ module EasyML
127
127
  "feature_importance" => bar_plot.__pyptr__,
128
128
  }
129
129
  Wandb.log(log_data)
130
- model.adapter.delete_wandb_project if finish
131
- Wandb.finish if finish
132
130
  end
133
131
 
134
132
  def after_tuning
135
133
  track_cumulative_feature_importance
136
134
  end
135
+
136
+ def cleanup
137
+ model.adapter.delete_wandb_project
138
+ Wandb.finish
139
+ end
137
140
  end
138
141
  end
139
142
  end
@@ -135,6 +135,12 @@ module EasyML
135
135
  end
136
136
  end
137
137
 
138
+ def cleanup
139
+ model.callbacks.each do |callback|
140
+ callback.cleanup if callback.respond_to?(:cleanup)
141
+ end
142
+ end
143
+
138
144
  def prepare_callbacks(tuner)
139
145
  set_wandb_project(tuner.project_name)
140
146
 
@@ -421,11 +427,11 @@ module EasyML
421
427
  def prepare_data
422
428
  if @d_train.nil?
423
429
  col_order = dataset.col_order
424
- x_sample, y_sample = dataset.train(split_ys: true, limit: 5, select: col_order, lazy: true)
430
+ x_sample, y_sample = dataset.processed.train(split_ys: true, limit: 5, select: col_order, lazy: true)
425
431
  preprocess(x_sample, y_sample) # Ensure we fail fast if the dataset is misconfigured
426
- x_train, y_train = dataset.train(split_ys: true, select: col_order, lazy: true)
427
- x_valid, y_valid = dataset.valid(split_ys: true, select: col_order, lazy: true)
428
- x_test, y_test = dataset.test(split_ys: true, select: col_order, lazy: true)
432
+ x_train, y_train = dataset.processed.train(split_ys: true, select: col_order, lazy: true)
433
+ x_valid, y_valid = dataset.processed.valid(split_ys: true, select: col_order, lazy: true)
434
+ x_test, y_test = dataset.processed.test(split_ys: true, select: col_order, lazy: true)
429
435
  @d_train = preprocess(x_train, y_train)
430
436
  @d_valid = preprocess(x_valid, y_valid)
431
437
  @d_test = preprocess(x_test, y_test)
@@ -439,22 +445,19 @@ module EasyML
439
445
  end
440
446
 
441
447
  def untrainable_columns
442
- df = model.dataset.processed.data(lazy: true)
448
+ model.dataset.refresh if model.dataset.processed.nil?
443
449
 
444
- columns = df.columns
445
- selects = columns.map do |col|
446
- Polars.col(col).null_count.alias(col)
447
- end
448
- null_info = df.select(selects).collect
449
- null_info.to_hashes.first.compact
450
- col_list = null_info.to_hashes.first.transform_values { |v| v > 0 ? v : nil }.compact.keys
451
-
452
- model.dataset.regular_columns(col_list)
450
+ model.dataset.list_nulls(
451
+ model.dataset.processed.data(lazy: true)
452
+ )
453
453
  end
454
454
 
455
455
  def preprocess(xs, ys = nil)
456
456
  return xs if xs.is_a?(::XGBoost::DMatrix)
457
- weights_col = model.weights_column || nil
457
+ lazy = xs.is_a?(Polars::LazyFrame)
458
+ return xs if (lazy ? xs.limit(1).collect : xs).shape[0] == 0
459
+
460
+ weights_col = (model.weights_column.nil? || model.weights_column.blank?) ? nil : model.weights_column
458
461
 
459
462
  if weights_col == model.dataset.target
460
463
  raise ArgumentError, "Weight column cannot be the target column"
@@ -463,7 +466,6 @@ module EasyML
463
466
  # Extract feature columns (all columns except label and weight)
464
467
  feature_cols = xs.columns
465
468
  feature_cols -= [weights_col] if weights_col
466
- lazy = xs.is_a?(Polars::LazyFrame)
467
469
 
468
470
  # Get features, labels and weights
469
471
  features = lazy ? xs.select(feature_cols).collect.to_numo : xs.select(feature_cols).to_numo
@@ -18,12 +18,22 @@ module EasyML
18
18
  end
19
19
 
20
20
  def defaults
21
- {}
21
+ model.adapter.hyperparameters.class.hyperparameter_constants.transform_values do |constant|
22
+ values = constant.slice(:min, :max, :step, :options)
23
+ if values.key?(:options)
24
+ values[:options] = values[:options].map { |option| option[:value] }
25
+ end
26
+ values
27
+ end
22
28
  end
23
29
 
24
30
  def run_trial(trial)
25
31
  config = deep_merge_defaults(self.config.clone.deep_symbolize_keys)
26
- suggest_parameters(trial, config)
32
+ # For first trial, re-use the original hyperparameters, so they
33
+ # serve as our starting point/imputers
34
+ unless trial == 1
35
+ suggest_parameters(trial, config)
36
+ end
27
37
  yield model
28
38
  end
29
39
 
@@ -57,8 +67,11 @@ module EasyML
57
67
  min = param_config[:min]
58
68
  max = param_config[:max]
59
69
  log = param_config[:log]
70
+ options = param_config[:options]
60
71
 
61
- if log
72
+ if options
73
+ trial.suggest_categorical(param_name.to_s, options)
74
+ elsif log
62
75
  trial.suggest_loguniform(param_name.to_s, min, max)
63
76
  elsif max.is_a?(Integer) && min.is_a?(Integer)
64
77
  trial.suggest_int(param_name.to_s, min, max)
@@ -5,23 +5,6 @@ module EasyML
5
5
  class Tuner
6
6
  module Adapters
7
7
  class XGBoostAdapter < BaseAdapter
8
- def defaults
9
- {
10
- learning_rate: {
11
- min: 0.001,
12
- max: 0.1,
13
- log: true,
14
- },
15
- n_estimators: {
16
- min: 100,
17
- max: 1_000,
18
- },
19
- max_depth: {
20
- min: 2,
21
- max: 20,
22
- },
23
- }
24
- end
25
8
  end
26
9
  end
27
10
  end
@@ -73,13 +73,13 @@ module EasyML
73
73
  model.task = task
74
74
 
75
75
  model.dataset.refresh if model.dataset.needs_refresh?
76
- x_valid, y_valid = model.dataset.valid(split_ys: true, all_columns: true)
76
+ x_valid, y_valid = model.dataset.processed.valid(split_ys: true, all_columns: true)
77
77
  x_normalized = model.dataset.normalize(x_valid, inference: true)
78
78
  x_normalized = model.preprocess(x_normalized)
79
79
  self.x_valid = x_valid
80
80
  self.y_valid = y_valid
81
81
  self.x_normalized = x_normalized
82
- self.dataset = model.dataset.valid(all_columns: true)
82
+ self.dataset = model.dataset.processed.valid(all_columns: true)
83
83
  adapter.tune_started_at = tune_started_at
84
84
  adapter.x_valid = x_valid
85
85
  adapter.y_valid = y_valid
@@ -108,7 +108,6 @@ module EasyML
108
108
  end
109
109
  end
110
110
 
111
- model.after_tuning
112
111
  return nil if tuner_job.tuner_runs.all?(&:failed?)
113
112
 
114
113
  best_run = tuner_job.best_run
@@ -118,6 +117,13 @@ module EasyML
118
117
  status: :success,
119
118
  completed_at: Time.current,
120
119
  )
120
+ model.after_tuning
121
+ if best_run&.hyperparameters.present?
122
+ model.hyperparameters = best_run.hyperparameters
123
+ model.fit
124
+ model.save
125
+ end
126
+ model.cleanup
121
127
 
122
128
  best_run&.hyperparameters
123
129
  rescue StandardError => e
@@ -35,6 +35,18 @@ module EasyML
35
35
 
36
36
  private
37
37
 
38
+ def list_df_nulls(df)
39
+ df = df.lazy
40
+
41
+ columns = df.columns
42
+ selects = columns.map do |col|
43
+ Polars.col(col).null_count.alias(col)
44
+ end
45
+ null_info = df.select(selects).collect
46
+ null_info.to_hashes.first.compact
47
+ null_info.to_hashes.first.transform_values { |v| v > 0 ? v : nil }.compact.keys
48
+ end
49
+
38
50
  def apply_defaults(kwargs)
39
51
  options = kwargs.dup
40
52
 
@@ -1,4 +1,3 @@
1
-
2
1
  module EasyML
3
2
  module Data
4
3
  class DatasetManager
@@ -8,11 +7,17 @@ module EasyML
8
7
  return query_dataframes(lazy_frames, schema)
9
8
  end
10
9
 
10
+ def list_nulls
11
+ df = lazy_frames
12
+ list_df_nulls(df)
13
+ end
14
+
11
15
  def schema
12
16
  input.schema
13
17
  end
14
18
 
15
- private
19
+ private
20
+
16
21
  def lazy_frames
17
22
  input.lazy
18
23
  end
@@ -20,4 +25,4 @@ module EasyML
20
25
  end
21
26
  end
22
27
  end
23
- end
28
+ end
@@ -15,6 +15,11 @@ module EasyML
15
15
  return Batch.new(options, &block).query
16
16
  end
17
17
 
18
+ def list_nulls
19
+ df = dataframe.lazy
20
+ list_df_nulls(df)
21
+ end
22
+
18
23
  def schema
19
24
  @schema ||= files.any? ? Polars.read_parquet_schema(files.first) : nil
20
25
  end
@@ -17,12 +17,18 @@ module EasyML
17
17
  ).query
18
18
  end
19
19
 
20
- def self.schema(input, **kwargs, &block)
20
+ def self.schema(input = nil, **kwargs, &block)
21
21
  adapter(input).new(
22
22
  kwargs.merge!(input: input), &block
23
23
  ).schema
24
24
  end
25
25
 
26
+ def self.list_nulls(input = nil, **kwargs, &block)
27
+ adapter(input).new(
28
+ kwargs.merge!(input: input), &block
29
+ ).list_nulls
30
+ end
31
+
26
32
  def self.files(dir)
27
33
  Dir.glob(::File.join(dir, "**/*.{parquet}"))
28
34
  end
@@ -95,7 +95,7 @@ module EasyML
95
95
  keylist = unique_id_key(subdir: "keylist")
96
96
 
97
97
  acquire_lock(keylist) do |suo|
98
- suo.client.sadd(keylist, key)
98
+ suo.client.sadd?(keylist, key)
99
99
  end
100
100
  end
101
101
 
@@ -44,13 +44,21 @@ module EasyML
44
44
  Reader.schema(input, **kwargs, &block)
45
45
  end
46
46
 
47
+ def list_nulls(input = nil, **kwargs, &block)
48
+ Reader.list_nulls(input, **kwargs, &block)
49
+ end
50
+
47
51
  def num_rows
48
52
  Reader.num_rows
49
53
  end
50
54
  end
51
55
 
52
- def num_rows
53
- Reader.num_rows(root_dir)
56
+ def list_nulls(input = nil, **kwargs, &block)
57
+ Reader.list_nulls(input, **kwargs, &block)
58
+ end
59
+
60
+ def num_rows(input = nil, **kwargs, &block)
61
+ Reader.num_rows(input, **kwargs, &block)
54
62
  end
55
63
 
56
64
  def query(input = nil, **kwargs, &block)
@@ -0,0 +1,56 @@
1
+ module EasyML
2
+ module Data
3
+ class Embeddings
4
+ class Adapters
5
+ attr_accessor :model, :config
6
+
7
+ ADAPTERS = {
8
+ anthropic: Langchain::LLM::Anthropic,
9
+ gemini: Langchain::LLM::GoogleGemini,
10
+ openai: Langchain::LLM::OpenAI,
11
+ ollama: Langchain::LLM::Ollama,
12
+ }
13
+
14
+ DEFAULTS = {
15
+ api_key: {
16
+ anthropic: ENV["ANTHROPIC_API_KEY"],
17
+ gemini: ENV["GEMINI_API_KEY"],
18
+ openai: ENV["OPENAI_API_KEY"],
19
+ ollama: ENV["OLLAMA_API_KEY"],
20
+ },
21
+ }
22
+
23
+ def initialize(model, config = {})
24
+ @model = model.to_sym
25
+ @config = config.symbolize_keys
26
+ apply_defaults
27
+ end
28
+
29
+ def embed(df, col)
30
+ pick
31
+ texts = df[col].to_a
32
+ df = df.with_column(
33
+ embeddings: adapter.embed(text: texts),
34
+ )
35
+ end
36
+
37
+ private
38
+
39
+ def pick
40
+ @adapter ||= ADAPTERS[@model].new(config)
41
+ self
42
+ end
43
+
44
+ def apply_defaults
45
+ @config = @config.deep_symbolize_keys
46
+
47
+ DEFAULTS.each do |k, v|
48
+ unless @config.key?(k)
49
+ @config[k] = v[@model]
50
+ end
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,43 @@
1
+ module EasyML
2
+ module Data
3
+ class Embeddings
4
+ COMPRESSION_DEFAULT = {
5
+ present: :balanced,
6
+ }
7
+
8
+ attr_reader :df, :column, :model, :adapter, :compression,
9
+ :embeddings, :compressed_embeddings
10
+
11
+ def initialize(options = {})
12
+ @df = options[:df]
13
+ @column = options[:column]
14
+ @model = options[:model]
15
+ @config = options[:config] || {}
16
+ @compression = options[:compression] || COMPRESSION_DEFAULT
17
+ end
18
+
19
+ def create
20
+ embed
21
+ compress
22
+ end
23
+
24
+ def embed
25
+ @embeddings ||= adapter.embed(df, column)
26
+ end
27
+
28
+ def compress
29
+ @compressed_embeddings ||= compression_adapter.compress(embeddings)
30
+ end
31
+
32
+ private
33
+
34
+ def adapter
35
+ @adapter ||= EasyML::Data::Embeddings::Adapters.new(model, config)
36
+ end
37
+
38
+ def compression_adapter
39
+ @compression_adapter ||= EasyML::Data::Embeddings::Compression.new(compression)
40
+ end
41
+ end
42
+ end
43
+ end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EasyML
4
- VERSION = "0.2.0-rc78"
4
+ VERSION = "0.2.0-rc82"
5
5
 
6
6
  module Version
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: easy_ml
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0.pre.rc78
4
+ version: 0.2.0.pre.rc82
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brett Shollenberger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-02-18 00:00:00.000000000 Z
11
+ date: 2025-02-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activerecord
@@ -719,6 +719,9 @@ files:
719
719
  - lib/easy_ml/data/dataset_manager/writer/partitioned.rb
720
720
  - lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb
721
721
  - lib/easy_ml/data/date_converter.rb
722
+ - lib/easy_ml/data/embeddings.rb
723
+ - lib/easy_ml/data/embeddings/adapters.rb
724
+ - lib/easy_ml/data/embeddings/compression.rb
722
725
  - lib/easy_ml/data/partition.rb
723
726
  - lib/easy_ml/data/partition/boundaries.rb
724
727
  - lib/easy_ml/data/polars_column.rb