easy_ml 0.2.0.pre.rc78 → 0.2.0.pre.rc81

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 13858267adb9445f665a01214f2109bc23dd63a76d5ab0ae502c60ac94a6d2d4
-  data.tar.gz: bc1b37afabf4757ce1e7e311699d6e8ac0bea2230025d8e696ada4071b0b3563
+  metadata.gz: 169873e9ea5e1b00f7a4e499a2aeffc377615757cdd4b5fe6d70c8c454b9d426
+  data.tar.gz: fc1d4509606f011bd3adbdf367e767a3e9dfc4fdbb6b5cd91bb413f72da364b2
 SHA512:
-  metadata.gz: ccd5fc9e0b9529da07012a1745f826cf8e88391b24e3df20ba636c9e6ccf853172d18916cccc3087692873971a9dd2b72aa7151e286824df5cb500255610d603
-  data.tar.gz: 6034abbae5e25a00f204a649c62b568a90a76481c6ff91aaadd766fe515fe76dbf6692bebabe905c5a4bc1b9642717c77f4cbfda6b43684624a5e32517f73d99
+  metadata.gz: f4ea106a66d3185f612e481b607cef21f5453a00ef6eb12558e53f43aa1d68b8f20d1e950c1840d97180ba6271db9b9b42c8a2d480c22ca8c1e33d1b768590d2
+  data.tar.gz: 24885cdecd46d612be8b8ce7d4bd9889bdf60420bc82c01993cfe0168e454ebdaaa70899f5cf9cc879ff89a895fd00b1761d01c945dafffc784f7bc1e4a2fb8e
@@ -23,7 +23,7 @@
 module EasyML
   class DatasetsController < ApplicationController
     def index
-      datasets = Dataset.all.order(id: :desc)
+      datasets = Dataset.all.includes(:columns, :datasource).order(id: :desc)

       render inertia: "pages/DatasetsPage", props: {
         datasets: datasets.map { |dataset| dataset_to_json_small(dataset) },
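
A note on the index change above: eager-loading :columns and :datasource fetches both associations up front, so serializing each dataset no longer issues per-record queries (the classic N+1). A minimal sketch of the difference, assuming standard ActiveRecord associations:

    # N+1: one query for datasets, plus one per dataset for its columns
    EasyML::Dataset.all.order(id: :desc).map { |d| d.columns.size }

    # Eager-loaded: a fixed number of queries regardless of dataset count
    EasyML::Dataset.all.includes(:columns, :datasource).order(id: :desc)
                   .map { |d| d.columns.size }
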
@@ -80,7 +80,7 @@ module EasyML
       if dataset_params[:features_attributes].present?
         # Clean up any feature IDs that don't exist anymore
         feature_ids = dataset_params[:features_attributes].map { |attrs| attrs[:id] }.compact
-        existing_feature_ids = Feature.where(id: feature_ids).pluck(:id)
+        existing_feature_ids = dataset.features.where(id: feature_ids).pluck(:id)

         params[:dataset][:features_attributes].each do |attrs|
           if attrs[:id].present? && !existing_feature_ids.include?(attrs[:id].to_i)
@@ -93,7 +93,7 @@ module EasyML
           attrs[:feature_class] if attrs[:id].blank?
         }.compact

-        existing_features = Feature.where(feature_class: feature_classes)
+        existing_features = dataset.features.where(feature_class: feature_classes)

         # Update params with existing feature IDs
         existing_features.each do |feature|
@@ -41,7 +41,7 @@ module EasyML
      render inertia: "pages/EditModelPage", props: {
        model: model_to_json(model),
        datasets: EasyML::Dataset.all.map do |dataset|
-          dataset_to_json(dataset)
+          dataset_to_json_small(dataset)
        end,
        constants: EasyML::Model.constants,
      }
@@ -167,7 +167,7 @@
    private

    def includes_list
-      [:retraining_runs, :retraining_job, dataset: [:columns, :features, :splitter]]
+      [:retraining_runs, :retraining_job, dataset: [:features, :splitter, columns: [:lineages]]]
    end

    def model_params
@@ -54,6 +54,8 @@
      return df unless anything?

      adapters.reduce(df) do |df, adapter|
+        next df if df.columns.exclude?(column.name)
+
        adapter.transform(df)
      end
    end
@@ -28,12 +28,11 @@
      if computed
        cols = column_list.computed
      else
-        cols = column_list.raw
+        cols = column_list
      end

      by_name = cols.index_by(&:name)
-      df.columns.each do |col|
-        column = by_name[col]
+      cols.each do |column|
        df = column.transform(df, inference: inference, computed: computed) if column
      end

@@ -232,20 +232,20 @@
      cleanup
      refresh_datasource!
      split_data
-      process_data
+      fit
    end

    def prepare
      prepare_features
      refresh_datasource
      split_data
-      process_data
+      fit
    end

    def actually_refresh
      refreshing do
-        learn(delete: false) # After syncing datasource, learn new statistics + sync columns
-        process_data
+        fit
+        normalize_all
        fully_reload
        learn
        learn_statistics(type: :processed) # After processing data, we learn any new statistics
@@ -385,6 +385,7 @@
    def unlock!
      Support::Lockable.unlock!(lock_key)
      features.each(&:unlock!)
+      true
    end

    def locked?
@@ -427,12 +428,6 @@
      (read_attribute(:statistics) || {}).with_indifferent_access
    end

-    def process_data
-      learn(delete: false)
-      fit
-      normalize_all
-    end
-
    def needs_learn?
      return true if columns_need_refresh?

@@ -483,7 +478,7 @@
      df = apply_missing_columns(df, inference: inference)
      df = columns.transform(df, inference: inference)
      df = apply_features(df, features)
-      df = columns.transform(df, inference: inference, computed: true)
+      df = columns.transform(df, inference: inference)
      df = apply_column_mask(df, inference: inference) unless all_columns
      df = drop_nulls(df) unless inference
      df, = processed.split_features_targets(df, true, target) if split_ys
@@ -722,6 +717,20 @@
      reload
    end

+    def list_nulls(input = nil, list_raw = false)
+      input = data(lazy: true) if input.nil?
+
+      case input
+      when Polars::DataFrame
+        input = input.lazy
+      when String, Symbol
+        input = input.to_sym
+        input = send(input).data(lazy: true)
+      end
+      col_list = EasyML::Data::DatasetManager.list_nulls(input)
+      list_raw ? col_list : regular_columns(col_list)
+    end
+
    private

    def apply_date_splitter_config
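
Dataset#list_nulls normalizes its input before delegating to DatasetManager.list_nulls: nil falls back to the dataset's own lazy frame, an eager Polars::DataFrame is made lazy, and a String or Symbol is treated as a split name fetched via send. A hedged usage sketch (record lookup and split name are illustrative):

    dataset = EasyML::Dataset.find(1)  # hypothetical record
    dataset.list_nulls                 # null-bearing columns in the default data
    dataset.list_nulls(:processed)     # same check against the processed split
    dataset.list_nulls(df, true)       # raw names, skipping the regular_columns mapping
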
@@ -798,6 +807,7 @@
        processed_df = normalize(df, all_columns: true)
        processed.save(segment, processed_df)
      end
+      features.select { |f| !f.fittable? }.each(&:after_transform)
      @normalized = true
    end

@@ -840,6 +850,7 @@
    end

    def fit
+      learn(delete: false)
      learn_statistics(type: :raw)
    end

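Together with the removal of process_data in the earlier hunk, the refresh sequence is now simply fit (which first syncs columns via learn(delete: false), then learns raw statistics), with actually_refresh pairing it with normalize_all. A sketch of the effective behavior (not verbatim source):

    fit            # learn(delete: false) + learn_statistics(type: :raw)
    normalize_all  # writes processed splits; transform-only features get after_transform
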
@@ -78,11 +78,18 @@
    scope :never_applied, -> { where(applied_at: nil) }
    scope :never_fit, -> do
      fittable = where(fit_at: nil)
-      fittable = fittable.select { |f| f.adapter.respond_to?(:fit) }
+      fittable = fittable.select(&:fittable?)
      where(id: fittable.map(&:id))
    end
    scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
-    scope :ready_to_apply, -> { where(needs_fit: false).where.not(id: has_changes.map(&:id)) }
+    scope :ready_to_apply, -> do
+      base = where(needs_fit: false).where.not(id: has_changes.map(&:id))
+      doesnt_fit = where_no_fit
+      where(id: base.map(&:id).concat(doesnt_fit.map(&:id)))
+    end
+
+    scope :fittable, -> { all.select(&:fittable?) }
+    scope :where_no_fit, -> { all.reject(&:fittable?) }

    before_save :apply_defaults, if: :new_record?
    before_save :update_sha
@@ -100,6 +107,10 @@
      feature_klass.present?
    end

+    def fittable?
+      adapter.respond_to?(:fit)
+    end
+
    def adapter
      @adapter ||= feature_klass.new
    end
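
fittable? gives the scopes above one definition of "this feature's adapter implements fit": never_fit and the new fittable / where_no_fit scopes filter on it, and ready_to_apply now also admits transform-only features that never need fitting. A small illustration (the adapter class is hypothetical):

    class SimpleTransform          # hypothetical transform-only feature adapter
      def transform(df)
        df                         # no fit method defined
      end
    end

    feature.fittable?              # => false when the adapter lacks #fit
    EasyML::Feature.where_no_fit   # transform-only features
    EasyML::Feature.ready_to_apply # now includes them without a prior fit
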
@@ -213,13 +224,14 @@
    end

    def wipe
+      update(needs_fit: true) if fittable?
      feature_store.wipe
    end

    def fit(features: [self], async: false)
      ordered_features = features.sort_by(&:feature_position)
      parent_batch_id = Random.uuid
-      jobs = ordered_features.map do |feature|
+      jobs = ordered_features.select(&:fittable?).map do |feature|
        feature.build_batches.map do |batch_args|
          batch_args.merge(parent_batch_id: parent_batch_id)
        end
@@ -450,7 +462,7 @@
    def after_fit
      update_sha

-      feature_store.compact
+      feature_store.compact if fittable?
      updates = {
        fit_at: Time.current,
        needs_fit: false,
@@ -459,6 +471,10 @@
      update!(updates)
    end

+    def after_transform
+      feature_store.compact if !fittable?
+    end
+
    def unlock!
      feature_store.unlock!
    end
@@ -517,14 +533,14 @@
      new_sha = compute_sha
      if new_sha != self.sha
        self.sha = new_sha
-        self.needs_fit = true
+        self.needs_fit = fittable?
      end
    end

    def update_from_feature_class
      if read_attribute(:batch_size) != config.dig(:batch_size)
        write_attribute(:batch_size, config.dig(:batch_size))
-        self.needs_fit = true
+        self.needs_fit = fittable?
      end

      if self.primary_key != config.dig(:primary_key)
@@ -179,17 +179,18 @@
    end

    def actually_train(&progress_block)
-      raise untrainable_error unless trainable?
-
      lock_model do
        run = pending_run
        run.wrap_training do
+          raise untrainable_error unless trainable?
+
          best_params = nil
          if run.should_tune?
            best_params = hyperparameter_search(&progress_block)
+          else
+            fit(&progress_block)
+            save
          end
-          fit(&progress_block)
-          save
          [self, best_params]
        end
        update(is_training: false)
@@ -393,6 +394,10 @@
      adapter.after_tuning
    end

+    def cleanup
+      adapter.cleanup
+    end
+
    def fit_in_batches(tuning: false, batch_size: nil, batch_overlap: nil, batch_key: nil, checkpoint_dir: Rails.root.join("tmp", "xgboost_checkpoints"), &progress_block)
      adapter.fit_in_batches(tuning: tuning, batch_size: batch_size, batch_overlap: batch_overlap, batch_key: batch_key, checkpoint_dir: checkpoint_dir, &progress_block)
    end
@@ -37,6 +37,20 @@
        max: 10,
        step: 0.1,
      },
+      scale_pos_weight: {
+        label: "Scale Pos Weight",
+        description: "Balance of positive and negative weights",
+        min: 0,
+        max: 200,
+        step: 1,
+      },
+      max_delta_step: {
+        label: "Max Delta Step",
+        description: "Maximum delta step",
+        min: 0,
+        max: 10,
+        step: 1,
+      },
      gamma: {
        label: "Gamma",
        description: "Minimum loss reduction required to make a further partition",
@@ -81,11 +95,13 @@
        label: "Histogram",
        description: "Fast histogram optimized approximate greedy algorithm",
      },
-      {
-        value: "gpu_hist",
-        label: "GPU Histogram",
-        description: "GPU implementation of hist algorithm",
-      },
+      # Only when compiled with GPU support...
+      # How to make this not a default option
+      # {
+      #   value: "gpu_hist",
+      #   label: "GPU Histogram",
+      #   description: "GPU implementation of hist algorithm",
+      # },
    ],
  },
)
@@ -103,7 +103,7 @@
      model.callbacks.detect { |cb| cb.class == Wandb::XGBoostCallback }
    end

-    def track_cumulative_feature_importance(finish = true)
+    def track_cumulative_feature_importance
      return unless @feature_importances

      project_name = model.adapter.get_wandb_project
@@ -127,13 +127,16 @@
        "feature_importance" => bar_plot.__pyptr__,
      }
      Wandb.log(log_data)
-      model.adapter.delete_wandb_project if finish
-      Wandb.finish if finish
    end

    def after_tuning
      track_cumulative_feature_importance
    end
+
+    def cleanup
+      model.adapter.delete_wandb_project
+      Wandb.finish
+    end
  end
 end
end
@@ -135,6 +135,12 @@
      end
    end

+    def cleanup
+      model.callbacks.each do |callback|
+        callback.cleanup if callback.respond_to?(:cleanup)
+      end
+    end
+
    def prepare_callbacks(tuner)
      set_wandb_project(tuner.project_name)

@@ -439,21 +445,18 @@
    end

    def untrainable_columns
-      df = model.dataset.processed.data(lazy: true)
-
-      columns = df.columns
-      selects = columns.map do |col|
-        Polars.col(col).null_count.alias(col)
-      end
-      null_info = df.select(selects).collect
-      null_info.to_hashes.first.compact
-      col_list = null_info.to_hashes.first.transform_values { |v| v > 0 ? v : nil }.compact.keys
+      model.dataset.refresh if model.dataset.processed.nil?

-      model.dataset.regular_columns(col_list)
+      model.dataset.list_nulls(
+        model.dataset.processed.data(lazy: true)
+      )
    end

    def preprocess(xs, ys = nil)
      return xs if xs.is_a?(::XGBoost::DMatrix)
+      lazy = xs.is_a?(Polars::LazyFrame)
+      return xs if (lazy ? xs.limit(1).collect : xs).shape[0] == 0
+
      weights_col = model.weights_column || nil

      if weights_col == model.dataset.target
@@ -463,10 +466,13 @@
      # Extract feature columns (all columns except label and weight)
      feature_cols = xs.columns
      feature_cols -= [weights_col] if weights_col
-      lazy = xs.is_a?(Polars::LazyFrame)

      # Get features, labels and weights
-      features = lazy ? xs.select(feature_cols).collect.to_numo : xs.select(feature_cols).to_numo
+      begin
+        features = lazy ? xs.select(feature_cols).collect.to_numo : xs.select(feature_cols).to_numo
+      rescue => e
+        binding.pry
+      end
      weights = weights_col ? (lazy ? xs.select(weights_col).collect.to_numo : xs.select(weights_col).to_numo) : nil
      weights = weights.flatten if weights
      if ys.present?
@@ -18,12 +18,22 @@
        end

        def defaults
-          {}
+          model.adapter.hyperparameters.class.hyperparameter_constants.transform_values do |constant|
+            values = constant.slice(:min, :max, :step, :options)
+            if values.key?(:options)
+              values[:options] = values[:options].map { |option| option[:value] }
+            end
+            values
+          end
        end

        def run_trial(trial)
          config = deep_merge_defaults(self.config.clone.deep_symbolize_keys)
-          suggest_parameters(trial, config)
+          # For first trial, re-use the original hyperparameters, so they
+          # serve as our starting point/imputers
+          unless trial == 1
+            suggest_parameters(trial, config)
+          end
          yield model
        end

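Rather than a hand-maintained hash (the XGBoost-specific defaults are deleted in a later hunk), the search space is now derived from the model adapter's hyperparameter constants: each entry is sliced down to min/max/step/options, and option hashes are flattened to their value fields. For the constants added above, the derived defaults would look roughly like this (illustrative shape, not verbatim output):

    {
      scale_pos_weight: { min: 0, max: 200, step: 1 },
      max_delta_step:   { min: 0, max: 10,  step: 1 },
      # an options-style constant becomes its list of values, e.g. ["hist", ...]
    }
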
@@ -57,8 +67,11 @@
          min = param_config[:min]
          max = param_config[:max]
          log = param_config[:log]
+          options = param_config[:options]

-          if log
+          if options
+            trial.suggest_categorical(param_name.to_s, options)
+          elsif log
            trial.suggest_loguniform(param_name.to_s, min, max)
          elsif max.is_a?(Integer) && min.is_a?(Integer)
            trial.suggest_int(param_name.to_s, min, max)
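
Because options is checked first, a parameter that carries a value list is always drawn categorically, even if it also declares min/max. A sketch of the flow for an options-bearing parameter (parameter name illustrative; suggest_categorical follows the Optuna API as bridged here):

    options = param_config[:options]                     # e.g. ["hist", ...] after flattening
    trial.suggest_categorical(param_name.to_s, options)  # one categorical draw per trial
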
@@ -5,23 +5,6 @@ module EasyML
    class Tuner
      module Adapters
        class XGBoostAdapter < BaseAdapter
-          def defaults
-            {
-              learning_rate: {
-                min: 0.001,
-                max: 0.1,
-                log: true,
-              },
-              n_estimators: {
-                min: 100,
-                max: 1_000,
-              },
-              max_depth: {
-                min: 2,
-                max: 20,
-              },
-            }
-          end
        end
      end
    end
@@ -108,7 +108,6 @@
        end
      end

-      model.after_tuning
      return nil if tuner_job.tuner_runs.all?(&:failed?)

      best_run = tuner_job.best_run
@@ -118,6 +117,13 @@
        status: :success,
        completed_at: Time.current,
      )
+      model.after_tuning
+      if best_run&.hyperparameters.present?
+        model.hyperparameters = best_run.hyperparameters
+        model.fit
+        model.save
+      end
+      model.cleanup

      best_run&.hyperparameters
    rescue StandardError => e
@@ -35,6 +35,18 @@

    private

+    def list_df_nulls(df)
+      df = df.lazy
+
+      columns = df.columns
+      selects = columns.map do |col|
+        Polars.col(col).null_count.alias(col)
+      end
+      null_info = df.select(selects).collect
+      null_info.to_hashes.first.compact
+      null_info.to_hashes.first.transform_values { |v| v > 0 ? v : nil }.compact.keys
+    end
+
    def apply_defaults(kwargs)
      options = kwargs.dup

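list_df_nulls builds a single lazy projection of per-column null counts, collects it once, and keeps the names whose counts are positive, so the scan costs one pass regardless of column count. An equivalent standalone sketch against the same polars-df calls:

    df = Polars::DataFrame.new({ "a" => [1, nil, 3], "b" => [1, 2, 3] }).lazy
    selects = df.columns.map { |col| Polars.col(col).null_count.alias(col) }
    counts = df.select(selects).collect.to_hashes.first  # => { "a" => 1, "b" => 0 }
    counts.select { |_col, n| n > 0 }.keys               # => ["a"]
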
@@ -1,4 +1,3 @@
-
 module EasyML
   module Data
     class DatasetManager
@@ -8,11 +7,17 @@
          return query_dataframes(lazy_frames, schema)
        end

+        def list_nulls
+          df = lazy_frames
+          list_df_nulls(df)
+        end
+
        def schema
          input.schema
        end

-        private
+        private
+
        def lazy_frames
          input.lazy
        end
@@ -20,4 +25,4 @@
        end
      end
    end
-end
+end
@@ -15,6 +15,11 @@
          return Batch.new(options, &block).query
        end

+        def list_nulls
+          df = dataframe.lazy
+          list_df_nulls(df)
+        end
+
        def schema
          @schema ||= files.any? ? Polars.read_parquet_schema(files.first) : nil
        end
@@ -17,12 +17,18 @@
        ).query
      end

-      def self.schema(input, **kwargs, &block)
+      def self.schema(input = nil, **kwargs, &block)
        adapter(input).new(
          kwargs.merge!(input: input), &block
        ).schema
      end

+      def self.list_nulls(input = nil, **kwargs, &block)
+        adapter(input).new(
+          kwargs.merge!(input: input), &block
+        ).list_nulls
+      end
+
      def self.files(dir)
        Dir.glob(::File.join(dir, "**/*.{parquet}"))
      end
@@ -44,13 +44,21 @@
        Reader.schema(input, **kwargs, &block)
      end

+      def list_nulls(input = nil, **kwargs, &block)
+        Reader.list_nulls(input, **kwargs, &block)
+      end
+
      def num_rows
        Reader.num_rows
      end
    end

-    def num_rows
-      Reader.num_rows(root_dir)
+    def list_nulls(input = nil, **kwargs, &block)
+      Reader.list_nulls(input, **kwargs, &block)
+    end
+
+    def num_rows(input = nil, **kwargs, &block)
+      Reader.num_rows(input, **kwargs, &block)
    end

    def query(input = nil, **kwargs, &block)
@@ -0,0 +1,56 @@
+module EasyML
+  module Data
+    class Embeddings
+      class Adapters
+        attr_accessor :model, :config
+
+        ADAPTERS = {
+          anthropic: Langchain::LLM::Anthropic,
+          gemini: Langchain::LLM::GoogleGemini,
+          openai: Langchain::LLM::OpenAI,
+          ollama: Langchain::LLM::Ollama,
+        }
+
+        DEFAULTS = {
+          api_key: {
+            anthropic: ENV["ANTHROPIC_API_KEY"],
+            gemini: ENV["GEMINI_API_KEY"],
+            openai: ENV["OPENAI_API_KEY"],
+            ollama: ENV["OLLAMA_API_KEY"],
+          },
+        }
+
+        def initialize(model, config = {})
+          @model = model.to_sym
+          @config = config.symbolize_keys
+          apply_defaults
+        end
+
+        def embed(df, col)
+          pick
+          texts = df[col].to_a
+          df = df.with_column(
+            embeddings: adapter.embed(text: texts),
+          )
+        end
+
+        private
+
+        def pick
+          @adapter ||= ADAPTERS[@model].new(config)
+          self
+        end
+
+        def apply_defaults
+          @config = @config.deep_symbolize_keys
+
+          DEFAULTS.each do |k, v|
+            unless @config.key?(k)
+              @config[k] = v[@model]
+            end
+          end
+        end
+      end
+    end
+  end
+end
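
Embeddings::Adapters maps a provider symbol to a Langchain.rb LLM client and falls back to the provider's conventional ENV var for the API key when none is configured. A hedged usage sketch (assumes the langchainrb gem is loaded and OPENAI_API_KEY is set; the column name is illustrative):

    adapters = EasyML::Data::Embeddings::Adapters.new(:openai)
    df = adapters.embed(df, "description")  # appends an embeddings column for the text
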
File without changes
@@ -0,0 +1,43 @@
+module EasyML
+  module Data
+    class Embeddings
+      COMPRESSION_DEFAULT = {
+        present: :balanced,
+      }
+
+      attr_reader :df, :column, :model, :adapter, :compression,
+                  :embeddings, :compressed_embeddings
+
+      def initialize(options = {})
+        @df = options[:df]
+        @column = options[:column]
+        @model = options[:model]
+        @config = options[:config] || {}
+        @compression = options[:compression] || COMPRESSION_DEFAULT
+      end
+
+      def create
+        embed
+        compress
+      end
+
+      def embed
+        @embeddings ||= adapter.embed(df, column)
+      end
+
+      def compress
+        @compressed_embeddings ||= compression_adapter.compress(embeddings)
+      end
+
+      private
+
+      def adapter
+        @adapter ||= EasyML::Data::Embeddings::Adapters.new(model, config)
+      end
+
+      def compression_adapter
+        @compression_adapter ||= EasyML::Data::Embeddings::Compression.new(compression)
+      end
+    end
+  end
+end
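
The Embeddings entry point wires an embedding adapter to a compression step: create memoizes the raw embeddings, then hands them to EasyML::Data::Embeddings::Compression. A sketch of the intended call shape (option keys from the initializer above; values illustrative):

    embeddings = EasyML::Data::Embeddings.new(
      df: df,                            # a Polars::DataFrame
      column: "description",             # text column to embed
      model: :openai,
      compression: { present: :balanced },
    )
    embeddings.create                    # embed, then compress
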
@@ -1,7 +1,7 @@
 # frozen_string_literal: true

 module EasyML
-  VERSION = "0.2.0-rc78"
+  VERSION = "0.2.0-rc81"

   module Version
   end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: easy_ml
 version: !ruby/object:Gem::Version
-  version: 0.2.0.pre.rc78
+  version: 0.2.0.pre.rc81
 platform: ruby
 authors:
 - Brett Shollenberger
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-02-18 00:00:00.000000000 Z
+date: 2025-02-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activerecord
@@ -719,6 +719,9 @@ files:
 - lib/easy_ml/data/dataset_manager/writer/partitioned.rb
 - lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb
 - lib/easy_ml/data/date_converter.rb
+- lib/easy_ml/data/embeddings.rb
+- lib/easy_ml/data/embeddings/adapters.rb
+- lib/easy_ml/data/embeddings/compression.rb
 - lib/easy_ml/data/partition.rb
 - lib/easy_ml/data/partition/boundaries.rb
 - lib/easy_ml/data/polars_column.rb