easy_ml 0.2.0.pre.rc85 → 0.2.0.pre.rc89

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/datasets_controller.rb +18 -2
  3. data/app/controllers/easy_ml/predictions_controller.rb +9 -1
  4. data/app/frontend/components/dataset/PreprocessingConfig.tsx +523 -150
  5. data/app/frontend/pages/DatasetsPage.tsx +0 -1
  6. data/app/frontend/types/dataset.ts +5 -2
  7. data/app/models/easy_ml/column/imputers/base.rb +23 -2
  8. data/app/models/easy_ml/column/imputers/embedding_encoder.rb +18 -0
  9. data/app/models/easy_ml/column/imputers/imputer.rb +1 -0
  10. data/app/models/easy_ml/column/imputers/most_frequent.rb +1 -1
  11. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +1 -1
  12. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -1
  13. data/app/models/easy_ml/column/imputers.rb +47 -41
  14. data/app/models/easy_ml/column/selector.rb +2 -2
  15. data/app/models/easy_ml/column.rb +260 -56
  16. data/app/models/easy_ml/column_history.rb +6 -0
  17. data/app/models/easy_ml/column_list.rb +30 -1
  18. data/app/models/easy_ml/dataset/learner/lazy/embedding.rb +10 -0
  19. data/app/models/easy_ml/dataset/learner/lazy/query.rb +2 -0
  20. data/app/models/easy_ml/dataset/learner.rb +11 -0
  21. data/app/models/easy_ml/dataset.rb +6 -19
  22. data/app/models/easy_ml/lineage_history.rb +17 -0
  23. data/app/models/easy_ml/model.rb +11 -1
  24. data/app/models/easy_ml/models/xgboost.rb +37 -7
  25. data/app/models/easy_ml/pca_model.rb +21 -0
  26. data/app/models/easy_ml/prediction.rb +2 -1
  27. data/app/serializers/easy_ml/column_serializer.rb +13 -1
  28. data/config/initializers/inflections.rb +1 -0
  29. data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +6 -8
  30. data/lib/easy_ml/data/dataset_manager/writer/base.rb +15 -2
  31. data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +0 -1
  32. data/lib/easy_ml/data/dataset_manager/writer.rb +2 -0
  33. data/lib/easy_ml/data/embeddings/compressor.rb +179 -0
  34. data/lib/easy_ml/data/embeddings/embedder.rb +226 -0
  35. data/lib/easy_ml/data/embeddings.rb +61 -0
  36. data/lib/easy_ml/data/polars_column.rb +3 -0
  37. data/lib/easy_ml/data/polars_reader.rb +54 -23
  38. data/lib/easy_ml/data/polars_schema.rb +28 -2
  39. data/lib/easy_ml/data/splits/file_split.rb +7 -2
  40. data/lib/easy_ml/data.rb +1 -0
  41. data/lib/easy_ml/embedding_store.rb +92 -0
  42. data/lib/easy_ml/engine.rb +4 -2
  43. data/lib/easy_ml/predict.rb +42 -20
  44. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +5 -0
  45. data/lib/easy_ml/railtie/templates/migration/add_is_primary_key_to_easy_ml_columns.rb.tt +9 -0
  46. data/lib/easy_ml/railtie/templates/migration/add_metadata_to_easy_ml_predictions.rb.tt +6 -0
  47. data/lib/easy_ml/railtie/templates/migration/add_pca_model_id_to_easy_ml_columns.rb.tt +9 -0
  48. data/lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_dataset_histories.rb.tt +13 -0
  49. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_pca_models.rb.tt +14 -0
  50. data/lib/easy_ml/version.rb +1 -1
  51. data/lib/easy_ml.rb +1 -0
  52. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  53. data/public/easy_ml/assets/assets/Application-DfPoyRr8.css +1 -0
  54. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js +533 -0
  55. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js.map +1 -0
  56. metadata +59 -6
  57. data/lib/tasks/profile.rake +0 -40
  58. data/public/easy_ml/assets/assets/Application-nnn_XLuL.css +0 -1
  59. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CD8voxfL.js +0 -522
  60. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CD8voxfL.js.map +0 -1
data/app/models/easy_ml/models/xgboost.rb

@@ -315,12 +315,12 @@ module EasyML
       end
     end
 
-    def predict(xs)
+    def predicting(xs, &block)
       raise "No trained model! Train a model before calling predict" unless @booster.present?
       raise "Cannot predict on nil — XGBoost" if xs.nil?
 
       begin
-        y_pred = @booster.predict(preprocess(xs))
+        y_pred = yield(preprocess(xs))
       rescue StandardError => e
         raise e unless e.message.match?(/Number of columns does not match/)
 
@@ -335,6 +335,12 @@ module EasyML
           #{xs.columns}
         )
       end
+    end
+
+    def predict(xs)
+      y_pred = predicting(xs) do |d_matrix|
+        @booster.predict(d_matrix)
+      end
 
       case task.to_sym
       when :classification
@@ -344,12 +350,12 @@ module EasyML
       end
     end
 
-    def predict_proba(data)
-      dmat = DMatrix.new(data)
-      y_pred = @booster.predict(dmat)
+    def predict_proba(xs)
+      y_pred = predicting(xs) do |d_matrix|
+        @booster.predict(d_matrix)
+      end
 
       if y_pred.first.is_a?(Array)
-        # multiple classes
        y_pred
       else
         y_pred.map { |v| [1 - v, v] }
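The change above extracts the shared guard clauses and error handling into `predicting`, which yields the preprocessed matrix to a caller-supplied block; `predict` and `predict_proba` now differ only in post-processing. A minimal standalone sketch of the pattern (hypothetical class, not the gem's code):

    class Predictor
      def initialize(booster)
        @booster = booster
      end

      # Shared wrapper: validate once, yield the preprocessed input
      def predicting(xs)
        raise "No trained model!" if @booster.nil?
        raise "Cannot predict on nil" if xs.nil?
        yield(preprocess(xs))
      end

      def predict(xs)
        predicting(xs) { |d_matrix| @booster.predict(d_matrix) }
      end

      def predict_proba(xs)
        y_pred = predicting(xs) { |d_matrix| @booster.predict(d_matrix) }
        y_pred.first.is_a?(Array) ? y_pred : y_pred.map { |v| [1 - v, v] }
      end

      private

      def preprocess(xs)
        xs # stand-in for DMatrix conversion
      end
    end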
data/app/models/easy_ml/models/xgboost.rb

@@ -452,6 +458,27 @@ module EasyML
       )
     end
 
+    def explode_embeddings(df)
+      embedding_cols = dataset.columns.where.not(hidden: true).select(&:embedded?)
+      # Create all extraction expressions at once
+      select_expressions = []
+
+      # Retain all non-embedding columns
+      base_cols = df.schema.keys - embedding_cols.map(&:embedding_column)
+      select_expressions << Polars.col(base_cols)
+
+      # Add all embedding extraction expressions
+      embedding_cols.each do |col|
+        dims = col.n_dimensions || 1
+        (0...dims).each do |i|
+          # Create a single expression that extracts one element
+          select_expressions << Polars.col(col.embedding_column).list.get(i).alias("#{col.embedding_column}_#{i}")
+        end
+      end
+
+      df.select(select_expressions)
+    end
+
     def preprocess(xs, ys = nil)
       return xs if xs.is_a?(::XGBoost::DMatrix)
       lazy = xs.is_a?(Polars::LazyFrame)
@@ -468,7 +495,10 @@ module EasyML
       feature_cols -= [weights_col] if weights_col
 
       # Get features, labels and weights
-      features = lazy ? xs.select(feature_cols).collect.to_numo : xs.select(feature_cols).to_numo
+      exploded = explode_embeddings(xs.select(feature_cols))
+      feature_cols = exploded.columns
+      features = lazy ? exploded.collect.to_numo : exploded.to_numo
+
       weights = weights_col ? (lazy ? xs.select(weights_col).collect.to_numo : xs.select(weights_col).to_numo) : nil
       weights = weights.flatten if weights
       if ys.present?
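`explode_embeddings` flattens each list-typed embedding column into one scalar column per dimension before the Numo conversion, since `to_numo` needs a purely scalar frame. A standalone sketch of the explode step (assumes the polars-df gem):

    require "polars-df"

    df = Polars::DataFrame.new({
      "id" => [1, 2],
      "embedding" => [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
    })

    # One scalar column per embedding dimension, mirroring explode_embeddings
    exprs = [Polars.col("id")]
    3.times do |i|
      exprs << Polars.col("embedding").list.get(i).alias("embedding_#{i}")
    end
    df.select(exprs)
    # => columns: id, embedding_0, embedding_1, embedding_2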
data/app/models/easy_ml/pca_model.rb

@@ -0,0 +1,21 @@
+# == Schema Information
+#
+# Table name: easy_ml_pca_models
+#
+#  id         :bigint           not null, primary key
+#  model      :binary           not null
+#  fit_at     :datetime
+#  created_at :datetime         not null
+#  updated_at :datetime         not null
+#
+module EasyML
+  class PCAModel < ActiveRecord::Base
+    def model
+      Marshal.load(read_attribute(:model))
+    end
+
+    def model=(model)
+      write_attribute(:model, Marshal.dump(model.dup))
+    end
+  end
+end
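`PCAModel` persists a fitted reducer as a marshaled binary blob, so a PCA fit at training time can be reloaded unchanged at inference time. A hedged usage sketch (assumes a fitted Rumale PCA and synthetic data):

    x = Numo::DFloat.new(100, 64).rand
    pca = Rumale::Decomposition::PCA.new(n_components: 8).fit(x)

    record = EasyML::PCAModel.create!(model: pca, fit_at: Time.current)
    record.reload.model.transform(x).shape  # => [100, 8]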
data/app/models/easy_ml/prediction.rb

@@ -11,6 +11,7 @@
 #  normalized_input :jsonb
 #  created_at       :datetime         not null
 #  updated_at       :datetime         not null
+#  metadata         :jsonb            not null
 #
 module EasyML
   class Prediction < ActiveRecord::Base
@@ -30,7 +31,7 @@ module EasyML
     end
 
     def probabilities
-      prediction_value["probabilities"]
+      metadata["probabilities"]
     end
 
     def regression?
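Probabilities move out of `prediction_value` into the new `metadata` jsonb column, keeping the raw prediction payload separate from auxiliary outputs. Illustrative shape (the value layout is an assumption):

    prediction = EasyML::Prediction.new(metadata: { "probabilities" => [0.2, 0.8] })
    prediction.probabilities  # => [0.2, 0.8]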
data/app/serializers/easy_ml/column_serializer.rb

@@ -27,13 +27,25 @@ module EasyML
     include JSONAPI::Serializer
 
     attributes :id, :name, :description, :dataset_id, :datatype, :polars_datatype, :preprocessing_steps,
-               :hidden, :drop_if_null, :sample_values, :statistics, :is_target,
+               :hidden, :drop_if_null, :sample_values, :is_target,
                :is_computed, :computed_by
 
     attribute :required do |object|
       object.required?
     end
 
+    attribute :statistics do |column|
+      if column.is_computed?
+        stats = column.statistics
+        {
+          raw: stats[:processed],
+          processed: stats[:processed],
+        }
+      else
+        column.statistics
+      end
+    end
+
     attribute :lineage do |column|
       column.lineages.map do |lineage|
         LineageSerializer.new(lineage).serializable_hash.dig(:data, :attributes)
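Computed columns have no raw (pre-computation) statistics, so the serializer mirrors the processed statistics into both slots rather than emitting an empty `raw` key. The transformation in isolation (stat keys illustrative):

    is_computed = true
    stats = { raw: nil, processed: { mean: 4.2, null_count: 0 } }
    is_computed ? { raw: stats[:processed], processed: stats[:processed] } : stats
    # => { raw: { mean: 4.2, null_count: 0 }, processed: { mean: 4.2, null_count: 0 } }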
data/config/initializers/inflections.rb

@@ -14,6 +14,7 @@ module EasyML
       inflect.acronym "HTML"
       inflect.acronym "API"
       inflect.acronym "APIs"
+      inflect.acronym "PCA"
     end
   end
 end
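Registering the acronym lets the Rails inflector map the new `PCAModel` class onto its file and table names. Assuming standard ActiveSupport behavior:

    "pca_model".camelize   # => "PCAModel" (would otherwise be "PcaModel")
    "PCAModel".underscore  # => "pca_model"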
data/lib/easy_ml/data/dataset_manager/writer/append_only.rb

@@ -13,21 +13,19 @@ module EasyML
         end
 
         def store
-          # If there are no existing files, just store as normal
+          @df = @df.unique(subset: [primary_key])
           return super if files.empty?
 
           # Get existing data lazily
-          existing_keys = query(lazy: true)
-            .select(primary_key)
-            .collect[primary_key]
-            .to_a
+          existing_keys = query(lazy: true).select(primary_key)
 
           # Convert input to lazy if it isn't already
           input_data = df.is_a?(Polars::LazyFrame) ? df : df.lazy
 
-          # Filter out records that already exist
-          new_records = input_data.filter(
-            Polars.col(primary_key).is_in(existing_keys).not_
+          new_records = input_data.join(
+            existing_keys,
+            on: primary_key,
+            how: "anti",
           )
 
           # If we have new records, store them
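The anti join replaces a materialized `is_in` filter, so deduplication against existing keys stays lazy end to end instead of collecting every primary key into a Ruby array first. A standalone sketch (assumes the polars-df gem):

    require "polars-df"

    existing = Polars::DataFrame.new({ "id" => [1, 2] }).lazy
    incoming = Polars::DataFrame.new({ "id" => [2, 3], "v" => ["b", "c"] }).lazy

    # Anti join keeps only incoming rows whose id is absent from existing
    incoming.join(existing, on: "id", how: "anti").collect
    # => one row: id=3, v="c"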
data/lib/easy_ml/data/dataset_manager/writer/base.rb

@@ -66,7 +66,20 @@ module EasyML
 
         def safe_write(df, path)
           FileUtils.mkdir_p(File.dirname(path))
-          df.is_a?(Polars::LazyFrame) ? df.sink_parquet(path) : df.write_parquet(path)
+          if df.is_a?(Polars::LazyFrame)
+            # Depending on the query plan, sometimes sink_parquet will throw an error...
+            # in this case we have to collect first and fall back to write_parquet
+            begin
+              # Try the faster sink_parquet first
+              df.sink_parquet(path)
+            rescue Polars::InvalidOperationError => e
+              # Fall back to collect.write_parquet
+              df.collect.write_parquet(path)
+            end
+          else
+            # Already a materialized DataFrame
+            df.write_parquet(path)
+          end
           path
         end
 
@@ -95,7 +108,7 @@ module EasyML
           keylist = unique_id_key(subdir: "keylist")
 
           acquire_lock(keylist) do |suo|
-            suo.client.sadd(keylist, key)
+            suo.client.sadd?(keylist, key)
           end
         end
 
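`sadd?` is redis-rb's boolean-returning variant; on redis-rb 5.x, plain `sadd` returns the integer count of members added, so call sites that treated its return value as a boolean need the `?` form. Sketch (assumes redis-rb >= 4.8):

    redis = Redis.new
    redis.sadd?("keylist", "key1")  # => true on first add, false if already present
    redis.sadd("keylist", "key2")   # => 1 (count of members actually added)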
data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb

@@ -65,7 +65,6 @@ module EasyML
           partition_df = df.filter(Polars.col(primary_key).is_between(partition_start, partition_end))
           num_rows = lazy? ? partition_df.select(Polars.length).collect[0, 0] : partition_df.shape[0]
 
-          binding.pry if num_rows == 0
           next if num_rows == 0
           yield partition_df, partition
         end
data/lib/easy_ml/data/dataset_manager/writer.rb

@@ -32,6 +32,8 @@ module EasyML
       end
 
       def store(df, *args)
+        return df if df.is_a?(Polars::LazyFrame) ? df.schema.empty? : df.empty?
+
         adapter_class.new(options.merge!(df: df)).store(*args)
       end
 
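The guard branches because the empty check differs by frame type: an eager DataFrame can report emptiness directly, while a LazyFrame would need a collect, so its zero-column schema serves as the cheap proxy. Sketch (assumes the polars-df gem):

    require "polars-df"

    df = Polars::DataFrame.new({})
    df.empty?              # => true (eager: checked directly)
    df.lazy.schema.empty?  # => true (lazy: checked via schema, no collect needed)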
data/lib/easy_ml/data/embeddings/compressor.rb

@@ -0,0 +1,179 @@
+module EasyML
+  module Data
+    class Embeddings
+      class Compressor
+        # Quality presets with their respective variance preservation targets
+        PRESETS = {
+          full: {
+            variance_target: 1.0,
+            description: "Preserves all information while reducing dimensions",
+          },
+          high_quality: {
+            variance_target: 0.95,
+            description: "Preserves 95% of information while reducing dimensions",
+          },
+          balanced: {
+            variance_target: 0.85,
+            description: "Balanced approach: 85% information preservation with substantial size reduction",
+          },
+          space_efficient: {
+            variance_target: 0.75,
+            description: "Maximizes storage savings while maintaining 75% of important information",
+          },
+        }
+
+        attr_reader :original_dimensions, :reduced_dimensions, :preserved_variance,
+                    :compression_ratio, :storage_savings, :preset_used
+        attr_accessor :preset, :dimensions, :column, :embedding_column, :fit, :pca_model
+
+        def initialize(config = {})
+          @preset = config.dig(:preset)
+          @dimensions = config.dig(:dimensions)
+
+          @preset = :full unless @preset || @dimensions
+          @pca_model = config.dig(:pca_model)
+          @original_dimensions = nil
+          @reduced_dimensions = nil
+          @preserved_variance = nil
+          @compression_ratio = nil
+          @storage_savings = nil
+          @preset_used = nil
+        end
+
+        def inspect
+          "#<#{self.class.name} original_dimensions=#{@original_dimensions}, reduced_dimensions=#{@reduced_dimensions}, preserved_variance=#{@preserved_variance}, compression_ratio=#{@compression_ratio}, storage_savings=#{@storage_savings}, preset_used=#{@preset_used}>"
+        end
+
+        # Right now, enabling OpenBLAS as the Numo::LinAlg backend causes
+        # memory issues with XGBoost due to conflicts with libomp.
+        # Since arm-based OSX doesn't have support for MKL, we have to fall back to
+        # a very slow matrix factorization implementation, which doesn't seem sustainable.
+        #
+        # One potential solution is to create an Accelerate backend for Numo::LinAlg,
+        # or to compile OpenBLAS with USE_OPENMP=0,
+        # but for now I'm just disabling compression support.
+        #
+        # http://pypackaging-native.github.io/key-issues/native-dependencies/blas_openmp/
+        #
+        COMPRESSION_ENABLED = false
+
+        def compress(df, column, embedding_column, fit: false)
+          # begin
+          #   result = actually_compress(df, column, embedding_column, fit: fit)
+          #   GC.start # This might allow us to clean up after OpenBLAS and fix the thread pool
+          # end
+
+          # result
+          return df unless COMPRESSION_ENABLED
+          actually_compress(df, column, embedding_column, fit: fit)
+        end
+
+        def actually_compress(df, column, embedding_column, fit: false)
+          @column = column
+          @embedding_column = embedding_column
+          @fit = fit
+
+          # Create a dataframe of unique texts and their embeddings
+          unique_df = df.select([column, embedding_column])
+                        .filter(Polars.col(column).is_not_null & (Polars.col(column) != ""))
+                        .unique
+
+          # Compress the unique embeddings
+          compressed_df = reduce_to_dimensions(unique_df, target_dimensions: dimensions)
+          compressed_df = compressed_df.with_columns(Polars.col(embedding_column).cast(df.schema[embedding_column]).alias(embedding_column))
+
+          df = df.drop(embedding_column)
+
+          # Join back to original dataframe to maintain all rows
+          df.join(compressed_df, on: column, how: "left")
+        end
+
+        # Reduce dimensions using a preset quality level
+        def reduce_with_preset(embeddings_df, preset: :balanced)
+          unless PRESETS.key?(preset)
+            raise ArgumentError, "Unknown preset: #{preset}. Available presets: #{PRESETS.keys.join(", ")}"
+          end
+
+          @preset_used = preset
+          target_variance = PRESETS[preset][:variance_target]
+
+          reduce_to_variance(embeddings_df, target_variance: target_variance)
+        end
+
+        # Reduce dimensions to a specific number
+        def reduce_to_dimensions(embeddings_df, target_dimensions:)
+          puts "reducing model dims..."
+          validate_input(embeddings_df)
+
+          # Convert embedding columns to Numo::NArray for Rumale
+          x = df_to_narray(embeddings_df, embedding_column)
+          @original_dimensions = x.shape[1]
+
+          if target_dimensions >= @original_dimensions
+            raise ArgumentError, "Target dimensions must be less than original dimensions"
+          end
+
+          # Initialize and fit PCA
+          if @pca_model.present?
+            transformed = @pca_model.transform(x)
+          else
+            @pca_model = Rumale::Decomposition::PCA.new(n_components: target_dimensions)
+            transformed = @pca_model.fit_transform(x)
+          end
+
+          # Create new dataframe with reduced embeddings
+          create_result_dataframe(embeddings_df, embedding_column, transformed)
+        end
+
+        # Reduce dimensions to preserve a target variance
+        def reduce_to_variance(embeddings_df, target_variance:)
+          validate_input(embeddings_df)
+
+          # Convert embedding columns to Numo::NArray for Rumale
+          x = df_to_narray(embeddings_df, embedding_column)
+
+          # Get original dimensions from the first embedding
+          @original_dimensions = x.shape[1]
+
+          # Calculate the target number of components based on variance preservation
+          target_components = (@original_dimensions * target_variance).ceil
+
+          # First fit PCA with all components to analyze variance
+          if @pca_model.present?
+            transformed = @pca_model.transform(x)
+          else
+            @pca_model = Rumale::Decomposition::PCA.new(n_components: target_components)
+            transformed = @pca_model.fit_transform(x)
+          end
+
+          # Create new dataframe with reduced embeddings
+          create_result_dataframe(embeddings_df, embedding_column, transformed)
+        end
+
+        private
+
+        def validate_input(df)
+          return if df.is_a?(Polars::DataFrame)
+
+          raise ArgumentError, "Input must be a Polars DataFrame"
+        end
+
+        def get_embedding_columns(df)
+          # Assumes embedding columns are numeric and have a pattern like 'embedding_0', 'embedding_1', etc.
+          # Adjust this logic if your embedding columns follow a different naming convention
+          df.columns.select { |col| col.match(/^embedding_\d+$/) || col.match(/^vector_\d+$/) }
+        end
+
+        def df_to_narray(df, embedding_column)
+          Numo::DFloat.cast(df[embedding_column].to_a)
+        end
+
+        def create_result_dataframe(original_df, embedding_column, transformed_data)
+          original_df.with_column(
+            Polars.lit(transformed_data).alias(embedding_column)
+          )
+        end
+      end
+    end
+  end
+end
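Underneath, the compressor is plain Rumale PCA over the embedding matrix: when a stored `pca_model` exists it is reused via `transform`, otherwise a new PCA is fit with `fit_transform`. A standalone sketch with synthetic data:

    require "rumale"
    require "numo/narray"

    x = Numo::DFloat.new(200, 64).rand  # 200 embeddings, 64 dimensions
    pca = Rumale::Decomposition::PCA.new(n_components: 8)

    pca.fit_transform(x).shape                          # => [200, 8] (first, fitting pass)
    pca.transform(Numo::DFloat.new(10, 64).rand).shape  # => [10, 8]  (reuses the fitted model)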
data/lib/easy_ml/data/embeddings/embedder.rb

@@ -0,0 +1,226 @@
+module EasyML
+  module Data
+    class Embeddings
+      class Embedder
+        attr_accessor :llm, :config, :adapter
+
+        # Provider-specific batch size recommendations
+        BATCH_SIZES = {
+          openai: 500,    # OpenAI allows up to 2048 items per batch, but 500 is recommended
+          anthropic: 100, # Conservative default for Anthropic
+          gemini: 100,    # Conservative default for Google's Gemini
+          ollama: 50,     # Local models typically have more limited batch sizes
+          default: 100,   # Default for any other provider
+        }
+
+        ADAPTERS = {
+          anthropic: Langchain::LLM::Anthropic,
+          gemini: Langchain::LLM::GoogleGemini,
+          openai: Langchain::LLM::OpenAI,
+          ollama: Langchain::LLM::Ollama,
+        }
+
+        DEFAULTS = {
+          api_key: {
+            anthropic: ENV["ANTHROPIC_API_KEY"],
+            gemini: ENV["GEMINI_API_KEY"],
+            openai: ENV["OPENAI_API_KEY"],
+            ollama: ENV["OLLAMA_API_KEY"],
+          },
+        }
+
+        def initialize(llm, config = {})
+          @llm = llm.to_sym
+          @config = config.symbolize_keys
+          apply_defaults
+        end
+
+        def embed(df, col, output_column)
+          pick
+
+          # Create a dataframe of unique texts and their embeddings
+          unique_df = df.select(col)
+                        .filter(Polars.col(col).is_not_null & (Polars.col(col) != ""))
+                        .unique
+
+          unique_texts = unique_df[col].to_a
+          unique_embeddings = batch_embed(unique_texts)
+
+          # Create a new dataframe with text-embedding pairs
+          embeddings_df = Polars::DataFrame.new(
+            { col => unique_texts, output_column => unique_embeddings }
+          )
+          embeddings_df = embeddings_df.with_columns(
+            Polars.col(col).cast(df.schema[col]).alias(col)
+          )
+
+          # Join the original dataframe with the embeddings
+          df = df.join(embeddings_df, on: col, how: "left")
+
+          if df.columns.include?("#{output_column}_right")
+            df = df.with_columns(
+              Polars.when(
+                Polars.col(output_column).is_null.not_
+              ).then(
+                Polars.col(output_column)
+              ).otherwise(
+                Polars.col("#{output_column}_right")
+              )
+            )
+            df = df.drop("#{output_column}_right")
+          end
+
+          df
+        end
+
+        private
+
+        def batch_embed(texts)
+          # Skip empty processing
+          return [] if texts.nil? || texts.empty?
+
+          # Filter out nil or empty strings
+          texts = texts.compact.reject(&:empty?)
+          return [] if texts.empty?
+
+          # Get batch size based on provider
+          batch_size = config[:batch_size] || BATCH_SIZES[@llm] || BATCH_SIZES[:default]
+
+          # Get parallel processing settings
+          parallel_processes = config[:parallel_processes] || 4
+          parallelism_mode = (config[:parallelism_mode] || :threads).to_sym
+
+          # Calculate optimal number of batches based on input size and processes
+          total_batches = (texts.size.to_f / batch_size).ceil
+          num_batches = [total_batches, parallel_processes].min
+          optimal_batch_size = (texts.size.to_f / num_batches).ceil
+
+          # Create batches based on the optimal batch size
+          batches = texts.each_slice(optimal_batch_size).to_a
+
+          parallel_processes = [parallel_processes, num_batches].min
+
+          # Process in parallel with appropriate error handling
+          all_embeddings = []
+
+          if parallel_processes > 1 && num_batches > 1
+            case parallelism_mode
+            when :threads
+              all_embeddings = Parallel.map(batches, in_threads: parallel_processes) do |batch|
+                with_retries { process_batch(batch) }
+              end
+            when :processes
+              all_embeddings = Parallel.map(batches, in_processes: parallel_processes) do |batch|
+                with_retries { process_batch(batch) }
+              end
+            else
+              raise ArgumentError, "parallelism_mode must be :threads or :processes"
+            end
+          else
+            # Sequential processing
+            batches.each do |batch|
+              all_embeddings << with_retries { process_batch(batch) }
+            end
+          end
+
+          # Flatten the results and return
+          all_embeddings.flatten(1)
+        end
+
+        def process_batch(batch)
+          response = adapter.embed(text: batch)
+          unpack(response)
+        end
+
+        def unpack(embeddings)
+          raw_response = embeddings.raw_response.deep_symbolize_keys
+          case llm.to_sym
+          when :openai
+            raw_response.dig(:data).map { |e| e[:embedding] }
+          else
+            embeddings
+          end
+        end
+
+        def with_retries(max_retries: 3, base_delay: 1, max_delay: 60)
+          retries = 0
+          begin
+            yield
+          rescue => e
+            retries += 1
+            if retries <= max_retries
+              # Exponential backoff with jitter
+              delay = [base_delay * (2 ** (retries - 1)) * (1 + rand * 0.1), max_delay].min
+              sleep(delay)
+              retry
+            else
+              raise e
+            end
+          end
+        end
+
+        # These options are pulled from Langchain
+        #
+        # default_options: {
+        #   embeddings_model_name: "text-embedding-3-small",
+        # },
+        def pick
+          @adapter ||= ADAPTERS[@llm].new(**config)
+          self
+        end
+
+        def apply_defaults
+          @config = @config.deep_symbolize_keys
+
+          DEFAULTS.each do |k, v|
+            unless @config.key?(k)
+              @config[k] = v[@llm]
+            end
+          end
+        end
+
+        def self.constants
+          {
+            providers: [
+              { value: "openai", label: "OpenAI" },
+              { value: "anthropic", label: "Anthropic" },
+              { value: "ollama", label: "Ollama (Local)" },
+            ],
+            models: {
+              openai: [
+                { value: "text-embedding-3-small", label: "text-embedding-3-small", dimensions: 1536 },
+                { value: "text-embedding-3-large", label: "text-embedding-3-large", dimensions: 3072 },
+                { value: "text-embedding-ada-002", label: "text-embedding-ada-002", dimensions: 1536 },
+              ],
+              anthropic: [
+                { value: "claude-3", label: "Claude 3", dimensions: 3072 },
+                { value: "claude-2", label: "Claude 2", dimensions: 1536 },
+              ],
+              ollama: [
+                { value: "llama2", label: "Llama 2", dimensions: 4096 },
+                { value: "mistral", label: "Mistral", dimensions: 4096 },
+                { value: "mixtral", label: "Mixtral", dimensions: 4096 },
+                { value: "nomic-embed-text", label: "Nomic Embed", dimensions: 768 },
+                { value: "starling-lm", label: "Starling", dimensions: 4096 },
+              ],
+            },
+            compression_presets: {
+              high_quality: {
+                description: "Preserves subtle relationships and nuanced meaning",
+                variance_target: 0.95,
+              },
+              balanced: {
+                description: "Good balance of quality and storage efficiency",
+                variance_target: 0.85,
+              },
+              storage_optimized: {
+                description: "Maximizes storage efficiency while maintaining core meaning",
+                variance_target: 0.75,
+              },
+            },
+          }
+        end
+      end
+    end
+  end
+end
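End to end, `Embedder` deduplicates the input texts, fans batches out across threads (or processes) with exponential-backoff retries, and left-joins the vectors back onto the original frame. A hedged usage sketch (hypothetical column names; assumes OPENAI_API_KEY is set):

    embedder = EasyML::Data::Embeddings::Embedder.new(:openai)

    df = Polars::DataFrame.new({
      "review" => ["great product", "arrived broken", "great product"],
    })

    # Only the two unique, non-empty texts are sent to the API
    df = embedder.embed(df, "review", "review_embedding")
    df["review_embedding"]  # one vector per row; duplicate texts share an embedding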