easy_ml 0.2.0.pre.rc84 → 0.2.0.pre.rc88
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/datasets_controller.rb +19 -3
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +523 -150
- data/app/frontend/types/dataset.ts +5 -2
- data/app/models/easy_ml/column/imputers/base.rb +23 -2
- data/app/models/easy_ml/column/imputers/embedding_encoder.rb +18 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +1 -0
- data/app/models/easy_ml/column/imputers/most_frequent.rb +1 -1
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +1 -1
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -1
- data/app/models/easy_ml/column/imputers.rb +47 -41
- data/app/models/easy_ml/column/selector.rb +2 -2
- data/app/models/easy_ml/column.rb +260 -56
- data/app/models/easy_ml/column_history.rb +6 -0
- data/app/models/easy_ml/column_list.rb +30 -1
- data/app/models/easy_ml/dataset/learner/lazy/embedding.rb +10 -0
- data/app/models/easy_ml/dataset/learner/lazy/query.rb +2 -0
- data/app/models/easy_ml/dataset/learner.rb +11 -0
- data/app/models/easy_ml/dataset.rb +6 -19
- data/app/models/easy_ml/lineage_history.rb +17 -0
- data/app/models/easy_ml/model.rb +11 -1
- data/app/models/easy_ml/models/xgboost.rb +37 -7
- data/app/models/easy_ml/pca_model.rb +21 -0
- data/app/models/easy_ml/prediction.rb +2 -1
- data/app/serializers/easy_ml/column_serializer.rb +13 -1
- data/config/initializers/inflections.rb +1 -0
- data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +6 -8
- data/lib/easy_ml/data/dataset_manager/writer/base.rb +15 -2
- data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +0 -1
- data/lib/easy_ml/data/dataset_manager/writer.rb +2 -0
- data/lib/easy_ml/data/embeddings/compressor.rb +179 -0
- data/lib/easy_ml/data/embeddings/embedder.rb +226 -0
- data/lib/easy_ml/data/embeddings.rb +61 -0
- data/lib/easy_ml/data/polars_column.rb +3 -0
- data/lib/easy_ml/data/polars_reader.rb +54 -23
- data/lib/easy_ml/data/polars_schema.rb +28 -2
- data/lib/easy_ml/data/splits/file_split.rb +7 -2
- data/lib/easy_ml/data.rb +1 -0
- data/lib/easy_ml/embedding_store.rb +92 -0
- data/lib/easy_ml/engine.rb +4 -2
- data/lib/easy_ml/predict.rb +42 -20
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +5 -0
- data/lib/easy_ml/railtie/templates/migration/add_is_primary_key_to_easy_ml_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_metadata_to_easy_ml_predictions.rb.tt +6 -0
- data/lib/easy_ml/railtie/templates/migration/add_pca_model_id_to_easy_ml_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_dataset_histories.rb.tt +13 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_pca_models.rb.tt +14 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/easy_ml.rb +1 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-DfPoyRr8.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js +533 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js.map +1 -0
- metadata +59 -6
- data/lib/tasks/profile.rake +0 -40
- data/public/easy_ml/assets/assets/Application-nnn_XLuL.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Bbf3mD_b.js +0 -522
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Bbf3mD_b.js.map +0 -1
data/app/models/easy_ml/models/xgboost.rb

@@ -315,12 +315,12 @@ module EasyML
         end
       end

-      def
+      def predicting(xs, &block)
         raise "No trained model! Train a model before calling predict" unless @booster.present?
         raise "Cannot predict on nil — XGBoost" if xs.nil?

         begin
-          y_pred =
+          y_pred = yield(preprocess(xs))
         rescue StandardError => e
           raise e unless e.message.match?(/Number of columns does not match/)
@@ -335,6 +335,12 @@ module EasyML
             #{xs.columns}
           )
         end
+      end
+
+      def predict(xs)
+        y_pred = predicting(xs) do |d_matrix|
+          @booster.predict(d_matrix)
+        end

         case task.to_sym
         when :classification
@@ -344,12 +350,12 @@ module EasyML
         end
       end

-      def predict_proba(
-
-
+      def predict_proba(xs)
+        y_pred = predicting(xs) do |d_matrix|
+          @booster.predict(d_matrix)
+        end

         if y_pred.first.is_a?(Array)
-          # multiple classes
           y_pred
         else
           y_pred.map { |v| [1 - v, v] }
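The predict/predict_proba refactor above extracts the shared guard clauses, preprocessing, and column-mismatch error handling into a single predicting helper that yields the prepared matrix to a block. A minimal standalone sketch of the pattern, with a stubbed booster and preprocess step (the names here are illustrative, not the gem's API):

    # Sketch: one wrapper owns the guards; each public method supplies
    # only the booster call it needs.
    class MiniModel
      def initialize(booster)
        @booster = booster
      end

      def predicting(xs)
        raise "No trained model!" if @booster.nil?
        raise "Cannot predict on nil" if xs.nil?
        yield(preprocess(xs))
      end

      def predict(xs)
        predicting(xs) { |d_matrix| @booster.predict(d_matrix) }
      end

      private

      def preprocess(xs)
        xs # stand-in for the DMatrix conversion
      end
    end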
@@ -452,6 +458,27 @@ module EasyML
           )
         end

+      def explode_embeddings(df)
+        embedding_cols = dataset.columns.where.not(hidden: true).select(&:embedded?)
+        # Create all extraction expressions at once
+        select_expressions = []
+
+        # Retain all non-embedding columns
+        base_cols = df.schema.keys - embedding_cols.map(&:embedding_column)
+        select_expressions << Polars.col(base_cols)
+
+        # Add all embedding extraction expressions
+        embedding_cols.each do |col|
+          dims = col.n_dimensions || 1
+          (0...dims).each do |i|
+            # Create a single expression that extracts one element
+            select_expressions << Polars.col(col.embedding_column).list.get(i).alias("#{col.embedding_column}_#{i}")
+          end
+        end
+
+        df.select(select_expressions)
+      end
+
       def preprocess(xs, ys = nil)
         return xs if xs.is_a?(::XGBoost::DMatrix)
         lazy = xs.is_a?(Polars::LazyFrame)
@@ -468,7 +495,10 @@ module EasyML
         feature_cols -= [weights_col] if weights_col

         # Get features, labels and weights
-
+        exploded = explode_embeddings(xs.select(feature_cols))
+        feature_cols = exploded.columns
+        features = lazy ? exploded.collect.to_numo : exploded.to_numo
+
         weights = weights_col ? (lazy ? xs.select(weights_col).collect.to_numo : xs.select(weights_col).to_numo) : nil
         weights = weights.flatten if weights
         if ys.present?
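XGBoost needs a flat numeric matrix, so preprocess now routes feature frames through explode_embeddings, which widens each list-typed embedding column into one scalar column per dimension before the to_numo conversion. A self-contained sketch of the same list.get expansion in polars-ruby (frame contents illustrative):

    require "polars"

    df = Polars::DataFrame.new(
      "id" => [1, 2],
      "embedding" => [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
    )

    # One expression per dimension: embedding_0, embedding_1, embedding_2
    dims = 3
    exprs = [Polars.col("id")] +
            (0...dims).map { |i| Polars.col("embedding").list.get(i).alias("embedding_#{i}") }

    puts df.select(exprs)  # 4 scalar columns, no list column left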
data/app/models/easy_ml/pca_model.rb

@@ -0,0 +1,21 @@
+# == Schema Information
+#
+# Table name: easy_ml_pca_models
+#
+#  id         :bigint           not null, primary key
+#  model      :binary           not null
+#  fit_at     :datetime
+#  created_at :datetime         not null
+#  updated_at :datetime         not null
+#
+module EasyML
+  class PCAModel < ActiveRecord::Base
+    def model
+      Marshal.load(read_attribute(:model))
+    end
+
+    def model=(model)
+      write_attribute(:model, Marshal.dump(model.dup))
+    end
+  end
+end
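PCAModel persists a fitted reducer by marshaling the Ruby object into the binary model column; reading the attribute deserializes it back, so the same projection can be reused at inference time. A minimal round-trip sketch outside ActiveRecord (the Rumale fit is illustrative):

    require "rumale"
    require "numo/narray"

    pca = Rumale::Decomposition::PCA.new(n_components: 2)
    pca.fit(Numo::DFloat.new(10, 5).rand)

    blob     = Marshal.dump(pca)   # what the setter writes to the binary column
    restored = Marshal.load(blob)  # what the getter returns

    restored.transform(Numo::DFloat.new(3, 5).rand)  # transforms new data, no refit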
data/app/models/easy_ml/prediction.rb

@@ -11,6 +11,7 @@
 #  normalized_input :jsonb
 #  created_at       :datetime         not null
 #  updated_at       :datetime         not null
+#  metadata         :jsonb            not null
 #
 module EasyML
   class Prediction < ActiveRecord::Base
@@ -30,7 +31,7 @@ module EasyML
     end

     def probabilities
-
+      metadata["probabilities"]
     end

     def regression?
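Per-class probabilities now ride along in the new metadata jsonb column instead of a dedicated field. A short usage sketch (values illustrative):

    prediction = EasyML::Prediction.new(
      metadata: { "probabilities" => [0.12, 0.88] }
    )
    prediction.probabilities  # => [0.12, 0.88]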
data/app/serializers/easy_ml/column_serializer.rb

@@ -27,13 +27,25 @@ module EasyML
     include JSONAPI::Serializer

     attributes :id, :name, :description, :dataset_id, :datatype, :polars_datatype, :preprocessing_steps,
-               :hidden, :drop_if_null, :sample_values, :
+               :hidden, :drop_if_null, :sample_values, :is_target,
                :is_computed, :computed_by

     attribute :required do |object|
       object.required?
     end

+    attribute :statistics do |column|
+      if column.is_computed?
+        stats = column.statistics
+        {
+          raw: stats[:processed],
+          processed: stats[:processed],
+        }
+      else
+        column.statistics
+      end
+    end
+
     attribute :lineage do |column|
       column.lineages.map do |lineage|
         LineageSerializer.new(lineage).serializable_hash.dig(:data, :attributes)
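For computed columns the serializer aliases the processed statistics into the raw slot, so API clients can rely on both keys being present. Roughly, the emitted statistics attribute looks like this (values illustrative):

    # Computed column: raw mirrors processed
    { raw: { mean: 4.2, null_count: 0 }, processed: { mean: 4.2, null_count: 0 } }

    # Any other column: column.statistics passes through unchanged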
data/lib/easy_ml/data/dataset_manager/writer/append_only.rb

@@ -13,21 +13,19 @@ module EasyML
       end

       def store
-
+        @df = @df.unique(subset: [primary_key])
         return super if files.empty?

         # Get existing data lazily
-        existing_keys = query(lazy: true)
-          .select(primary_key)
-          .collect[primary_key]
-          .to_a
+        existing_keys = query(lazy: true).select(primary_key)

         # Convert input to lazy if it isn't already
         input_data = df.is_a?(Polars::LazyFrame) ? df : df.lazy

-
-
-
+        new_records = input_data.join(
+          existing_keys,
+          on: primary_key,
+          how: "anti",
         )

         # If we have new records, store them
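The append-only writer now dedupes the incoming batch on its primary key and anti-joins it, lazily, against the keys already on disk, so only genuinely new rows are written. A self-contained sketch of the anti-join in polars-ruby (frame contents illustrative):

    require "polars"

    existing = Polars::DataFrame.new("id" => [1, 2, 3]).lazy
    incoming = Polars::DataFrame.new("id" => [2, 3, 4, 4], "v" => %w[b c d d]).lazy

    new_records = incoming
      .unique(subset: ["id"])                              # drop in-batch duplicates
      .join(existing.select("id"), on: "id", how: "anti")  # keep only unseen ids

    puts new_records.collect  # only id 4 survives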
data/lib/easy_ml/data/dataset_manager/writer/base.rb

@@ -66,7 +66,20 @@ module EasyML

       def safe_write(df, path)
         FileUtils.mkdir_p(File.dirname(path))
-        df.is_a?(Polars::LazyFrame)
+        if df.is_a?(Polars::LazyFrame)
+          # Depending on the query plan, sometimes sink_parquet will throw an error...
+          # in this case we have to collect first and fall back to write_parquet
+          begin
+            # Try the faster sink_parquet first
+            df.sink_parquet(path)
+          rescue Polars::InvalidOperationError => e
+            # Fall back to collect().write_parquet()
+            df.collect.write_parquet(path)
+          end
+        else
+          # Already a materialized DataFrame
+          df.write_parquet(path)
+        end
         path
       end
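sink_parquet streams a LazyFrame to disk without materializing it, but not every query plan can be sunk; the rescue falls back to collecting into memory and writing the materialized frame. The same try-fast-then-fallback shape in isolation (path illustrative):

    require "polars"

    def write_lazy(lf, path)
      lf.sink_parquet(path)           # streaming write, lower memory
    rescue Polars::InvalidOperationError
      lf.collect.write_parquet(path)  # materialize, then write
    end

    write_lazy(Polars::DataFrame.new("a" => [1, 2, 3]).lazy, "/tmp/example.parquet")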
@@ -95,7 +108,7 @@ module EasyML
         keylist = unique_id_key(subdir: "keylist")

         acquire_lock(keylist) do |suo|
-          suo.client.sadd(keylist, key)
+          suo.client.sadd?(keylist, key)
         end
       end
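Assuming suo.client here is a redis-rb connection, sadd? is the explicitly boolean variant of SADD added in recent redis-rb releases; plain sadd returns the integer count of newly added members, and its old boolean single-member behavior is deprecated. The difference in a sketch:

    redis.sadd("keylist", "a")   # => 1 (count of members actually added)
    redis.sadd?("keylist", "a")  # => false (already a member)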
data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb

@@ -65,7 +65,6 @@ module EasyML
         partition_df = df.filter(Polars.col(primary_key).is_between(partition_start, partition_end))
         num_rows = lazy? ? partition_df.select(Polars.length).collect[0, 0] : partition_df.shape[0]

-        binding.pry if num_rows == 0
         next if num_rows == 0
         yield partition_df, partition
       end
data/lib/easy_ml/data/embeddings/compressor.rb

@@ -0,0 +1,179 @@
+module EasyML
+  module Data
+    class Embeddings
+      class Compressor
+        # Quality presets with their respective variance preservation targets
+        PRESETS = {
+          full: {
+            variance_target: 1.0,
+            description: "Preserves all information while reducing dimensions",
+          },
+          high_quality: {
+            variance_target: 0.95,
+            description: "Preserves 95% of information while reducing dimensions",
+          },
+          balanced: {
+            variance_target: 0.85,
+            description: "Balanced approach: 85% information preservation with substantial size reduction",
+          },
+          space_efficient: {
+            variance_target: 0.75,
+            description: "Maximizes storage savings while maintaining 75% of important information",
+          },
+        }
+
+        attr_reader :original_dimensions, :reduced_dimensions, :preserved_variance,
+                    :compression_ratio, :storage_savings, :preset_used
+        attr_accessor :preset, :dimensions, :column, :embedding_column, :fit, :pca_model
+
+        def initialize(config = {})
+          @preset = config.dig(:preset)
+          @dimensions = config.dig(:dimensions)
+
+          @preset = :full unless @preset || @dimensions
+          @pca_model = config.dig(:pca_model)
+          @original_dimensions = nil
+          @reduced_dimensions = nil
+          @preserved_variance = nil
+          @compression_ratio = nil
+          @storage_savings = nil
+          @preset_used = nil
+        end
+
+        def inspect
+          "#<#{self.class.name} original_dimensions=#{@original_dimensions}, reduced_dimensions=#{@reduced_dimensions}, preserved_variance=#{@preserved_variance}, compression_ratio=#{@compression_ratio}, storage_savings=#{@storage_savings}, preset_used=#{@preset_used}>"
+        end
+
+        # Right now, enabling OpenBLAS as the Numo::LinAlg backend causes
+        # memory issues with XGBoost due to conflicts with libomp.
+        # Since arm-based OSX doesn't have support for MKL, we have to fall back to
+        # a very slow matrix factorization implementation, which doesn't seem sustainable.
+        #
+        # One potential solution is to create an Accelerate backend for Numo::LinAlg,
+        # or to compile OpenBLAS with USE_OPENMP=0,
+        # but for now I'm just disabling compression support.
+        #
+        # http://pypackaging-native.github.io/key-issues/native-dependencies/blas_openmp/
+        #
+        COMPRESSION_ENABLED = false
+
+        def compress(df, column, embedding_column, fit: false)
+          # begin
+          #   result = actually_compress(df, column, embedding_column, fit: fit)
+          #   GC.start # This might allow us to cleanup after OpenBLAS and fix the thread pool
+          # end
+
+          # result
+          return df unless COMPRESSION_ENABLED
+          actually_compress(df, column, embedding_column, fit: fit)
+        end
+
+        def actually_compress(df, column, embedding_column, fit: false)
+          @column = column
+          @embedding_column = embedding_column
+          @fit = fit
+
+          # Create a dataframe of unique texts and their embeddings
+          unique_df = df.select([column, embedding_column])
+                        .filter(Polars.col(column).is_not_null & (Polars.col(column) != ""))
+                        .unique
+
+          # Compress the unique embeddings
+          compressed_df = reduce_to_dimensions(unique_df, target_dimensions: dimensions)
+          compressed_df = compressed_df.with_columns(Polars.col(embedding_column).cast(df.schema[embedding_column]).alias(embedding_column))
+
+          df = df.drop(embedding_column)
+
+          # Join back to original dataframe to maintain all rows
+          df.join(compressed_df, on: column, how: "left")
+        end
+
+        # Reduce dimensions using a preset quality level
+        def reduce_with_preset(embeddings_df, preset: :balanced)
+          unless PRESETS.key?(preset)
+            raise ArgumentError, "Unknown preset: #{preset}. Available presets: #{PRESETS.keys.join(", ")}"
+          end
+
+          @preset_used = preset
+          target_variance = PRESETS[preset][:variance_target]
+
+          reduce_to_variance(embeddings_df, target_variance: target_variance)
+        end
+
+        # Reduce dimensions to a specific number
+        def reduce_to_dimensions(embeddings_df, target_dimensions:)
+          puts "reducing model dims..."
+          validate_input(embeddings_df)
+
+          # Convert embedding columns to Numo::NArray for Rumale
+          x = df_to_narray(embeddings_df, embedding_column)
+          @original_dimensions = x.shape[1]
+
+          if target_dimensions >= @original_dimensions
+            raise ArgumentError, "Target dimensions must be less than original dimensions"
+          end
+
+          # Initialize and fit PCA
+          if @pca_model.present?
+            transformed = @pca_model.transform(x)
+          else
+            @pca_model = Rumale::Decomposition::PCA.new(n_components: target_dimensions)
+            transformed = @pca_model.fit_transform(x)
+          end
+
+          # Create new dataframe with reduced embeddings
+          create_result_dataframe(embeddings_df, embedding_column, transformed)
+        end
+
+        # Reduce dimensions to preserve a target variance
+        def reduce_to_variance(embeddings_df, target_variance:)
+          validate_input(embeddings_df)
+
+          # Convert embedding columns to Numo::NArray for Rumale
+          x = df_to_narray(embeddings_df, embedding_column)
+
+          # Get original dimensions from the first embedding
+          @original_dimensions = x.shape[1]
+
+          # Calculate the target number of components based on variance preservation
+          target_components = (@original_dimensions * target_variance).ceil
+
+          # First fit PCA with all components to analyze variance
+          if @pca_model.present?
+            transformed = @pca_model.transform(x)
+          else
+            @pca_model = Rumale::Decomposition::PCA.new(n_components: target_components)
+            transformed = @pca_model.fit_transform(x)
+          end
+
+          # Create new dataframe with reduced embeddings
+          create_result_dataframe(embeddings_df, embedding_column, transformed)
+        end
+
+        private
+
+        def validate_input(df)
+          return if df.is_a?(Polars::DataFrame)
+
+          raise ArgumentError, "Input must be a Polars DataFrame"
+        end
+
+        def get_embedding_columns(df)
+          # Assumes embedding columns are numeric and have a pattern like 'embedding_0', 'embedding_1', etc.
+          # Adjust this logic if your embedding columns follow a different naming convention
+          df.columns.select { |col| col.match(/^embedding_\d+$/) || col.match(/^vector_\d+$/) }
+        end
+
+        def df_to_narray(df, embedding_column)
+          Numo::DFloat.cast(df[embedding_column].to_a)
+        end
+
+        def create_result_dataframe(original_df, embedding_column, transformed_data)
+          original_df.with_column(
+            Polars.lit(transformed_data).alias(embedding_column)
+          )
+        end
+      end
+    end
+  end
+end
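With COMPRESSION_ENABLED set to false, compress is currently a pass-through; when enabled, the reduction is a plain Rumale PCA fit over the embedding matrix, and the variance presets map to component counts by scaling the original dimensionality. A minimal sketch of that math (data illustrative):

    require "rumale"
    require "numo/narray"

    x = Numo::DFloat.new(200, 64).rand            # 200 embeddings, 64 dims each

    # :balanced preset: variance_target 0.85 => keep 85% of the dimensions
    target_components = (x.shape[1] * 0.85).ceil  # => 55

    pca = Rumale::Decomposition::PCA.new(n_components: target_components)
    reduced = pca.fit_transform(x)
    reduced.shape                                 # => [200, 55]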
data/lib/easy_ml/data/embeddings/embedder.rb

@@ -0,0 +1,226 @@
+module EasyML
+  module Data
+    class Embeddings
+      class Embedder
+        attr_accessor :llm, :config, :adapter
+
+        # Provider-specific batch size recommendations
+        BATCH_SIZES = {
+          openai: 500,    # OpenAI allows up to 2048 items per batch, but 500 is recommended
+          anthropic: 100, # Conservative default for Anthropic
+          gemini: 100,    # Conservative default for Google's Gemini
+          ollama: 50,     # Local models typically have more limited batch sizes
+          default: 100,   # Default for any other provider
+        }
+
+        ADAPTERS = {
+          anthropic: Langchain::LLM::Anthropic,
+          gemini: Langchain::LLM::GoogleGemini,
+          openai: Langchain::LLM::OpenAI,
+          ollama: Langchain::LLM::Ollama,
+        }
+
+        DEFAULTS = {
+          api_key: {
+            anthropic: ENV["ANTHROPIC_API_KEY"],
+            gemini: ENV["GEMINI_API_KEY"],
+            openai: ENV["OPENAI_API_KEY"],
+            ollama: ENV["OLLAMA_API_KEY"],
+          },
+        }
+
+        def initialize(llm, config = {})
+          @llm = llm.to_sym
+          @config = config.symbolize_keys
+          apply_defaults
+        end
+
+        def embed(df, col, output_column)
+          pick
+
+          # Create a dataframe of unique texts and their embeddings
+          unique_df = df.select(col)
+                        .filter(Polars.col(col).is_not_null & (Polars.col(col) != ""))
+                        .unique
+
+          unique_texts = unique_df[col].to_a
+          unique_embeddings = batch_embed(unique_texts)
+
+          # Create a new dataframe with text-embedding pairs
+          embeddings_df = Polars::DataFrame.new(
+            { col => unique_texts, output_column => unique_embeddings }
+          )
+          embeddings_df = embeddings_df.with_columns(
+            Polars.col(col).cast(df.schema[col]).alias(col)
+          )
+
+          # Join the original dataframe with the embeddings
+          df = df.join(embeddings_df, on: col, how: "left")
+
+          if df.columns.include?("#{output_column}_right")
+            df = df.with_columns(
+              Polars.when(
+                Polars.col(output_column).is_null.not_
+              ).then(
+                Polars.col(output_column)
+              ).otherwise(
+                Polars.col("#{output_column}_right")
+              )
+            )
+            df = df.drop("#{output_column}_right")
+          end
+
+          df
+        end
+
+        private
+
+        def batch_embed(texts)
+          # Skip empty processing
+          return [] if texts.nil? || texts.empty?
+
+          # Filter out nil or empty strings
+          texts = texts.compact.reject(&:empty?)
+          return [] if texts.empty?
+
+          # Get batch size based on provider
+          batch_size = config[:batch_size] || BATCH_SIZES[@llm] || BATCH_SIZES[:default]
+
+          # Get parallel processing settings
+          parallel_processes = config[:parallel_processes] || 4
+          parallelism_mode = (config[:parallelism_mode] || :threads).to_sym
+
+          # Calculate optimal number of batches based on input size and processes
+          total_batches = (texts.size.to_f / batch_size).ceil
+          num_batches = [total_batches, parallel_processes].min
+          optimal_batch_size = (texts.size.to_f / num_batches).ceil
+
+          # Create batches based on the optimal batch size
+          batches = texts.each_slice(optimal_batch_size).to_a
+
+          parallel_processes = [parallel_processes, num_batches].min
+
+          # Process in parallel with appropriate error handling
+          all_embeddings = []
+
+          if parallel_processes > 1 && num_batches > 1
+            case parallelism_mode
+            when :threads
+              all_embeddings = Parallel.map(batches, in_threads: parallel_processes) do |batch|
+                with_retries { process_batch(batch) }
+              end
+            when :processes
+              all_embeddings = Parallel.map(batches, in_processes: parallel_processes) do |batch|
+                with_retries { process_batch(batch) }
+              end
+            else
+              raise ArgumentError, "parallelism_mode must be :threads or :processes"
+            end
+          else
+            # Sequential processing
+            batches.each do |batch|
+              all_embeddings << with_retries { process_batch(batch) }
+            end
+          end
+
+          # Flatten the results and return
+          all_embeddings.flatten(1)
+        end
+
+        def process_batch(batch)
+          response = adapter.embed(text: batch)
+          unpack(response)
+        end
+
+        def unpack(embeddings)
+          raw_response = embeddings.raw_response.deep_symbolize_keys
+          case llm.to_sym
+          when :openai
+            raw_response.dig(:data).map { |e| e[:embedding] }
+          else
+            embeddings
+          end
+        end
+
+        def with_retries(max_retries: 3, base_delay: 1, max_delay: 60)
+          retries = 0
+          begin
+            yield
+          rescue => e
+            retries += 1
+            if retries <= max_retries
+              # Exponential backoff with jitter
+              delay = [base_delay * (2 ** (retries - 1)) * (1 + rand * 0.1), max_delay].min
+              sleep(delay)
+              retry
+            else
+              raise e
+            end
+          end
+        end
+
+        # These options are pulled from Langchain
+        #
+        # default_options: {
+        #   embeddings_model_name: "text-embedding-3-small",
+        # },
+        def pick
+          @adapter ||= ADAPTERS[@llm].new(**config)
+          self
+        end
+
+        def apply_defaults
+          @config = @config.deep_symbolize_keys
+
+          DEFAULTS.each do |k, v|
+            unless @config.key?(k)
+              @config[k] = v[@llm]
+            end
+          end
+        end
+
+        def self.constants
+          {
+            providers: [
+              { value: "openai", label: "OpenAI" },
+              { value: "anthropic", label: "Anthropic" },
+              { value: "ollama", label: "Ollama (Local)" },
+            ],
+            models: {
+              openai: [
+                { value: "text-embedding-3-small", label: "text-embedding-3-small", dimensions: 1536 },
+                { value: "text-embedding-3-large", label: "text-embedding-3-large", dimensions: 3072 },
+                { value: "text-embedding-ada-002", label: "text-embedding-ada-002", dimensions: 1536 },
+              ],
+              anthropic: [
+                { value: "claude-3", label: "Claude 3", dimensions: 3072 },
+                { value: "claude-2", label: "Claude 2", dimensions: 1536 },
+              ],
+              ollama: [
+                { value: "llama2", label: "Llama 2", dimensions: 4096 },
+                { value: "mistral", label: "Mistral", dimensions: 4096 },
+                { value: "mixtral", label: "Mixtral", dimensions: 4096 },
+                { value: "nomic-embed-text", label: "Nomic Embed", dimensions: 768 },
+                { value: "starling-lm", label: "Starling", dimensions: 4096 },
+              ],
+            },
+            compression_presets: {
+              high_quality: {
+                description: "Preserves subtle relationships and nuanced meaning",
+                variance_target: 0.95,
+              },
+              balanced: {
+                description: "Good balance of quality and storage efficiency",
+                variance_target: 0.85,
+              },
+              storage_optimized: {
+                description: "Maximizes storage efficiency while maintaining core meaning",
+                variance_target: 0.75,
+              },
+            },
+          }
+        end
+      end
+    end
+  end
+end
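End to end, the embedder dedupes the text column, embeds each unique value once (in parallel batches with retry and backoff), and left-joins the vectors back onto every row. A hedged usage sketch, assuming OPENAI_API_KEY is set so the api_key default applies (frame and column names illustrative):

    require "polars"

    df = Polars::DataFrame.new(
      "review" => ["great product", "arrived broken", "great product"]
    )

    embedder = EasyML::Data::Embeddings::Embedder.new(:openai)

    # "great product" is embedded once, then joined back onto both rows
    df = embedder.embed(df, "review", "review_embedding")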