easy_ml 0.2.0.pre.rc85 → 0.2.0.pre.rc89

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/datasets_controller.rb +18 -2
  3. data/app/controllers/easy_ml/predictions_controller.rb +9 -1
  4. data/app/frontend/components/dataset/PreprocessingConfig.tsx +523 -150
  5. data/app/frontend/pages/DatasetsPage.tsx +0 -1
  6. data/app/frontend/types/dataset.ts +5 -2
  7. data/app/models/easy_ml/column/imputers/base.rb +23 -2
  8. data/app/models/easy_ml/column/imputers/embedding_encoder.rb +18 -0
  9. data/app/models/easy_ml/column/imputers/imputer.rb +1 -0
  10. data/app/models/easy_ml/column/imputers/most_frequent.rb +1 -1
  11. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +1 -1
  12. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -1
  13. data/app/models/easy_ml/column/imputers.rb +47 -41
  14. data/app/models/easy_ml/column/selector.rb +2 -2
  15. data/app/models/easy_ml/column.rb +260 -56
  16. data/app/models/easy_ml/column_history.rb +6 -0
  17. data/app/models/easy_ml/column_list.rb +30 -1
  18. data/app/models/easy_ml/dataset/learner/lazy/embedding.rb +10 -0
  19. data/app/models/easy_ml/dataset/learner/lazy/query.rb +2 -0
  20. data/app/models/easy_ml/dataset/learner.rb +11 -0
  21. data/app/models/easy_ml/dataset.rb +6 -19
  22. data/app/models/easy_ml/lineage_history.rb +17 -0
  23. data/app/models/easy_ml/model.rb +11 -1
  24. data/app/models/easy_ml/models/xgboost.rb +37 -7
  25. data/app/models/easy_ml/pca_model.rb +21 -0
  26. data/app/models/easy_ml/prediction.rb +2 -1
  27. data/app/serializers/easy_ml/column_serializer.rb +13 -1
  28. data/config/initializers/inflections.rb +1 -0
  29. data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +6 -8
  30. data/lib/easy_ml/data/dataset_manager/writer/base.rb +15 -2
  31. data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +0 -1
  32. data/lib/easy_ml/data/dataset_manager/writer.rb +2 -0
  33. data/lib/easy_ml/data/embeddings/compressor.rb +179 -0
  34. data/lib/easy_ml/data/embeddings/embedder.rb +226 -0
  35. data/lib/easy_ml/data/embeddings.rb +61 -0
  36. data/lib/easy_ml/data/polars_column.rb +3 -0
  37. data/lib/easy_ml/data/polars_reader.rb +54 -23
  38. data/lib/easy_ml/data/polars_schema.rb +28 -2
  39. data/lib/easy_ml/data/splits/file_split.rb +7 -2
  40. data/lib/easy_ml/data.rb +1 -0
  41. data/lib/easy_ml/embedding_store.rb +92 -0
  42. data/lib/easy_ml/engine.rb +4 -2
  43. data/lib/easy_ml/predict.rb +42 -20
  44. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +5 -0
  45. data/lib/easy_ml/railtie/templates/migration/add_is_primary_key_to_easy_ml_columns.rb.tt +9 -0
  46. data/lib/easy_ml/railtie/templates/migration/add_metadata_to_easy_ml_predictions.rb.tt +6 -0
  47. data/lib/easy_ml/railtie/templates/migration/add_pca_model_id_to_easy_ml_columns.rb.tt +9 -0
  48. data/lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_dataset_histories.rb.tt +13 -0
  49. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_pca_models.rb.tt +14 -0
  50. data/lib/easy_ml/version.rb +1 -1
  51. data/lib/easy_ml.rb +1 -0
  52. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  53. data/public/easy_ml/assets/assets/Application-DfPoyRr8.css +1 -0
  54. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js +533 -0
  55. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js.map +1 -0
  56. metadata +59 -6
  57. data/lib/tasks/profile.rake +0 -40
  58. data/public/easy_ml/assets/assets/Application-nnn_XLuL.css +0 -1
  59. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CD8voxfL.js +0 -522
  60. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CD8voxfL.js.map +0 -1
data/app/models/easy_ml/models/xgboost.rb

@@ -315,12 +315,12 @@ module EasyML
       end
     end
 
-    def predict(xs)
+    def predicting(xs, &block)
       raise "No trained model! Train a model before calling predict" unless @booster.present?
       raise "Cannot predict on nil — XGBoost" if xs.nil?
 
       begin
-        y_pred = @booster.predict(preprocess(xs))
+        y_pred = yield(preprocess(xs))
       rescue StandardError => e
         raise e unless e.message.match?(/Number of columns does not match/)
 
@@ -335,6 +335,12 @@ module EasyML
           #{xs.columns}
         )
       end
+    end
+
+    def predict(xs)
+      y_pred = predicting(xs) do |d_matrix|
+        @booster.predict(d_matrix)
+      end
 
       case task.to_sym
       when :classification
@@ -344,12 +350,12 @@ module EasyML
       end
     end
 
-    def predict_proba(data)
-      dmat = DMatrix.new(data)
-      y_pred = @booster.predict(dmat)
+    def predict_proba(xs)
+      y_pred = predicting(xs) do |d_matrix|
+        @booster.predict(d_matrix)
+      end
 
       if y_pred.first.is_a?(Array)
-        # multiple classes
        y_pred
       else
         y_pred.map { |v| [1 - v, v] }
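The change above extracts the shared guard clauses and error handling into `predicting`, which yields the preprocessed matrix to a caller-supplied block; `predict` and `predict_proba` now differ only in post-processing. A minimal standalone sketch of the pattern (hypothetical class, not the gem's code):

    class Predictor
      def initialize(booster)
        @booster = booster
      end

      # Shared wrapper: validate once, yield the preprocessed input
      def predicting(xs)
        raise "No trained model!" if @booster.nil?
        raise "Cannot predict on nil" if xs.nil?
        yield(preprocess(xs))
      end

      def predict(xs)
        predicting(xs) { |d_matrix| @booster.predict(d_matrix) }
      end

      def predict_proba(xs)
        y_pred = predicting(xs) { |d_matrix| @booster.predict(d_matrix) }
        y_pred.first.is_a?(Array) ? y_pred : y_pred.map { |v| [1 - v, v] }
      end

      private

      def preprocess(xs)
        xs # stand-in for DMatrix conversion
      end
    end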
data/app/models/easy_ml/models/xgboost.rb

@@ -452,6 +458,27 @@ module EasyML
       )
     end
 
+    def explode_embeddings(df)
+      embedding_cols = dataset.columns.where.not(hidden: true).select(&:embedded?)
+      # Create all extraction expressions at once
+      select_expressions = []
+
+      # Retain all non-embedding columns
+      base_cols = df.schema.keys - embedding_cols.map(&:embedding_column)
+      select_expressions << Polars.col(base_cols)
+
+      # Add all embedding extraction expressions
+      embedding_cols.each do |col|
+        dims = col.n_dimensions || 1
+        (0...dims).each do |i|
+          # Create a single expression that extracts one element
+          select_expressions << Polars.col(col.embedding_column).list.get(i).alias("#{col.embedding_column}_#{i}")
+        end
+      end
+
+      df.select(select_expressions)
+    end
+
     def preprocess(xs, ys = nil)
       return xs if xs.is_a?(::XGBoost::DMatrix)
       lazy = xs.is_a?(Polars::LazyFrame)
@@ -468,7 +495,10 @@ module EasyML
       feature_cols -= [weights_col] if weights_col
 
       # Get features, labels and weights
-      features = lazy ? xs.select(feature_cols).collect.to_numo : xs.select(feature_cols).to_numo
+      exploded = explode_embeddings(xs.select(feature_cols))
+      feature_cols = exploded.columns
+      features = lazy ? exploded.collect.to_numo : exploded.to_numo
+
       weights = weights_col ? (lazy ? xs.select(weights_col).collect.to_numo : xs.select(weights_col).to_numo) : nil
       weights = weights.flatten if weights
       if ys.present?
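`explode_embeddings` flattens each list-typed embedding column into one scalar column per dimension before the Numo conversion, since `to_numo` needs a purely scalar frame. A standalone sketch of the explode step (assumes the polars-df gem):

    require "polars-df"

    df = Polars::DataFrame.new({
      "id" => [1, 2],
      "embedding" => [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
    })

    # One scalar column per embedding dimension, mirroring explode_embeddings
    exprs = [Polars.col("id")]
    3.times do |i|
      exprs << Polars.col("embedding").list.get(i).alias("embedding_#{i}")
    end
    df.select(exprs)
    # => columns: id, embedding_0, embedding_1, embedding_2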
data/app/models/easy_ml/pca_model.rb

@@ -0,0 +1,21 @@
+# == Schema Information
+#
+# Table name: easy_ml_pca_models
+#
+#  id         :bigint           not null, primary key
+#  model      :binary           not null
+#  fit_at     :datetime
+#  created_at :datetime         not null
+#  updated_at :datetime         not null
+#
+module EasyML
+  class PCAModel < ActiveRecord::Base
+    def model
+      Marshal.load(read_attribute(:model))
+    end
+
+    def model=(model)
+      write_attribute(:model, Marshal.dump(model.dup))
+    end
+  end
+end
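`PCAModel` persists a fitted reducer as a marshaled binary blob, so a PCA fit at training time can be reloaded unchanged at inference time. A hedged usage sketch (assumes a fitted Rumale PCA and synthetic data):

    x = Numo::DFloat.new(100, 64).rand
    pca = Rumale::Decomposition::PCA.new(n_components: 8).fit(x)

    record = EasyML::PCAModel.create!(model: pca, fit_at: Time.current)
    record.reload.model.transform(x).shape  # => [100, 8]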
data/app/models/easy_ml/prediction.rb

@@ -11,6 +11,7 @@
 #  normalized_input :jsonb
 #  created_at       :datetime         not null
 #  updated_at       :datetime         not null
+#  metadata         :jsonb            not null
 #
 module EasyML
   class Prediction < ActiveRecord::Base
@@ -30,7 +31,7 @@ module EasyML
     end
 
     def probabilities
-      prediction_value["probabilities"]
+      metadata["probabilities"]
     end
 
     def regression?
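Probabilities move out of `prediction_value` into the new `metadata` jsonb column, keeping the raw prediction payload separate from auxiliary outputs. Illustrative shape (the value layout is an assumption):

    prediction = EasyML::Prediction.new(metadata: { "probabilities" => [0.2, 0.8] })
    prediction.probabilities  # => [0.2, 0.8]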
data/app/serializers/easy_ml/column_serializer.rb

@@ -27,13 +27,25 @@ module EasyML
     include JSONAPI::Serializer
 
     attributes :id, :name, :description, :dataset_id, :datatype, :polars_datatype, :preprocessing_steps,
-               :hidden, :drop_if_null, :sample_values, :statistics, :is_target,
+               :hidden, :drop_if_null, :sample_values, :is_target,
                :is_computed, :computed_by
 
     attribute :required do |object|
       object.required?
     end
 
+    attribute :statistics do |column|
+      if column.is_computed?
+        stats = column.statistics
+        {
+          raw: stats[:processed],
+          processed: stats[:processed],
+        }
+      else
+        column.statistics
+      end
+    end
+
     attribute :lineage do |column|
       column.lineages.map do |lineage|
         LineageSerializer.new(lineage).serializable_hash.dig(:data, :attributes)
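Computed columns have no raw (pre-computation) statistics, so the serializer mirrors the processed statistics into both slots rather than emitting an empty `raw` key. The transformation in isolation (stat keys illustrative):

    is_computed = true
    stats = { raw: nil, processed: { mean: 4.2, null_count: 0 } }
    is_computed ? { raw: stats[:processed], processed: stats[:processed] } : stats
    # => { raw: { mean: 4.2, null_count: 0 }, processed: { mean: 4.2, null_count: 0 } }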
data/config/initializers/inflections.rb

@@ -14,6 +14,7 @@ module EasyML
       inflect.acronym "HTML"
       inflect.acronym "API"
       inflect.acronym "APIs"
+      inflect.acronym "PCA"
     end
   end
 end
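Registering the acronym lets the Rails inflector map the new `PCAModel` class onto its file and table names. Assuming standard ActiveSupport behavior:

    "pca_model".camelize   # => "PCAModel" (would otherwise be "PcaModel")
    "PCAModel".underscore  # => "pca_model"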
data/lib/easy_ml/data/dataset_manager/writer/append_only.rb

@@ -13,21 +13,19 @@ module EasyML
         end
 
         def store
-          # If there are no existing files, just store as normal
+          @df = @df.unique(subset: [primary_key])
           return super if files.empty?
 
           # Get existing data lazily
-          existing_keys = query(lazy: true)
-            .select(primary_key)
-            .collect[primary_key]
-            .to_a
+          existing_keys = query(lazy: true).select(primary_key)
 
           # Convert input to lazy if it isn't already
           input_data = df.is_a?(Polars::LazyFrame) ? df : df.lazy
 
-          # Filter out records that already exist
-          new_records = input_data.filter(
-            Polars.col(primary_key).is_in(existing_keys).not_
+          new_records = input_data.join(
+            existing_keys,
+            on: primary_key,
+            how: "anti",
           )
 
           # If we have new records, store them
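The anti join replaces a materialized `is_in` filter, so deduplication against existing keys stays lazy end to end instead of collecting every primary key into a Ruby array first. A standalone sketch (assumes the polars-df gem):

    require "polars-df"

    existing = Polars::DataFrame.new({ "id" => [1, 2] }).lazy
    incoming = Polars::DataFrame.new({ "id" => [2, 3], "v" => ["b", "c"] }).lazy

    # Anti join keeps only incoming rows whose id is absent from existing
    incoming.join(existing, on: "id", how: "anti").collect
    # => one row: id=3, v="c"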
data/lib/easy_ml/data/dataset_manager/writer/base.rb

@@ -66,7 +66,20 @@ module EasyML
 
         def safe_write(df, path)
           FileUtils.mkdir_p(File.dirname(path))
-          df.is_a?(Polars::LazyFrame) ? df.sink_parquet(path) : df.write_parquet(path)
+          if df.is_a?(Polars::LazyFrame)
+            # Depending on the query plan, sometimes sink_parquet will throw an error...
+            # in this case we have to collect first and fall back to write_parquet
+            begin
+              # Try the faster sink_parquet first
+              df.sink_parquet(path)
+            rescue Polars::InvalidOperationError => e
+              # Fall back to collect.write_parquet
+              df.collect.write_parquet(path)
+            end
+          else
+            # Already a materialized DataFrame
+            df.write_parquet(path)
+          end
           path
         end
 
@@ -95,7 +108,7 @@ module EasyML
           keylist = unique_id_key(subdir: "keylist")
 
           acquire_lock(keylist) do |suo|
-            suo.client.sadd(keylist, key)
+            suo.client.sadd?(keylist, key)
           end
         end
 
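`sadd?` is redis-rb's boolean-returning variant; on redis-rb 5.x, plain `sadd` returns the integer count of members added, so call sites that treated its return value as a boolean need the `?` form. Sketch (assumes redis-rb >= 4.8):

    redis = Redis.new
    redis.sadd?("keylist", "key1")  # => true on first add, false if already present
    redis.sadd("keylist", "key2")   # => 1 (count of members actually added)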
data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb

@@ -65,7 +65,6 @@ module EasyML
           partition_df = df.filter(Polars.col(primary_key).is_between(partition_start, partition_end))
           num_rows = lazy? ? partition_df.select(Polars.length).collect[0, 0] : partition_df.shape[0]
 
-          binding.pry if num_rows == 0
           next if num_rows == 0
           yield partition_df, partition
         end
data/lib/easy_ml/data/dataset_manager/writer.rb

@@ -32,6 +32,8 @@ module EasyML
       end
 
       def store(df, *args)
+        return df if df.is_a?(Polars::LazyFrame) ? df.schema.empty? : df.empty?
+
         adapter_class.new(options.merge!(df: df)).store(*args)
       end
 
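The guard branches because the empty check differs by frame type: an eager DataFrame can report emptiness directly, while a LazyFrame would need a collect, so its zero-column schema serves as the cheap proxy. Sketch (assumes the polars-df gem):

    require "polars-df"

    df = Polars::DataFrame.new({})
    df.empty?              # => true (eager: checked directly)
    df.lazy.schema.empty?  # => true (lazy: checked via schema, no collect needed)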
data/lib/easy_ml/data/embeddings/compressor.rb

@@ -0,0 +1,179 @@
+module EasyML
+  module Data
+    class Embeddings
+      class Compressor
+        # Quality presets with their respective variance preservation targets
+        PRESETS = {
+          full: {
+            variance_target: 1.0,
+            description: "Preserves all information while reducing dimensions",
+          },
+          high_quality: {
+            variance_target: 0.95,
+            description: "Preserves 95% of information while reducing dimensions",
+          },
+          balanced: {
+            variance_target: 0.85,
+            description: "Balanced approach: 85% information preservation with substantial size reduction",
+          },
+          space_efficient: {
+            variance_target: 0.75,
+            description: "Maximizes storage savings while maintaining 75% of important information",
+          },
+        }
+
+        attr_reader :original_dimensions, :reduced_dimensions, :preserved_variance,
+                    :compression_ratio, :storage_savings, :preset_used
+        attr_accessor :preset, :dimensions, :column, :embedding_column, :fit, :pca_model
+
+        def initialize(config = {})
+          @preset = config.dig(:preset)
+          @dimensions = config.dig(:dimensions)
+
+          @preset = :full unless @preset || @dimensions
+          @pca_model = config.dig(:pca_model)
+          @original_dimensions = nil
+          @reduced_dimensions = nil
+          @preserved_variance = nil
+          @compression_ratio = nil
+          @storage_savings = nil
+          @preset_used = nil
+        end
+
+        def inspect
+          "#<#{self.class.name} original_dimensions=#{@original_dimensions}, reduced_dimensions=#{@reduced_dimensions}, preserved_variance=#{@preserved_variance}, compression_ratio=#{@compression_ratio}, storage_savings=#{@storage_savings}, preset_used=#{@preset_used}>"
+        end
+
+        # Right now, enabling OpenBLAS as the Numo::LinAlg backend causes
+        # memory issues with XGBoost due to conflicts with libomp.
+        # Since arm-based OSX doesn't have support for MKL, we have to fall back to
+        # a very slow matrix factorization implementation, which doesn't seem sustainable.
+        #
+        # One potential solution is to create an Accelerate backend for Numo::LinAlg,
+        # or to compile OpenBLAS with USE_OPENMP=0,
+        # but for now I'm just disabling compression support.
+        #
+        # http://pypackaging-native.github.io/key-issues/native-dependencies/blas_openmp/
+        #
+        COMPRESSION_ENABLED = false
+
+        def compress(df, column, embedding_column, fit: false)
+          # begin
+          #   result = actually_compress(df, column, embedding_column, fit: fit)
+          #   GC.start # This might allow us to clean up after OpenBLAS and fix the thread pool
+          # end
+
+          # result
+          return df unless COMPRESSION_ENABLED
+          actually_compress(df, column, embedding_column, fit: fit)
+        end
+
+        def actually_compress(df, column, embedding_column, fit: false)
+          @column = column
+          @embedding_column = embedding_column
+          @fit = fit
+
+          # Create a dataframe of unique texts and their embeddings
+          unique_df = df.select([column, embedding_column])
+                        .filter(Polars.col(column).is_not_null & (Polars.col(column) != ""))
+                        .unique
+
+          # Compress the unique embeddings
+          compressed_df = reduce_to_dimensions(unique_df, target_dimensions: dimensions)
+          compressed_df = compressed_df.with_columns(Polars.col(embedding_column).cast(df.schema[embedding_column]).alias(embedding_column))
+
+          df = df.drop(embedding_column)
+
+          # Join back to original dataframe to maintain all rows
+          df.join(compressed_df, on: column, how: "left")
+        end
+
+        # Reduce dimensions using a preset quality level
+        def reduce_with_preset(embeddings_df, preset: :balanced)
+          unless PRESETS.key?(preset)
+            raise ArgumentError, "Unknown preset: #{preset}. Available presets: #{PRESETS.keys.join(", ")}"
+          end
+
+          @preset_used = preset
+          target_variance = PRESETS[preset][:variance_target]
+
+          reduce_to_variance(embeddings_df, target_variance: target_variance)
+        end
+
+        # Reduce dimensions to a specific number
+        def reduce_to_dimensions(embeddings_df, target_dimensions:)
+          puts "reducing model dims..."
+          validate_input(embeddings_df)
+
+          # Convert embedding columns to Numo::NArray for Rumale
+          x = df_to_narray(embeddings_df, embedding_column)
+          @original_dimensions = x.shape[1]
+
+          if target_dimensions >= @original_dimensions
+            raise ArgumentError, "Target dimensions must be less than original dimensions"
+          end
+
+          # Initialize and fit PCA
+          if @pca_model.present?
+            transformed = @pca_model.transform(x)
+          else
+            @pca_model = Rumale::Decomposition::PCA.new(n_components: target_dimensions)
+            transformed = @pca_model.fit_transform(x)
+          end
+
+          # Create new dataframe with reduced embeddings
+          create_result_dataframe(embeddings_df, embedding_column, transformed)
+        end
+
+        # Reduce dimensions to preserve a target variance
+        def reduce_to_variance(embeddings_df, target_variance:)
+          validate_input(embeddings_df)
+
+          # Convert embedding columns to Numo::NArray for Rumale
+          x = df_to_narray(embeddings_df, embedding_column)
+
+          # Get original dimensions from the first embedding
+          @original_dimensions = x.shape[1]
+
+          # Calculate the target number of components based on variance preservation
+          target_components = (@original_dimensions * target_variance).ceil
+
+          # First fit PCA with all components to analyze variance
+          if @pca_model.present?
+            transformed = @pca_model.transform(x)
+          else
+            @pca_model = Rumale::Decomposition::PCA.new(n_components: target_components)
+            transformed = @pca_model.fit_transform(x)
+          end
+
+          # Create new dataframe with reduced embeddings
+          create_result_dataframe(embeddings_df, embedding_column, transformed)
+        end
+
+        private
+
+        def validate_input(df)
+          return if df.is_a?(Polars::DataFrame)
+
+          raise ArgumentError, "Input must be a Polars DataFrame"
+        end
+
+        def get_embedding_columns(df)
+          # Assumes embedding columns are numeric and have a pattern like 'embedding_0', 'embedding_1', etc.
+          # Adjust this logic if your embedding columns follow a different naming convention
+          df.columns.select { |col| col.match(/^embedding_\d+$/) || col.match(/^vector_\d+$/) }
+        end
+
+        def df_to_narray(df, embedding_column)
+          Numo::DFloat.cast(df[embedding_column].to_a)
+        end
+
+        def create_result_dataframe(original_df, embedding_column, transformed_data)
+          original_df.with_column(
+            Polars.lit(transformed_data).alias(embedding_column)
+          )
+        end
+      end
+    end
+  end
+end
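Underneath, the compressor is plain Rumale PCA over the embedding matrix: when a stored `pca_model` exists it is reused via `transform`, otherwise a new PCA is fit with `fit_transform`. A standalone sketch with synthetic data:

    require "rumale"
    require "numo/narray"

    x = Numo::DFloat.new(200, 64).rand  # 200 embeddings, 64 dimensions
    pca = Rumale::Decomposition::PCA.new(n_components: 8)

    pca.fit_transform(x).shape                          # => [200, 8] (first, fitting pass)
    pca.transform(Numo::DFloat.new(10, 64).rand).shape  # => [10, 8]  (reuses the fitted model)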
data/lib/easy_ml/data/embeddings/embedder.rb

@@ -0,0 +1,226 @@
+module EasyML
+  module Data
+    class Embeddings
+      class Embedder
+        attr_accessor :llm, :config, :adapter
+
+        # Provider-specific batch size recommendations
+        BATCH_SIZES = {
+          openai: 500,    # OpenAI allows up to 2048 items per batch, but 500 is recommended
+          anthropic: 100, # Conservative default for Anthropic
+          gemini: 100,    # Conservative default for Google's Gemini
+          ollama: 50,     # Local models typically have more limited batch sizes
+          default: 100,   # Default for any other provider
+        }
+
+        ADAPTERS = {
+          anthropic: Langchain::LLM::Anthropic,
+          gemini: Langchain::LLM::GoogleGemini,
+          openai: Langchain::LLM::OpenAI,
+          ollama: Langchain::LLM::Ollama,
+        }
+
+        DEFAULTS = {
+          api_key: {
+            anthropic: ENV["ANTHROPIC_API_KEY"],
+            gemini: ENV["GEMINI_API_KEY"],
+            openai: ENV["OPENAI_API_KEY"],
+            ollama: ENV["OLLAMA_API_KEY"],
+          },
+        }
+
+        def initialize(llm, config = {})
+          @llm = llm.to_sym
+          @config = config.symbolize_keys
+          apply_defaults
+        end
+
+        def embed(df, col, output_column)
+          pick
+
+          # Create a dataframe of unique texts and their embeddings
+          unique_df = df.select(col)
+                        .filter(Polars.col(col).is_not_null & (Polars.col(col) != ""))
+                        .unique
+
+          unique_texts = unique_df[col].to_a
+          unique_embeddings = batch_embed(unique_texts)
+
+          # Create a new dataframe with text-embedding pairs
+          embeddings_df = Polars::DataFrame.new(
+            { col => unique_texts, output_column => unique_embeddings }
+          )
+          embeddings_df = embeddings_df.with_columns(
+            Polars.col(col).cast(df.schema[col]).alias(col)
+          )
+
+          # Join the original dataframe with the embeddings
+          df = df.join(embeddings_df, on: col, how: "left")
+
+          if df.columns.include?("#{output_column}_right")
+            df = df.with_columns(
+              Polars.when(
+                Polars.col(output_column).is_null.not_
+              ).then(
+                Polars.col(output_column)
+              ).otherwise(
+                Polars.col("#{output_column}_right")
+              )
+            )
+            df = df.drop("#{output_column}_right")
+          end
+
+          df
+        end
+
+        private
+
+        def batch_embed(texts)
+          # Skip empty processing
+          return [] if texts.nil? || texts.empty?
+
+          # Filter out nil or empty strings
+          texts = texts.compact.reject(&:empty?)
+          return [] if texts.empty?
+
+          # Get batch size based on provider
+          batch_size = config[:batch_size] || BATCH_SIZES[@llm] || BATCH_SIZES[:default]
+
+          # Get parallel processing settings
+          parallel_processes = config[:parallel_processes] || 4
+          parallelism_mode = (config[:parallelism_mode] || :threads).to_sym
+
+          # Calculate optimal number of batches based on input size and processes
+          total_batches = (texts.size.to_f / batch_size).ceil
+          num_batches = [total_batches, parallel_processes].min
+          optimal_batch_size = (texts.size.to_f / num_batches).ceil
+
+          # Create batches based on the optimal batch size
+          batches = texts.each_slice(optimal_batch_size).to_a
+
+          parallel_processes = [parallel_processes, num_batches].min
+
+          # Process in parallel with appropriate error handling
+          all_embeddings = []
+
+          if parallel_processes > 1 && num_batches > 1
+            case parallelism_mode
+            when :threads
+              all_embeddings = Parallel.map(batches, in_threads: parallel_processes) do |batch|
+                with_retries { process_batch(batch) }
+              end
+            when :processes
+              all_embeddings = Parallel.map(batches, in_processes: parallel_processes) do |batch|
+                with_retries { process_batch(batch) }
+              end
+            else
+              raise ArgumentError, "parallelism_mode must be :threads or :processes"
+            end
+          else
+            # Sequential processing
+            batches.each do |batch|
+              all_embeddings << with_retries { process_batch(batch) }
+            end
+          end
+
+          # Flatten the results and return
+          all_embeddings.flatten(1)
+        end
+
+        def process_batch(batch)
+          response = adapter.embed(text: batch)
+          unpack(response)
+        end
+
+        def unpack(embeddings)
+          raw_response = embeddings.raw_response.deep_symbolize_keys
+          case llm.to_sym
+          when :openai
+            raw_response.dig(:data).map { |e| e[:embedding] }
+          else
+            embeddings
+          end
+        end
+
+        def with_retries(max_retries: 3, base_delay: 1, max_delay: 60)
+          retries = 0
+          begin
+            yield
+          rescue => e
+            retries += 1
+            if retries <= max_retries
+              # Exponential backoff with jitter
+              delay = [base_delay * (2 ** (retries - 1)) * (1 + rand * 0.1), max_delay].min
+              sleep(delay)
+              retry
+            else
+              raise e
+            end
+          end
+        end
+
+        # These options are pulled from Langchain
+        #
+        # default_options: {
+        #   embeddings_model_name: "text-embedding-3-small",
+        # },
+        def pick
+          @adapter ||= ADAPTERS[@llm].new(**config)
+          self
+        end
+
+        def apply_defaults
+          @config = @config.deep_symbolize_keys
+
+          DEFAULTS.each do |k, v|
+            unless @config.key?(k)
+              @config[k] = v[@llm]
+            end
+          end
+        end
+
+        def self.constants
+          {
+            providers: [
+              { value: "openai", label: "OpenAI" },
+              { value: "anthropic", label: "Anthropic" },
+              { value: "ollama", label: "Ollama (Local)" },
+            ],
+            models: {
+              openai: [
+                { value: "text-embedding-3-small", label: "text-embedding-3-small", dimensions: 1536 },
+                { value: "text-embedding-3-large", label: "text-embedding-3-large", dimensions: 3072 },
+                { value: "text-embedding-ada-002", label: "text-embedding-ada-002", dimensions: 1536 },
+              ],
+              anthropic: [
+                { value: "claude-3", label: "Claude 3", dimensions: 3072 },
+                { value: "claude-2", label: "Claude 2", dimensions: 1536 },
+              ],
+              ollama: [
+                { value: "llama2", label: "Llama 2", dimensions: 4096 },
+                { value: "mistral", label: "Mistral", dimensions: 4096 },
+                { value: "mixtral", label: "Mixtral", dimensions: 4096 },
+                { value: "nomic-embed-text", label: "Nomic Embed", dimensions: 768 },
+                { value: "starling-lm", label: "Starling", dimensions: 4096 },
+              ],
+            },
+            compression_presets: {
+              high_quality: {
+                description: "Preserves subtle relationships and nuanced meaning",
+                variance_target: 0.95,
+              },
+              balanced: {
+                description: "Good balance of quality and storage efficiency",
+                variance_target: 0.85,
+              },
+              storage_optimized: {
+                description: "Maximizes storage efficiency while maintaining core meaning",
+                variance_target: 0.75,
+              },
+            },
+          }
+        end
+      end
+    end
+  end
+end
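End to end, `Embedder` deduplicates the input texts, fans batches out across threads (or processes) with exponential-backoff retries, and left-joins the vectors back onto the original frame. A hedged usage sketch (hypothetical column names; assumes OPENAI_API_KEY is set):

    embedder = EasyML::Data::Embeddings::Embedder.new(:openai)

    df = Polars::DataFrame.new({
      "review" => ["great product", "arrived broken", "great product"],
    })

    # Only the two unique, non-empty texts are sent to the API
    df = embedder.embed(df, "review", "review_embedding")
    df["review_embedding"]  # one vector per row; duplicate texts share an embedding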