RubyGems - fastembed - Versions diffs - 1.0.0 → 1.1.0 - Mend

fastembed 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

checksums.yaml +4 -4
data/.rubocop.yml +1 -0
data/.yardopts +6 -0
data/BENCHMARKS.md +124 -1
data/CHANGELOG.md +14 -0
data/README.md +395 -74
data/benchmark/compare_all.rb +167 -0
data/benchmark/compare_python.py +60 -0
data/benchmark/memory_profile.rb +70 -0
data/benchmark/profile.rb +198 -0
data/benchmark/reranker_benchmark.rb +158 -0
data/exe/fastembed +6 -0
data/fastembed.gemspec +3 -0
data/lib/fastembed/async.rb +193 -0
data/lib/fastembed/base_model.rb +247 -0
data/lib/fastembed/base_model_info.rb +61 -0
data/lib/fastembed/cli.rb +745 -0
data/lib/fastembed/custom_model_registry.rb +255 -0
data/lib/fastembed/image_embedding.rb +313 -0
data/lib/fastembed/late_interaction_embedding.rb +260 -0
data/lib/fastembed/late_interaction_model_info.rb +91 -0
data/lib/fastembed/model_info.rb +59 -19
data/lib/fastembed/model_management.rb +82 -23
data/lib/fastembed/onnx_embedding_model.rb +25 -4
data/lib/fastembed/pooling.rb +39 -3
data/lib/fastembed/progress.rb +52 -0
data/lib/fastembed/quantization.rb +75 -0
data/lib/fastembed/reranker_model_info.rb +91 -0
data/lib/fastembed/sparse_embedding.rb +261 -0
data/lib/fastembed/sparse_model_info.rb +80 -0
data/lib/fastembed/text_cross_encoder.rb +217 -0
data/lib/fastembed/text_embedding.rb +161 -28
data/lib/fastembed/validators.rb +59 -0
data/lib/fastembed/version.rb +1 -1
data/lib/fastembed.rb +42 -1
data/plan.md +257 -0
data/scripts/verify_models.rb +229 -0
metadata +70 -3

data/lib/fastembed/text_cross_encoder.rb ADDED Viewed

@@ -0,0 +1,217 @@
+# frozen_string_literal: true
+module Fastembed
+  # Cross-encoder model for reranking query-document pairs
+  #
+  # Unlike embedding models that encode texts independently, cross-encoders
+  # process query-document pairs together to produce relevance scores.
+  # This is more accurate but slower: each query costs one model pass per document,
+  # whereas document embeddings can be computed once and reused across queries.
+  #
+  # @example Basic reranking
+  #   reranker = Fastembed::TextCrossEncoder.new
+  #   scores = reranker.rerank(
+  #     query: "What is machine learning?",
+  #     documents: ["ML is a subset of AI...", "The weather is nice today"]
+  #   )
+  #   # => [0.95, 0.02]
+  #
+  # @example Get ranked results
+  #   results = reranker.rerank_with_scores(
+  #     query: "What is Ruby?",
+  #     documents: documents
+  #   )
+  #   # => [{document: "Ruby is a programming...", score: 0.89, index: 2}, ...]
+  #
+  class TextCrossEncoder
+    include BaseModel
+    # Initialize a cross-encoder model for reranking
+    #
+    # @param model_name [String] Name of the model to use
+    # @param cache_dir [String, nil] Custom cache directory for models
+    # @param threads [Integer, nil] Number of threads for ONNX Runtime
+    # @param providers [Array<String>, nil] ONNX execution providers
+    # @param show_progress [Boolean] Whether to show download progress
+    # @param quantization [Symbol, nil] Quantization type (:fp32, :fp16, :int8, :uint8, :q4); defaults to nil
+    # @param local_model_dir [String, nil] Load model from local directory instead of downloading
+    # @param model_file [String, nil] Override model file name (e.g., "model.onnx")
+    # @param tokenizer_file [String, nil] Override tokenizer file name (e.g., "tokenizer.json")
+    def initialize(
+      model_name: DEFAULT_RERANKER_MODEL,
+      cache_dir: nil,
+      threads: nil,
+      providers: nil,
+      show_progress: true,
+      quantization: nil,
+      local_model_dir: nil,
+      model_file: nil,
+      tokenizer_file: nil
+    )
+      if local_model_dir
+        initialize_from_local(
+          local_model_dir: local_model_dir,
+          model_name: model_name,
+          threads: threads,
+          providers: providers,
+          quantization: quantization,
+          model_file: model_file,
+          tokenizer_file: tokenizer_file
+        )
+      else
+        initialize_model(
+          model_name: model_name,
+          cache_dir: cache_dir,
+          threads: threads,
+          providers: providers,
+          show_progress: show_progress,
+          quantization: quantization
+        )
+      end
+      setup_model_and_tokenizer(model_file_override: model_file || quantized_model_file)
+    end
+    # Score query-document pairs and return relevance scores
+    #
+    # @param query [String] The query text
+    # @param documents [Array<String>] Documents to score against the query
+    # @param batch_size [Integer] Number of pairs to process at once
+    # @return [Array<Float>] Relevance scores for each document (higher = more relevant)
+    # @raise [ArgumentError] If query or documents is nil, or documents contains nil
+    def rerank(query:, documents:, batch_size: 64)
+      Validators.validate_rerank_input!(query: query, documents: documents)
+      return [] if documents.empty?
+      scores = []
+      documents.each_slice(batch_size) do |batch|
+        batch_scores = score_pairs(query, batch)
+        scores.concat(batch_scores)
+      end
+      scores
+    end
+    # Rerank documents and return sorted results with scores
+    #
+    # @param query [String] The query text
+    # @param documents [Array<String>] Documents to rerank
+    # @param top_k [Integer, nil] Return only top K results (nil = all)
+    # @param batch_size [Integer] Number of pairs to process at once
+    # @return [Array<Hash>] Sorted results with :document, :score, :index keys
+    def rerank_with_scores(query:, documents:, top_k: nil, batch_size: 64)
+      scores = rerank(query: query, documents: documents, batch_size: batch_size)
+      results = documents.zip(scores).each_with_index.map do |(doc, score), idx|
+        { document: doc, score: score, index: idx }
+      end
+      results.sort_by! { |r| -r[:score] }
+      top_k ? results.first(top_k) : results
+    end
+    # Rerank documents asynchronously
+    #
+    # @param query [String] The query text
+    # @param documents [Array<String>] Documents to score against the query
+    # @param batch_size [Integer] Number of pairs to process at once
+    # @return [Async::Future] Future that resolves to array of scores
+    def rerank_async(query:, documents:, batch_size: 64)
+      Async::Future.new { rerank(query: query, documents: documents, batch_size: batch_size) }
+    end
+    # Rerank documents with scores asynchronously
+    #
+    # @param query [String] The query text
+    # @param documents [Array<String>] Documents to rerank
+    # @param top_k [Integer, nil] Return only top K results (nil = all)
+    # @param batch_size [Integer] Number of pairs to process at once
+    # @return [Async::Future] Future that resolves to sorted results array
+    def rerank_with_scores_async(query:, documents:, top_k: nil, batch_size: 64)
+      Async::Future.new { rerank_with_scores(query: query, documents: documents, top_k: top_k, batch_size: batch_size) }
+    end
+    # List all supported reranker models
+    #
+    # @return [Array<Hash>] Array of model information hashes
+    def self.list_supported_models
+      SUPPORTED_RERANKER_MODELS.values.map(&:to_h)
+    end
+    private
+    def resolve_model_info(model_name)
+      # Check built-in registry first
+      info = SUPPORTED_RERANKER_MODELS[model_name]
+      return info if info
+      # Check custom registry
+      info = CustomModelRegistry.reranker_models[model_name]
+      return info if info
+      raise Error, "Unknown reranker model: #{model_name}"
+    end
+    def create_local_model_info(model_name:, model_file:, tokenizer_file:)
+      RerankerModelInfo.new(
+        model_name: model_name,
+        description: 'Local reranker model',
+        size_in_gb: 0,
+        sources: {},
+        model_file: model_file || 'model.onnx',
+        tokenizer_file: tokenizer_file || 'tokenizer.json'
+      )
+    end
+    def score_pairs(query, documents)
+      encodings = tokenize_pairs(query, documents)
+      inputs = prepare_pair_inputs(encodings)
+      extract_scores(@session.run(nil, inputs))
+    end
+    def tokenize_pairs(query, documents)
+      documents.map { |doc| @tokenizer.encode(query, doc) }
+    end
+    def prepare_pair_inputs(encodings)
+      max_len = encodings.map { |e| e.ids.length }.max
+      input_ids = []
+      attention_mask = []
+      token_type_ids = []
+      encodings.each do |encoding|
+        pad_len = max_len - encoding.ids.length
+        input_ids << pad_sequence(encoding.ids, pad_len)
+        attention_mask << pad_sequence(encoding.attention_mask, pad_len)
+        token_type_ids << pad_sequence(encoding.type_ids, pad_len)
+      end
+      inputs = { 'input_ids' => input_ids }
+      # Only add attention_mask if the model expects it
+      mask_key = if session_input_names.include?('attention_mask')
+                   'attention_mask'
+                 elsif session_input_names.include?('input_mask')
+                   'input_mask'
+                 end
+      inputs[mask_key] = attention_mask if mask_key
+      # Only add token_type_ids if the model expects it
+      type_key = if session_input_names.include?('token_type_ids')
+                   'token_type_ids'
+                 elsif session_input_names.include?('segment_ids')
+                   'segment_ids'
+                 end
+      inputs[type_key] = token_type_ids if type_key
+      inputs
+    end
+    def pad_sequence(sequence, pad_len)
+      sequence + ([0] * pad_len)
+    end
+    def extract_scores(outputs)
+      outputs.first.map { |logit| logit.is_a?(Array) ? logit.first : logit }
+    end
+  end
+end

data/lib/fastembed/text_embedding.rb CHANGED Viewed

@@ -15,8 +15,17 @@ module Fastembed
   #     # Process each vector
   #   end
   #
+  # @example Load from local directory
+  #   embedding = Fastembed::TextEmbedding.new(
+  #     local_model_dir: "/path/to/model",
+  #     model_file: "model.onnx",
+  #     tokenizer_file: "tokenizer.json"
+  #   )
+  #
   class TextEmbedding
-    attr_reader :model_name, :model_info, :dim
+    include BaseModel
+    attr_reader :dim
     # Initialize a text embedding model
     #
@@ -25,55 +34,84 @@ module Fastembed
     # @param threads [Integer, nil] Number of threads for ONNX Runtime
     # @param providers [Array<String>, nil] ONNX execution providers (e.g., ["CoreMLExecutionProvider"])
     # @param show_progress [Boolean] Whether to show download progress
+    # @param quantization [Symbol, nil] Quantization type (:fp32, :fp16, :int8, :uint8, :q4); defaults to nil
+    # @param local_model_dir [String, nil] Load model from local directory instead of downloading
+    # @param model_file [String, nil] Override model file name (e.g., "model.onnx")
+    # @param tokenizer_file [String, nil] Override tokenizer file name (e.g., "tokenizer.json")
     def initialize(
       model_name: DEFAULT_MODEL,
       cache_dir: nil,
       threads: nil,
       providers: nil,
-      show_progress: true
+      show_progress: true,
+      quantization: nil,
+      local_model_dir: nil,
+      model_file: nil,
+      tokenizer_file: nil
     )
-      @model_name = model_name
-      @threads = threads
-      @providers = providers
-      @show_progress = show_progress
-      # Set custom cache directory if provided
-      ModelManagement.cache_dir = cache_dir if cache_dir
+      if local_model_dir
+        initialize_from_local(
+          local_model_dir: local_model_dir,
+          model_name: model_name,
+          threads: threads,
+          providers: providers,
+          quantization: quantization,
+          model_file: model_file,
+          tokenizer_file: tokenizer_file
+        )
+      else
+        initialize_model(
+          model_name: model_name,
+          cache_dir: cache_dir,
+          threads: threads,
+          providers: providers,
+          show_progress: show_progress,
+          quantization: quantization
+        )
+      end
-      # Resolve model info
-      @model_info = ModelManagement.resolve_model_info(model_name)
       @dim = @model_info.dim
-      # Download and load model
-      @model_dir = ModelManagement.retrieve_model(model_name, show_progress: show_progress)
-      @model = OnnxEmbeddingModel.new(@model_info, @model_dir, threads: threads, providers: providers)
+      @model = OnnxEmbeddingModel.new(
+        @model_info,
+        @model_dir,
+        threads: threads,
+        providers: providers,
+        model_file_override: model_file || quantized_model_file
+      )
     end
     # Generate embeddings for documents
     #
     # @param documents [Array<String>, String] Text document(s) to embed
     # @param batch_size [Integer] Number of documents to process at once
+    # @yield [Progress] Optional progress callback called after each batch
     # @return [Enumerator] Lazy enumerator yielding embedding vectors
     # @raise [ArgumentError] If documents is nil or contains nil values
     #
-    # @example
+    # @example Basic usage
     #   vectors = embedding.embed(["Hello", "World"]).to_a
     #   # => [[0.1, 0.2, ...], [0.3, 0.4, ...]]
-    def embed(documents, batch_size: 256)
-      raise ArgumentError, 'documents cannot be nil' if documents.nil?
-      documents = [documents] if documents.is_a?(String)
+    #
+    # @example With progress callback
+    #   embedding.embed(documents, batch_size: 64) do |progress|
+    #     puts "#{progress.percent}% complete"
+    #   end.to_a
+    #
+    def embed(documents, batch_size: 256, &progress_callback)
+      documents = Validators.validate_documents!(documents)
       return Enumerator.new { |_| } if documents.empty?
-      # Validate all documents
-      documents.each_with_index do |doc, i|
-        raise ArgumentError, "document at index #{i} cannot be nil" if doc.nil?
-      end
+      total_batches = (documents.length.to_f / batch_size).ceil
       Enumerator.new do |yielder|
-        documents.each_slice(batch_size) do |batch|
+        documents.each_slice(batch_size).with_index(1) do |batch, batch_num|
           embeddings = @model.embed(batch)
           embeddings.each { |embedding| yielder << embedding }
+          if progress_callback
+            progress = Progress.new(current: batch_num, total: total_batches, batch_size: batch_size)
+            progress_callback.call(progress)
+          end
         end
       end
     end
@@ -100,11 +138,60 @@ module Fastembed
       embed(prefixed, batch_size: batch_size)
     end
-    # List all supported models
+    # Generate embeddings asynchronously in a background thread
+    #
+    # Returns immediately with a Future object. The embedding computation
+    # runs in a background thread. Call `value` on the Future to get results.
+    #
+    # @param documents [Array<String>, String] Text document(s) to embed
+    # @param batch_size [Integer] Number of documents to process at once
+    # @return [Async::Future] Future that resolves to array of embedding vectors
+    #
+    # @example Basic async usage
+    #   future = embedding.embed_async(documents)
+    #   # ... do other work ...
+    #   vectors = future.value  # blocks until complete
+    #
+    # @example Parallel embedding of multiple batches
+    #   futures = documents.each_slice(1000).map do |batch|
+    #     embedding.embed_async(batch)
+    #   end
+    #   all_vectors = futures.flat_map(&:value)
+    #
+    def embed_async(documents, batch_size: 256)
+      Async::Future.new do
+        embed(documents, batch_size: batch_size).to_a
+      end
+    end
+    # Generate query embeddings asynchronously
+    #
+    # @param queries [Array<String>, String] Query text(s) to embed
+    # @param batch_size [Integer] Number of queries to process at once
+    # @return [Async::Future] Future that resolves to array of embedding vectors
+    def query_embed_async(queries, batch_size: 256)
+      Async::Future.new do
+        query_embed(queries, batch_size: batch_size).to_a
+      end
+    end
+    # Generate passage embeddings asynchronously
+    #
+    # @param passages [Array<String>, String] Passage text(s) to embed
+    # @param batch_size [Integer] Number of passages to process at once
+    # @return [Async::Future] Future that resolves to array of embedding vectors
+    def passage_embed_async(passages, batch_size: 256)
+      Async::Future.new do
+        passage_embed(passages, batch_size: batch_size).to_a
+      end
+    end
+    # List all supported models (built-in and custom)
     #
     # @return [Array<Hash>] Array of model information hashes
     def self.list_supported_models
-      SUPPORTED_MODELS.values.map(&:to_h)
+      all_models = SUPPORTED_MODELS.merge(CustomModelRegistry.embedding_models)
+      all_models.values.map(&:to_h)
     end
     # Get information about a specific model
@@ -112,7 +199,53 @@ module Fastembed
     # @param model_name [String] Name of the model
     # @return [Hash, nil] Model information or nil if not found
     def self.get_model_info(model_name)
-      SUPPORTED_MODELS[model_name]&.to_h
+      info = SUPPORTED_MODELS[model_name] || CustomModelRegistry.embedding_models[model_name]
+      info&.to_h
+    end
+    private
+    def resolve_model_info(model_name)
+      ModelManagement.resolve_model_info(model_name)
+    end
+    def create_local_model_info(model_name:, model_file:, tokenizer_file:)
+      # Detect dimension from model output shape if possible
+      # For now, use a placeholder that will be updated after model load
+      ModelInfo.new(
+        model_name: model_name,
+        dim: detect_model_dimension(model_file) || 384,
+        description: 'Local model',
+        size_in_gb: 0,
+        sources: {},
+        model_file: model_file || 'model.onnx',
+        tokenizer_file: tokenizer_file || 'tokenizer.json'
+      )
+    end
+    # Detect embedding dimension from ONNX model output shape
+    #
+    # @param model_file [String, nil] Model filename to inspect
+    # @return [Integer, nil] Detected dimension or nil if detection fails
+    def detect_model_dimension(model_file)
+      candidate = File.join(@model_dir, model_file || 'model.onnx')
+      return nil unless File.exist?(candidate)
+      # Only the first declared output is inspected.
+      primary_output = OnnxRuntime::InferenceSession.new(candidate).outputs.first
+      dims = primary_output && primary_output[:shape]
+      return nil unless dims
+      # The trailing axis is taken as the embedding width; anything that is not a
+      # positive Integer is reported as "unknown".
+      width = dims.last
+      return width if width.is_a?(Integer) && width.positive?
+      nil
+    rescue OnnxRuntime::Error => e
+      warn "Warning: Could not detect model dimension: #{e.message}"
+      nil
+    rescue StandardError => e
+      warn "Warning: Unexpected error detecting model dimension: #{e.class} - #{e.message}"
+      nil
     end
   end
 end

data/lib/fastembed/validators.rb ADDED Viewed

@@ -0,0 +1,59 @@
+# frozen_string_literal: true
+module Fastembed
+  # Input validation helpers for embedding models
+  #
+  # Provides consistent validation and normalization of documents, queries,
+  # and other inputs across all model types.
+  #
+  # @api private
+  #
+  module Validators
+    class << self
+      # Validate and normalize document input
+      #
+      # Ensures documents are not nil, converts single strings to arrays,
+      # and validates that no individual document is nil.
+      #
+      # @param documents [Array<String>, String, nil] Documents to validate
+      # @return [Array<String>] Normalized array of documents
+      # @raise [ArgumentError] If documents is nil or contains nil values
+      def validate_documents!(documents)
+        raise ArgumentError, 'documents cannot be nil' if documents.nil?
+        documents = [documents] if documents.is_a?(String)
+        documents.each_with_index do |doc, i|
+          raise ArgumentError, "document at index #{i} cannot be nil" if doc.nil?
+        end
+        documents
+      end
+      # Validate query and documents for reranking
+      #
+      # @param query [String, nil] Query to validate
+      # @param documents [Array<String>, nil] Documents to validate
+      # @return [Array<String>] Validated documents array
+      # @raise [ArgumentError] If query or documents is nil, or documents contains nil
+      def validate_rerank_input!(query:, documents:)
+        raise ArgumentError, 'query cannot be nil' if query.nil?
+        raise ArgumentError, 'documents cannot be nil' if documents.nil?
+        documents.each_with_index do |doc, i|
+          raise ArgumentError, "document at index #{i} cannot be nil" if doc.nil?
+        end
+        documents
+      end
+      # Check if documents array is empty
+      #
+      # @param documents [Array<String>] Documents to check
+      # @return [Boolean] True if empty
+      def empty?(documents)
+        documents.empty?
+      end
+    end
+  end
+end

data/lib/fastembed/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Fastembed
-  VERSION = '1.0.0'
+  VERSION = '1.1.0'
 end

data/lib/fastembed.rb CHANGED Viewed

@@ -2,13 +2,54 @@
 require_relative 'fastembed/version'
+# Fastembed - Fast, lightweight text embeddings for Ruby
+#
+# A Ruby port of FastEmbed providing text embeddings using ONNX Runtime.
+# Supports dense embeddings, sparse embeddings (SPLADE), late interaction (ColBERT),
+# and cross-encoder reranking.
+#
+# @example Basic text embedding
+#   embedding = Fastembed::TextEmbedding.new
+#   vectors = embedding.embed(["Hello world", "Ruby is great"]).to_a
+#
+# @example Reranking documents
+#   reranker = Fastembed::TextCrossEncoder.new
+#   scores = reranker.rerank(query: "What is ML?", documents: docs)
+#
+# @example Sparse embeddings
+#   sparse = Fastembed::TextSparseEmbedding.new
+#   embeddings = sparse.embed(["Hello"]).to_a
+#
+# @example Async embedding
+#   future = embedding.embed_async(large_document_list)
+#   vectors = future.value  # blocks until complete
+#
+# @see https://github.com/khasinski/fastembed-rb
+#
 module Fastembed
+  # Base error class for all Fastembed errors
   class Error < StandardError; end
+  # Raised when model download fails
   class DownloadError < Error; end
 end
+require_relative 'fastembed/pooling'
+require_relative 'fastembed/base_model_info'
 require_relative 'fastembed/model_info'
+require_relative 'fastembed/reranker_model_info'
+require_relative 'fastembed/sparse_model_info'
+require_relative 'fastembed/late_interaction_model_info'
+require_relative 'fastembed/custom_model_registry'
 require_relative 'fastembed/model_management'
-require_relative 'fastembed/pooling'
+require_relative 'fastembed/quantization'
+require_relative 'fastembed/progress'
+require_relative 'fastembed/async'
+require_relative 'fastembed/validators'
+require_relative 'fastembed/base_model'
 require_relative 'fastembed/onnx_embedding_model'
 require_relative 'fastembed/text_embedding'
+require_relative 'fastembed/text_cross_encoder'
+require_relative 'fastembed/sparse_embedding'
+require_relative 'fastembed/late_interaction_embedding'
+require_relative 'fastembed/image_embedding'