RubyGems - fastembed - Versions diffs - 1.0.0 → 1.1.0 - Mend

fastembed 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

checksums.yaml +4 -4
data/.rubocop.yml +1 -0
data/.yardopts +6 -0
data/BENCHMARKS.md +124 -1
data/CHANGELOG.md +14 -0
data/README.md +395 -74
data/benchmark/compare_all.rb +167 -0
data/benchmark/compare_python.py +60 -0
data/benchmark/memory_profile.rb +70 -0
data/benchmark/profile.rb +198 -0
data/benchmark/reranker_benchmark.rb +158 -0
data/exe/fastembed +6 -0
data/fastembed.gemspec +3 -0
data/lib/fastembed/async.rb +193 -0
data/lib/fastembed/base_model.rb +247 -0
data/lib/fastembed/base_model_info.rb +61 -0
data/lib/fastembed/cli.rb +745 -0
data/lib/fastembed/custom_model_registry.rb +255 -0
data/lib/fastembed/image_embedding.rb +313 -0
data/lib/fastembed/late_interaction_embedding.rb +260 -0
data/lib/fastembed/late_interaction_model_info.rb +91 -0
data/lib/fastembed/model_info.rb +59 -19
data/lib/fastembed/model_management.rb +82 -23
data/lib/fastembed/onnx_embedding_model.rb +25 -4
data/lib/fastembed/pooling.rb +39 -3
data/lib/fastembed/progress.rb +52 -0
data/lib/fastembed/quantization.rb +75 -0
data/lib/fastembed/reranker_model_info.rb +91 -0
data/lib/fastembed/sparse_embedding.rb +261 -0
data/lib/fastembed/sparse_model_info.rb +80 -0
data/lib/fastembed/text_cross_encoder.rb +217 -0
data/lib/fastembed/text_embedding.rb +161 -28
data/lib/fastembed/validators.rb +59 -0
data/lib/fastembed/version.rb +1 -1
data/lib/fastembed.rb +42 -1
data/plan.md +257 -0
data/scripts/verify_models.rb +229 -0
metadata +70 -3

data/lib/fastembed/custom_model_registry.rb ADDED Viewed

@@ -0,0 +1,255 @@
+# frozen_string_literal: true
+module Fastembed
+  # Registry for custom user-defined models
+  #
+  # Allows users to register arbitrary ONNX models that aren't in the built-in registry.
+  # Custom models can be loaded from HuggingFace or local directories.
+  #
+  # @example Register a custom embedding model
+  #   Fastembed.register_model(
+  #     model_name: 'my-org/my-model',
+  #     dim: 768,
+  #     description: 'My custom model',
+  #     sources: { hf: 'my-org/my-model-onnx' }
+  #   )
+  #   embed = Fastembed::TextEmbedding.new(model_name: 'my-org/my-model')
+  #
+  # @example Register a local model
+  #   Fastembed.register_model(
+  #     model_name: 'local-model',
+  #     dim: 384,
+  #     description: 'Local model',
+  #     sources: {}
+  #   )
+  #   embed = Fastembed::TextEmbedding.new(
+  #     model_name: 'local-model',
+  #     local_model_dir: '/path/to/model'
+  #   )
+  #
+  module CustomModelRegistry
+    # Maps each model type symbol to the accessor that returns its registry hash.
+    REGISTRY_ACCESSORS = {
+      embedding: :embedding_models,
+      reranker: :reranker_models,
+      sparse: :sparse_models,
+      late_interaction: :late_interaction_models
+    }.freeze
+    private_constant :REGISTRY_ACCESSORS
+    class << self
+      # User-registered dense embedding models, keyed by model name.
+      # The hash is created on first access.
+      # @return [Hash<String, ModelInfo>]
+      def embedding_models
+        @embedding_models ||= {}
+      end
+      # User-registered reranker models, keyed by model name.
+      # The hash is created on first access.
+      # @return [Hash<String, RerankerModelInfo>]
+      def reranker_models
+        @reranker_models ||= {}
+      end
+      # User-registered sparse embedding models, keyed by model name.
+      # The hash is created on first access.
+      # @return [Hash<String, SparseModelInfo>]
+      def sparse_models
+        @sparse_models ||= {}
+      end
+      # User-registered late interaction models, keyed by model name.
+      # The hash is created on first access.
+      # @return [Hash<String, LateInteractionModelInfo>]
+      def late_interaction_models
+        @late_interaction_models ||= {}
+      end
+      # Build a ModelInfo from the given attributes and store it under +model_name+.
+      # An existing entry with the same name is replaced.
+      #
+      # @param model_name [String] Unique model identifier
+      # @param dim [Integer] Output embedding dimension
+      # @param description [String] Human-readable description
+      # @param sources [Hash] Source repositories (e.g., { hf: 'org/repo' })
+      # @param size_in_gb [Float] Approximate model size
+      # @param model_file [String] Path to ONNX file within model directory
+      # @param tokenizer_file [String] Path to tokenizer.json
+      # @param pooling [Symbol] Pooling strategy (:mean or :cls)
+      # @param normalize [Boolean] Whether to L2 normalize outputs
+      # @param max_length [Integer] Maximum sequence length
+      # @return [ModelInfo] The registered model info
+      def register_embedding_model(
+        model_name:,
+        dim:,
+        description: 'Custom model',
+        sources: {},
+        size_in_gb: 0,
+        model_file: 'model.onnx',
+        tokenizer_file: 'tokenizer.json',
+        pooling: :mean,
+        normalize: true,
+        max_length: 512
+      )
+        embedding_models.store(
+          model_name,
+          ModelInfo.new(
+            model_name:, dim:, description:, sources:, size_in_gb:,
+            model_file:, tokenizer_file:, pooling:, normalize:, max_length:
+          )
+        )
+      end
+      # Build a RerankerModelInfo from the given attributes and store it under +model_name+.
+      # An existing entry with the same name is replaced.
+      #
+      # @param model_name [String] Unique model identifier
+      # @param description [String] Human-readable description
+      # @param sources [Hash] Source repositories
+      # @param size_in_gb [Float] Approximate model size
+      # @param model_file [String] Path to ONNX file
+      # @param tokenizer_file [String] Path to tokenizer.json
+      # @return [RerankerModelInfo] The registered model info
+      def register_reranker_model(
+        model_name:,
+        description: 'Custom reranker',
+        sources: {},
+        size_in_gb: 0,
+        model_file: 'onnx/model.onnx',
+        tokenizer_file: 'tokenizer.json'
+      )
+        reranker_models.store(
+          model_name,
+          RerankerModelInfo.new(model_name:, description:, sources:, size_in_gb:, model_file:, tokenizer_file:)
+        )
+      end
+      # Build a SparseModelInfo from the given attributes and store it under +model_name+.
+      # An existing entry with the same name is replaced.
+      #
+      # @param model_name [String] Unique model identifier
+      # @param description [String] Human-readable description
+      # @param sources [Hash] Source repositories
+      # @param size_in_gb [Float] Approximate model size
+      # @param model_file [String] Path to ONNX file
+      # @param tokenizer_file [String] Path to tokenizer.json
+      # @param max_length [Integer] Maximum sequence length
+      # @return [SparseModelInfo] The registered model info
+      def register_sparse_model(
+        model_name:,
+        description: 'Custom sparse model',
+        sources: {},
+        size_in_gb: 0,
+        model_file: 'onnx/model.onnx',
+        tokenizer_file: 'tokenizer.json',
+        max_length: 512
+      )
+        sparse_models.store(
+          model_name,
+          SparseModelInfo.new(
+            model_name:, description:, sources:, size_in_gb:,
+            model_file:, tokenizer_file:, max_length:
+          )
+        )
+      end
+      # Build a LateInteractionModelInfo from the given attributes and store it under +model_name+.
+      # An existing entry with the same name is replaced.
+      #
+      # @param model_name [String] Unique model identifier
+      # @param dim [Integer] Output embedding dimension per token
+      # @param description [String] Human-readable description
+      # @param sources [Hash] Source repositories
+      # @param size_in_gb [Float] Approximate model size
+      # @param model_file [String] Path to ONNX file
+      # @param tokenizer_file [String] Path to tokenizer.json
+      # @param max_length [Integer] Maximum sequence length
+      # @return [LateInteractionModelInfo] The registered model info
+      def register_late_interaction_model(
+        model_name:,
+        dim:,
+        description: 'Custom late interaction model',
+        sources: {},
+        size_in_gb: 0,
+        model_file: 'onnx/model.onnx',
+        tokenizer_file: 'tokenizer.json',
+        max_length: 512
+      )
+        late_interaction_models.store(
+          model_name,
+          LateInteractionModelInfo.new(
+            model_name:, dim:, description:, sources:, size_in_gb:,
+            model_file:, tokenizer_file:, max_length:
+          )
+        )
+      end
+      # Remove a model from the registry selected by +type+.
+      #
+      # @param model_name [String] Model to unregister
+      # @param type [Symbol] Model type (:embedding, :reranker, :sparse, :late_interaction)
+      # @return [Boolean] True if an entry was removed, false if none existed
+      # @raise [ArgumentError] if +type+ is not one of the four known types
+      def unregister_model(model_name, type: :embedding)
+        accessor = REGISTRY_ACCESSORS.fetch(type) do
+          raise ArgumentError, "Unknown model type: #{type}"
+        end
+        # Only the selected registry is touched; the others are not even instantiated.
+        removed = public_send(accessor).delete(model_name)
+        !removed.nil?
+      end
+      # Replace all four registries with fresh empty hashes
+      # @return [void]
+      def clear_all
+        @embedding_models = {}
+        @reranker_models = {}
+        @sparse_models = {}
+        @late_interaction_models = {}
+      end
+      # Names of every registered custom model, grouped by model type
+      # @return [Hash{Symbol => Array<String>}] Model names by type
+      def list_all
+        REGISTRY_ACCESSORS.transform_values { |accessor| public_send(accessor).keys }
+      end
+    end
+  end
+  # Module-level shortcuts that delegate to CustomModelRegistry
+  class << self
+    # Register a custom embedding model; keyword arguments are forwarded unchanged.
+    # @see CustomModelRegistry.register_embedding_model
+    def register_model(**options) = CustomModelRegistry.register_embedding_model(**options)
+    # Register a custom reranker model; keyword arguments are forwarded unchanged.
+    # @see CustomModelRegistry.register_reranker_model
+    def register_reranker(**options) = CustomModelRegistry.register_reranker_model(**options)
+    # Register a custom sparse model; keyword arguments are forwarded unchanged.
+    # @see CustomModelRegistry.register_sparse_model
+    def register_sparse_model(**options) = CustomModelRegistry.register_sparse_model(**options)
+    # Register a custom late interaction model; keyword arguments are forwarded unchanged.
+    # @see CustomModelRegistry.register_late_interaction_model
+    def register_late_interaction_model(**options) = CustomModelRegistry.register_late_interaction_model(**options)
+    # Names of all custom registered models, as returned by CustomModelRegistry.list_all
+    # @return [Hash] Custom models by type
+    def custom_models = CustomModelRegistry.list_all
+  end
+end

data/lib/fastembed/image_embedding.rb ADDED Viewed

@@ -0,0 +1,313 @@
+# frozen_string_literal: true
+module Fastembed
+  # Metadata for an image embedding model: where its files live plus the
+  # input size and per-channel statistics used when preprocessing images
+  class ImageModelInfo
+    include BaseModelInfo
+    attr_reader :dim, :image_size, :mean, :std
+    # @param model_name [String] Model identifier
+    # @param dim [Integer] Output embedding dimension
+    # @param description [String] Human-readable description
+    # @param size_in_gb [Numeric] Approximate model size
+    # @param sources [Hash] Source repositories (e.g., { hf: 'org/repo' })
+    # @param model_file [String] Path to the ONNX file within the model directory
+    # @param image_size [Integer] Side length images are resized to
+    # @param mean [Array<Float>] Per-channel mean subtracted during normalization
+    # @param std [Array<Float>] Per-channel divisor applied during normalization
+    def initialize(
+      model_name:,
+      dim:,
+      description:,
+      size_in_gb:,
+      sources:,
+      model_file: 'model.onnx',
+      image_size: 224,
+      mean: [0.485, 0.456, 0.406],
+      std: [0.229, 0.224, 0.225]
+    )
+      # The shared base fields include tokenizer settings; image models pass
+      # an empty tokenizer file and a zero max_length for them.
+      initialize_base(
+        model_name:, description:, size_in_gb:, sources:, model_file:,
+        tokenizer_file: '',
+        max_length: 0
+      )
+      @dim = dim
+      @image_size = image_size
+      @mean = mean
+      @std = std
+    end
+    # @return [Hash] The model attributes as a symbol-keyed hash
+    def to_h
+      { model_name:, dim:, description:, size_in_gb:, sources:, model_file:, image_size:, mean:, std: }
+    end
+  end
+  # Per-channel RGB mean used by OpenAI CLIP image preprocessing
+  CLIP_IMAGE_MEAN = [0.48145466, 0.4578275, 0.40821073].freeze
+  # Per-channel RGB standard deviation used by OpenAI CLIP image preprocessing
+  CLIP_IMAGE_STD = [0.26862954, 0.26130258, 0.27577711].freeze
+  # Registry of supported image embedding models, keyed by model name.
+  #
+  # Entries that do not pass mean/std use the ImageModelInfo defaults.
+  # The CLIP encoders are given the CLIP statistics explicitly; with the
+  # defaults their pixels would be normalized with different values.
+  SUPPORTED_IMAGE_MODELS = {
+    'Qdrant/clip-ViT-B-32-vision' => ImageModelInfo.new(
+      model_name: 'Qdrant/clip-ViT-B-32-vision',
+      dim: 512,
+      description: 'CLIP ViT-B/32 vision encoder',
+      size_in_gb: 0.34,
+      sources: { hf: 'Qdrant/clip-ViT-B-32-vision' },
+      model_file: 'model.onnx',
+      image_size: 224,
+      mean: CLIP_IMAGE_MEAN,
+      std: CLIP_IMAGE_STD
+    ),
+    'Qdrant/resnet50-onnx' => ImageModelInfo.new(
+      model_name: 'Qdrant/resnet50-onnx',
+      dim: 2048,
+      description: 'ResNet-50 image encoder',
+      size_in_gb: 0.10,
+      sources: { hf: 'Qdrant/resnet50-onnx' },
+      model_file: 'model.onnx',
+      image_size: 224
+    ),
+    # NOTE(review): CLIP statistics assumed for this model as well; confirm
+    # against the preprocessor_config.json shipped in the model repository.
+    'jinaai/jina-clip-v1' => ImageModelInfo.new(
+      model_name: 'jinaai/jina-clip-v1',
+      dim: 768,
+      description: 'Jina CLIP v1 vision encoder',
+      size_in_gb: 0.35,
+      sources: { hf: 'jinaai/jina-clip-v1' },
+      model_file: 'onnx/vision_model.onnx',
+      image_size: 224,
+      mean: CLIP_IMAGE_MEAN,
+      std: CLIP_IMAGE_STD
+    )
+  }.freeze
+  # Model used by ImageEmbedding when no model_name is given
+  DEFAULT_IMAGE_MODEL = 'Qdrant/clip-ViT-B-32-vision'
+  # Image embedding model for converting images to vectors
+  #
+  # Supports CLIP and ResNet models for image search and multimodal applications.
+  # Requires the mini_magick gem for image processing.
+  #
+  # @example Basic usage
+  #   image_embed = Fastembed::ImageEmbedding.new
+  #   vectors = image_embed.embed(["path/to/image.jpg"]).to_a
+  #
+  # @example With URLs
+  #   vectors = image_embed.embed(["https://example.com/image.jpg"]).to_a
+  #
+  class ImageEmbedding
+    attr_reader :model_name, :model_info, :dim
+    # Initialize an image embedding model
+    #
+    # @param model_name [String] Name of the model to use
+    # @param cache_dir [String, nil] Custom cache directory for models
+    # @param threads [Integer, nil] Number of threads for ONNX Runtime
+    # @param providers [Array<String>, nil] ONNX execution providers
+    # @param show_progress [Boolean] Whether to show download progress
+    # @param local_model_dir [String, nil] Load model from local directory instead of downloading
+    # @param model_file [String, nil] Override model file name (e.g., "model.onnx")
+    def initialize(
+      model_name: DEFAULT_IMAGE_MODEL,
+      cache_dir: nil,
+      threads: nil,
+      providers: nil,
+      show_progress: true,
+      local_model_dir: nil,
+      model_file: nil
+    )
+      # Fail before any other work if the mini_magick dependency cannot be loaded.
+      require_mini_magick!
+      @model_name = model_name
+      @threads = threads
+      @providers = providers || ['CPUExecutionProvider']
+      @model_file_override = model_file
+      if local_model_dir
+        # Local directory: nothing is downloaded; cache_dir and show_progress are not consulted.
+        initialize_from_local(local_model_dir:, model_name:, model_file:)
+      else
+        # This assigns the cache directory on ModelManagement itself, not on this instance.
+        ModelManagement.cache_dir = cache_dir if cache_dir
+        @model_info = resolve_model_info(model_name)
+        @model_dir = retrieve_model(model_name, show_progress:)
+      end
+      @dim = @model_info.dim
+      setup_model
+    end
+    # Generate embeddings for images
+    #
+    # @param images [Array<String>, String] Image path(s) or URL(s) to embed
+    # @param batch_size [Integer] Number of images to process at once (must be positive)
+    # @yield [Progress] Optional progress callback called after each batch
+    # @return [Enumerator] Enumerator yielding one embedding vector per image;
+    #   batches are computed as the enumerator is consumed
+    # @raise [ArgumentError] if images is non-empty and batch_size is not a positive Integer
+    def embed(images, batch_size: 32, &progress_callback)
+      images = [images] if images.is_a?(String)
+      # Empty input short-circuits before batch_size is inspected, as it did before validation was added.
+      return Enumerator.new { |_| } if images.empty?
+      # Reject bad sizes up front: 0 used to surface as FloatDomainError from the
+      # ceil below, and a negative size only failed once the enumerator was consumed.
+      unless batch_size.is_a?(Integer) && batch_size.positive?
+        raise ArgumentError, "batch_size must be a positive Integer, got #{batch_size.inspect}"
+      end
+      total_batches = (images.length.to_f / batch_size).ceil
+      Enumerator.new do |yielder|
+        images.each_slice(batch_size).with_index(1) do |batch, batch_num|
+          compute_embeddings(batch).each { |emb| yielder << emb }
+          next unless progress_callback
+          # Report progress only after the whole batch has been yielded.
+          progress_callback.call(Progress.new(current: batch_num, total: total_batches, batch_size: batch_size))
+        end
+      end
+    end
+    # Run #embed inside an Async::Future and collect every vector into an array
+    #
+    # @param images [Array<String>, String] Image path(s) or URL(s) to embed
+    # @param batch_size [Integer] Number of images to process at once
+    # @return [Async::Future] Future that resolves to array of embedding vectors
+    def embed_async(images, batch_size: 32)
+      Async::Future.new do
+        embed(images, batch_size:).to_a
+      end
+    end
+    # Describe every entry of SUPPORTED_IMAGE_MODELS
+    #
+    # @return [Array<Hash>] One attribute hash (ImageModelInfo#to_h) per supported model
+    def self.list_supported_models
+      SUPPORTED_IMAGE_MODELS.each_value.map(&:to_h)
+    end
+    private
+    # Load mini_magick on demand, converting a LoadError into a Fastembed Error
+    # that tells the user which gem to add.
+    def require_mini_magick!
+      require 'mini_magick'
+    rescue LoadError
+      message = 'Image embedding requires the mini_magick gem. Add it to your Gemfile: gem "mini_magick"'
+      raise Error, message
+    end
+    # Look up a built-in image model by name.
+    # Raises Error when the name is not in SUPPORTED_IMAGE_MODELS.
+    def resolve_model_info(model_name)
+      SUPPORTED_IMAGE_MODELS.fetch(model_name) do
+        raise Error, "Unknown image model: #{model_name}"
+      end
+    end
+    # Point the instance at an existing model directory instead of downloading.
+    # A known model name reuses its registry entry; any other name gets a
+    # generic local model description.
+    def initialize_from_local(local_model_dir:, model_name:, model_file:)
+      unless Dir.exist?(local_model_dir)
+        raise ArgumentError, "Local model directory not found: #{local_model_dir}"
+      end
+      @model_dir = local_model_dir
+      known_info = SUPPORTED_IMAGE_MODELS[model_name]
+      @model_info = known_info || create_local_model_info(model_name:, model_file:)
+    end
+    # Build a placeholder ImageModelInfo for a local model that is not in the registry.
+    # dim is a fixed 512 (the default CLIP dimension); it is not read from the model file.
+    def create_local_model_info(model_name:, model_file:)
+      ImageModelInfo.new(
+        model_name:,
+        dim: 512,
+        description: 'Local image model',
+        size_in_gb: 0,
+        sources: {},
+        model_file: model_file || 'model.onnx',
+        image_size: 224
+      )
+    end
+    # Delegate to ModelManagement.retrieve_model, passing this instance's model info.
+    def retrieve_model(model_name, show_progress:)
+      ModelManagement.retrieve_model(model_name, model_info: @model_info, show_progress:)
+    end
+    # Create the ONNX Runtime session for the resolved model file.
+    # Raises Error when the file does not exist in the model directory.
+    def setup_model
+      # An explicit model_file passed to the constructor wins over the registry value.
+      model_path = File.join(@model_dir, @model_file_override || @model_info.model_file)
+      raise Error, "Model file not found: #{model_path}" unless File.exist?(model_path)
+      # The same thread count is applied to both inter-op and intra-op settings.
+      thread_options = @threads ? { inter_op_num_threads: @threads, intra_op_num_threads: @threads } : {}
+      @session = OnnxRuntime::InferenceSession.new(model_path, **thread_options, providers: @providers)
+    end
+    # Preprocess a batch of images, run it through the session, and L2-normalize
+    # each resulting vector.
+    def compute_embeddings(image_paths)
+      # One [channels][height][width] nested array per image; the outer array is the batch.
+      batch = image_paths.map { |path| preprocess_image(path) }
+      # The batch is fed to the model's first declared input.
+      input_name = @session.inputs.first[:name]
+      # Only the first output of the session is used.
+      raw_embeddings = @session.run(nil, { input_name => batch }).first
+      raw_embeddings.map { |vector| normalize_embedding(vector) }
+    end
+    # Turn one image path or URL into a normalized [channels][height][width] nested array.
+    def preprocess_image(image_path)
+      image = load_image(image_path)
+      side = @model_info.image_size
+      # The trailing "!" in the geometry requests exactly side x side (aspect ratio is not kept).
+      image.resize("#{side}x#{side}!")
+      normalize_pixels(extract_pixels(image))
+    end
+    # Open an image with MiniMagick.
+    # http:// and https:// paths skip the existence check; any other path must
+    # exist on disk or Error is raised.
+    def load_image(path)
+      remote = path.start_with?('http://', 'https://')
+      raise Error, "Image file not found: #{path}" unless remote || File.exist?(path)
+      MiniMagick::Image.open(path)
+    end
+    # Export the image as raw 8-bit RGB and return the bytes as integers in 0..255.
+    def extract_pixels(image)
+      # "-depth 8" asks for 8 bits per channel; "RGB:-" writes raw RGB samples to stdout.
+      raw_rgb = image.run_command('convert', image.path, '-depth', '8', 'RGB:-')
+      raw_rgb.bytes
+    end
+    # Convert a flat, interleaved RGB byte array into a normalized
+    # [channel][row][column] nested array.
+    #
+    # @param pixels [Array<Integer>] image_size * image_size * 3 values in 0..255,
+    #   ordered row by row with the three channels of each pixel adjacent
+    # @return [Array<Array<Array<Float>>>] Per-channel planes of (value / 255 - mean) / std
+    # @raise [Error] if the number of values does not match the model's image size
+    def normalize_pixels(pixels)
+      size = @model_info.image_size
+      mean = @model_info.mean
+      std = @model_info.std
+      channels = 3
+      # A wrong count would otherwise index past the tensor (NoMethodError on nil)
+      # or leave nil cells in it, so fail here with a readable message instead.
+      expected = size * size * channels
+      unless pixels.length == expected
+        raise Error, "Unexpected pixel count: got #{pixels.length}, expected #{expected} (#{size}x#{size} RGB)"
+      end
+      tensor = Array.new(channels) { Array.new(size) { Array.new(size) } }
+      pixels.each_slice(channels).with_index do |rgb, pixel_index|
+        row, col = pixel_index.divmod(size)
+        rgb.each_with_index do |value, c|
+          # Scale to [0, 1], then standardize with the per-channel statistics.
+          tensor[c][row][col] = ((value / 255.0) - mean[c]) / std[c]
+        end
+      end
+      tensor
+    end
+    # Scale an embedding to unit L2 length.
+    # A nested array (first element is itself an array) is flattened first;
+    # a zero-length vector is returned unchanged.
+    def normalize_embedding(embedding)
+      nested = embedding.is_a?(Array) && embedding.first.is_a?(Array)
+      vector = nested ? embedding.flatten : embedding
+      magnitude = Math.sqrt(vector.sum { |component| component * component })
+      magnitude.zero? ? vector : vector.map { |component| component / magnitude }
+    end
+  end
+end