RubyGems - fastembed - Versions diffs - 1.0.0 → 1.1.0 - Mend

fastembed 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)

checksums.yaml +4 -4
data/.rubocop.yml +1 -0
data/.yardopts +6 -0
data/BENCHMARKS.md +124 -1
data/CHANGELOG.md +14 -0
data/README.md +395 -74
data/benchmark/compare_all.rb +167 -0
data/benchmark/compare_python.py +60 -0
data/benchmark/memory_profile.rb +70 -0
data/benchmark/profile.rb +198 -0
data/benchmark/reranker_benchmark.rb +158 -0
data/exe/fastembed +6 -0
data/fastembed.gemspec +3 -0
data/lib/fastembed/async.rb +193 -0
data/lib/fastembed/base_model.rb +247 -0
data/lib/fastembed/base_model_info.rb +61 -0
data/lib/fastembed/cli.rb +745 -0
data/lib/fastembed/custom_model_registry.rb +255 -0
data/lib/fastembed/image_embedding.rb +313 -0
data/lib/fastembed/late_interaction_embedding.rb +260 -0
data/lib/fastembed/late_interaction_model_info.rb +91 -0
data/lib/fastembed/model_info.rb +59 -19
data/lib/fastembed/model_management.rb +82 -23
data/lib/fastembed/onnx_embedding_model.rb +25 -4
data/lib/fastembed/pooling.rb +39 -3
data/lib/fastembed/progress.rb +52 -0
data/lib/fastembed/quantization.rb +75 -0
data/lib/fastembed/reranker_model_info.rb +91 -0
data/lib/fastembed/sparse_embedding.rb +261 -0
data/lib/fastembed/sparse_model_info.rb +80 -0
data/lib/fastembed/text_cross_encoder.rb +217 -0
data/lib/fastembed/text_embedding.rb +161 -28
data/lib/fastembed/validators.rb +59 -0
data/lib/fastembed/version.rb +1 -1
data/lib/fastembed.rb +42 -1
data/plan.md +257 -0
data/scripts/verify_models.rb +229 -0
metadata +70 -3

data/lib/fastembed/late_interaction_embedding.rb ADDED Viewed

@@ -0,0 +1,260 @@
+# frozen_string_literal: true
+module Fastembed
+  # Represents a late interaction (ColBERT-style) embedding
+  # Contains multiple token-level embeddings instead of a single vector
+  class LateInteractionEmbedding
+    attr_reader :embeddings, :token_count
+    # @param embeddings [Array<Array<Float>>] Token-level embeddings
+    def initialize(embeddings)
+      @embeddings = embeddings
+      @token_count = embeddings.length
+    end
+    # Get embedding dimension
+    # @return [Integer] Dimension of each token embedding
+    def dim
+      @embeddings.first&.length || 0
+    end
+    # Compute MaxSim score against another late interaction embedding
+    # This is the core ColBERT scoring mechanism
+    # @param other [LateInteractionEmbedding] Document embedding to score against
+    # @return [Float] MaxSim relevance score
+    def max_sim(other)
+      return 0.0 if embeddings.empty? || other.embeddings.empty?
+      # For each query token, find max similarity with any document token
+      embeddings.sum do |query_vec|
+        other.embeddings.map do |doc_vec|
+          dot_product(query_vec, doc_vec)
+        end.max
+      end
+    end
+    def to_a
+      @embeddings
+    end
+    def to_s
+      "LateInteractionEmbedding(tokens=#{token_count}, dim=#{dim})"
+    end
+    def inspect
+      to_s
+    end
+    private
+    def dot_product(a, b)
+      a.zip(b).sum { |x, y| x * y }
+    end
+  end
+  # Late interaction text embedding using ColBERT-style models
+  #
+  # Unlike standard embeddings that produce one vector per document,
+  # late interaction models produce one vector per token. This enables
+  # more fine-grained matching using MaxSim scoring.
+  #
+  # @example Basic usage
+  #   model = Fastembed::LateInteractionTextEmbedding.new
+  #   query_emb = model.query_embed("What is ML?").first
+  #   doc_emb = model.embed("Machine learning is...").first
+  #   score = query_emb.max_sim(doc_emb)
+  #
+  class LateInteractionTextEmbedding
+    include BaseModel
+    attr_reader :dim
+    # Initialize a late interaction embedding model
+    #
+    # @param model_name [String] Name of the model to use
+    # @param cache_dir [String, nil] Custom cache directory for models
+    # @param threads [Integer, nil] Number of threads for ONNX Runtime
+    # @param providers [Array<String>, nil] ONNX execution providers
+    # @param show_progress [Boolean] Whether to show download progress
+    # @param quantization [Symbol] Quantization type (:fp32, :fp16, :int8, :uint8, :q4)
+    # @param local_model_dir [String, nil] Load model from local directory instead of downloading
+    # @param model_file [String, nil] Override model file name (e.g., "model.onnx")
+    # @param tokenizer_file [String, nil] Override tokenizer file name (e.g., "tokenizer.json")
+    def initialize(
+      model_name: DEFAULT_LATE_INTERACTION_MODEL,
+      cache_dir: nil,
+      threads: nil,
+      providers: nil,
+      show_progress: true,
+      quantization: nil,
+      local_model_dir: nil,
+      model_file: nil,
+      tokenizer_file: nil
+    )
+      if local_model_dir
+        initialize_from_local(
+          local_model_dir: local_model_dir,
+          model_name: model_name,
+          threads: threads,
+          providers: providers,
+          quantization: quantization,
+          model_file: model_file,
+          tokenizer_file: tokenizer_file
+        )
+      else
+        initialize_model(
+          model_name: model_name,
+          cache_dir: cache_dir,
+          threads: threads,
+          providers: providers,
+          show_progress: show_progress,
+          quantization: quantization
+        )
+      end
+      @dim = @model_info.dim
+      setup_model_and_tokenizer(model_file_override: model_file || quantized_model_file)
+    end
+    # Generate late interaction embeddings for documents
+    #
+    # @param documents [Array<String>, String] Text document(s) to embed
+    # @param batch_size [Integer] Number of documents to process at once
+    # @yield [Progress] Optional progress callback called after each batch
+    # @return [Enumerator] Lazy enumerator yielding LateInteractionEmbedding objects
+    def embed(documents, batch_size: 32, &progress_callback)
+      documents = Validators.validate_documents!(documents)
+      return Enumerator.new { |_| } if documents.empty?
+      total_batches = (documents.length.to_f / batch_size).ceil
+      Enumerator.new do |yielder|
+        documents.each_slice(batch_size).with_index(1) do |batch, batch_num|
+          embeddings = compute_embeddings(batch)
+          embeddings.each { |emb| yielder << emb }
+          if progress_callback
+            progress = Progress.new(current: batch_num, total: total_batches, batch_size: batch_size)
+            progress_callback.call(progress)
+          end
+        end
+      end
+    end
+    # Generate late interaction embeddings for queries
+    # Queries typically use a special prefix for asymmetric retrieval
+    #
+    # @param queries [Array<String>, String] Query text(s) to embed
+    # @param batch_size [Integer] Number of queries to process at once
+    # @return [Enumerator] Lazy enumerator yielding LateInteractionEmbedding objects
+    def query_embed(queries, batch_size: 32)
+      queries = [queries] if queries.is_a?(String)
+      # ColBERT uses [Q] marker for queries
+      prefixed = queries.map { |q| "[Q] #{q}" }
+      embed(prefixed, batch_size: batch_size)
+    end
+    # Generate late interaction embeddings for passages/documents
+    #
+    # @param passages [Array<String>, String] Passage text(s) to embed
+    # @param batch_size [Integer] Number of passages to process at once
+    # @return [Enumerator] Lazy enumerator yielding LateInteractionEmbedding objects
+    def passage_embed(passages, batch_size: 32)
+      passages = [passages] if passages.is_a?(String)
+      # ColBERT uses [D] marker for documents
+      prefixed = passages.map { |p| "[D] #{p}" }
+      embed(prefixed, batch_size: batch_size)
+    end
+    # Generate embeddings asynchronously
+    #
+    # @param documents [Array<String>, String] Text document(s) to embed
+    # @param batch_size [Integer] Number of documents to process at once
+    # @return [Async::Future] Future that resolves to array of LateInteractionEmbedding objects
+    def embed_async(documents, batch_size: 32)
+      Async::Future.new { embed(documents, batch_size: batch_size).to_a }
+    end
+    # Generate query embeddings asynchronously
+    #
+    # @param queries [Array<String>, String] Query text(s) to embed
+    # @param batch_size [Integer] Number of queries to process at once
+    # @return [Async::Future] Future that resolves to array of LateInteractionEmbedding objects
+    def query_embed_async(queries, batch_size: 32)
+      Async::Future.new { query_embed(queries, batch_size: batch_size).to_a }
+    end
+    # Generate passage embeddings asynchronously
+    #
+    # @param passages [Array<String>, String] Passage text(s) to embed
+    # @param batch_size [Integer] Number of passages to process at once
+    # @return [Async::Future] Future that resolves to array of LateInteractionEmbedding objects
+    def passage_embed_async(passages, batch_size: 32)
+      Async::Future.new { passage_embed(passages, batch_size: batch_size).to_a }
+    end
+    # List all supported late interaction models
+    #
+    # @return [Array<Hash>] Array of model information hashes
+    def self.list_supported_models
+      SUPPORTED_LATE_INTERACTION_MODELS.values.map(&:to_h)
+    end
+    private
+    # Look up model info by name, preferring built-in models over custom ones
+    # Raises Error when the name is in neither registry
+    def resolve_model_info(model_name)
+      # The custom registry is consulted only when the built-in lookup misses
+      known = SUPPORTED_LATE_INTERACTION_MODELS[model_name] ||
+              CustomModelRegistry.late_interaction_models[model_name]
+      return known if known
+      raise Error, "Unknown late interaction model: #{model_name}"
+    end
+    def create_local_model_info(model_name:, model_file:, tokenizer_file:)
+      LateInteractionModelInfo.new(
+        model_name: model_name,
+        description: 'Local late interaction model',
+        size_in_gb: 0,
+        sources: {},
+        model_file: model_file || 'model.onnx',
+        tokenizer_file: tokenizer_file || 'tokenizer.json',
+        dim: 128 # Default ColBERT dimension
+      )
+    end
+    def compute_embeddings(texts)
+      prepared = tokenize_and_prepare(texts)
+      outputs = @session.run(nil, prepared[:inputs])
+      token_embeddings = extract_token_embeddings(outputs)
+      # Create LateInteractionEmbedding for each document
+      texts.length.times.map do |i|
+        # Filter out padding tokens using attention mask
+        valid_embeddings = []
+        token_embeddings[i].each_with_index do |emb, j|
+          valid_embeddings << normalize_vector(emb) if prepared[:attention_mask][i][j] == 1
+        end
+        LateInteractionEmbedding.new(valid_embeddings)
+      end
+    end
+    def extract_token_embeddings(outputs)
+      if outputs.is_a?(Hash)
+        outputs['last_hidden_state'] || outputs['token_embeddings'] || outputs.values.first
+      else
+        outputs.first
+      end
+    end
+    def normalize_vector(vec)
+      norm = Math.sqrt(vec.sum { |x| x * x })
+      return vec if norm.zero?
+      vec.map { |x| x / norm }
+    end
+  end
+end

data/lib/fastembed/late_interaction_model_info.rb ADDED Viewed

@@ -0,0 +1,91 @@
+# frozen_string_literal: true
+module Fastembed
+  # Model information for late interaction models (ColBERT, etc.)
+  #
+  # Late interaction models produce token-level embeddings instead of a single
+  # document vector, enabling fine-grained matching via MaxSim scoring.
+  #
+  # @example Access late interaction model info
+  #   info = Fastembed::SUPPORTED_LATE_INTERACTION_MODELS['colbert-ir/colbertv2.0']
+  #   info.dim  # => 128
+  #
+  class LateInteractionModelInfo
+    include BaseModelInfo
+    # @!attribute [r] dim
+    #   @return [Integer] Output embedding dimension per token
+    attr_reader :dim
+    # Create a new LateInteractionModelInfo instance
+    #
+    # @param model_name [String] Full model identifier
+    # @param dim [Integer] Output embedding dimension per token
+    # @param description [String] Human-readable description
+    # @param size_in_gb [Float] Model size in GB
+    # @param sources [Hash] Source repositories
+    # @param model_file [String] Path to ONNX model file
+    # @param tokenizer_file [String] Path to tokenizer file
+    # @param max_length [Integer] Maximum sequence length
+    def initialize(
+      model_name:,
+      dim:,
+      description:,
+      size_in_gb:,
+      sources:,
+      model_file: 'onnx/model.onnx',
+      tokenizer_file: 'tokenizer.json',
+      max_length: 512
+    )
+      initialize_base(
+        model_name: model_name,
+        description: description,
+        size_in_gb: size_in_gb,
+        sources: sources,
+        model_file: model_file,
+        tokenizer_file: tokenizer_file,
+        max_length: max_length
+      )
+      @dim = dim
+    end
+    # Convert to hash representation
+    # @return [Hash] Model info as a hash
+    def to_h
+      {
+        model_name: model_name,
+        dim: dim,
+        description: description,
+        size_in_gb: size_in_gb,
+        sources: sources,
+        model_file: model_file,
+        tokenizer_file: tokenizer_file,
+        max_length: max_length
+      }
+    end
+  end
+  # Registry of supported late interaction models
+  SUPPORTED_LATE_INTERACTION_MODELS = {
+    'colbert-ir/colbertv2.0' => LateInteractionModelInfo.new(
+      model_name: 'colbert-ir/colbertv2.0',
+      dim: 128,
+      description: 'ColBERTv2 for late interaction retrieval',
+      size_in_gb: 0.44,
+      sources: { hf: 'colbert-ir/colbertv2.0' },
+      model_file: 'model.onnx',
+      max_length: 512
+    ),
+    'jinaai/jina-colbert-v1-en' => LateInteractionModelInfo.new(
+      model_name: 'jinaai/jina-colbert-v1-en',
+      dim: 768,
+      description: 'Jina ColBERT v1 for English with 8192 context',
+      size_in_gb: 0.43,
+      sources: { hf: 'onnx-models/jina-colbert-v1-en-onnx' },
+      model_file: 'model.onnx',
+      max_length: 8192
+    )
+  }.freeze
+  DEFAULT_LATE_INTERACTION_MODEL = 'colbert-ir/colbertv2.0'
+end

data/lib/fastembed/model_info.rb CHANGED Viewed

@@ -1,11 +1,41 @@
 # frozen_string_literal: true
 module Fastembed
-  # Model information structure
+  # Model information for dense embedding models
+  #
+  # Stores metadata and configuration for ONNX embedding models including
+  # output dimensions, pooling strategy, and normalization settings.
+  #
+  # @example Access model info
+  #   info = Fastembed::SUPPORTED_MODELS['BAAI/bge-small-en-v1.5']
+  #   info.dim        # => 384
+  #   info.pooling    # => :mean
+  #   info.normalize  # => true
+  #
   class ModelInfo
-    attr_reader :model_name, :dim, :description, :size_in_gb, :model_file,
-                :tokenizer_file, :sources, :pooling, :normalize
+    include BaseModelInfo
+    # @!attribute [r] dim
+    #   @return [Integer] Output embedding dimension
+    # @!attribute [r] pooling
+    #   @return [Symbol] Pooling strategy (:mean or :cls)
+    # @!attribute [r] normalize
+    #   @return [Boolean] Whether to L2 normalize output embeddings
+    attr_reader :dim, :pooling, :normalize
+    # Create a new ModelInfo instance
+    #
+    # @param model_name [String] Full model identifier
+    # @param dim [Integer] Output embedding dimension
+    # @param description [String] Human-readable description
+    # @param size_in_gb [Float] Model size in GB
+    # @param sources [Hash] Source repositories
+    # @param model_file [String] Path to ONNX model file
+    # @param tokenizer_file [String] Path to tokenizer file
+    # @param pooling [Symbol] Pooling strategy (:mean or :cls)
+    # @param normalize [Boolean] Whether to L2 normalize outputs
+    # @param max_length [Integer] Maximum sequence length
+    # @raise [ArgumentError] If pooling strategy is invalid
     def initialize(
       model_name:,
       dim:,
@@ -15,23 +45,28 @@ module Fastembed
       model_file: 'model.onnx',
       tokenizer_file: 'tokenizer.json',
       pooling: :mean,
-      normalize: true
+      normalize: true,
+      max_length: BaseModelInfo::DEFAULT_MAX_LENGTH
     )
-      @model_name = model_name
+      unless Pooling.valid?(pooling)
+        valid = Pooling::VALID_STRATEGIES.join(', ')
+        raise ArgumentError, "Invalid pooling strategy: #{pooling}. Valid strategies: #{valid}"
+      end
+      initialize_base(
+        model_name: model_name,
+        description: description,
+        size_in_gb: size_in_gb,
+        sources: sources,
+        model_file: model_file,
+        tokenizer_file: tokenizer_file,
+        max_length: max_length
+      )
       @dim = dim
-      @description = description
-      @size_in_gb = size_in_gb
-      @sources = sources
-      @model_file = model_file
-      @tokenizer_file = tokenizer_file
       @pooling = pooling
       @normalize = normalize
     end
-    def hf_repo
-      sources[:hf]
-    end
     def to_h
       {
         model_name: model_name,
@@ -42,7 +77,8 @@ module Fastembed
         model_file: model_file,
         tokenizer_file: tokenizer_file,
         pooling: pooling,
-        normalize: normalize
+        normalize: normalize,
+        max_length: max_length
       }
     end
   end
@@ -103,7 +139,8 @@ module Fastembed
       description: 'Long context (8192 tokens) English embedding model',
       size_in_gb: 0.52,
       sources: { hf: 'nomic-ai/nomic-embed-text-v1' },
-      model_file: 'onnx/model.onnx'
+      model_file: 'onnx/model.onnx',
+      max_length: 8192
     ),
     'nomic-ai/nomic-embed-text-v1.5' => ModelInfo.new(
       model_name: 'nomic-ai/nomic-embed-text-v1.5',
@@ -111,7 +148,8 @@ module Fastembed
       description: 'Improved long context embedding with Matryoshka support',
       size_in_gb: 0.52,
       sources: { hf: 'nomic-ai/nomic-embed-text-v1.5' },
-      model_file: 'onnx/model.onnx'
+      model_file: 'onnx/model.onnx',
+      max_length: 8192
     ),
     'jinaai/jina-embeddings-v2-small-en' => ModelInfo.new(
       model_name: 'jinaai/jina-embeddings-v2-small-en',
@@ -119,7 +157,8 @@ module Fastembed
       description: 'Small English embedding with 8192 token context',
       size_in_gb: 0.06,
       sources: { hf: 'Xenova/jina-embeddings-v2-small-en' },
-      model_file: 'onnx/model.onnx'
+      model_file: 'onnx/model.onnx',
+      max_length: 8192
     ),
     'jinaai/jina-embeddings-v2-base-en' => ModelInfo.new(
       model_name: 'jinaai/jina-embeddings-v2-base-en',
@@ -127,7 +166,8 @@ module Fastembed
       description: 'Base English embedding with 8192 token context',
       size_in_gb: 0.52,
       sources: { hf: 'Xenova/jina-embeddings-v2-base-en' },
-      model_file: 'onnx/model.onnx'
+      model_file: 'onnx/model.onnx',
+      max_length: 8192
     ),
     'sentence-transformers/paraphrase-MiniLM-L6-v2' => ModelInfo.new(
       model_name: 'sentence-transformers/paraphrase-MiniLM-L6-v2',

data/lib/fastembed/model_management.rb CHANGED Viewed

@@ -6,9 +6,24 @@ require 'json'
 require 'fileutils'
 module Fastembed
-  # Handles model downloading and caching
+  # Handles model downloading and caching from HuggingFace
+  #
+  # Downloads ONNX models and tokenizer files from HuggingFace repositories,
+  # caching them locally for subsequent use. Supports custom cache directories
+  # via environment variables.
+  #
+  # @example Check cache location
+  #   Fastembed::ModelManagement.cache_dir
+  #   # => "/home/user/.cache/fastembed"
+  #
+  # @example Use custom cache directory
+  #   Fastembed::ModelManagement.cache_dir = "/custom/path"
+  #
   module ModelManagement
+    # Base URL for HuggingFace API
     HF_API_BASE = 'https://huggingface.co'
+    # Files required for model operation (in addition to model.onnx and tokenizer.json)
     REQUIRED_FILES = %w[
       config.json
       tokenizer.json
@@ -18,7 +33,13 @@ module Fastembed
     class << self
       # Returns the cache directory for storing models
-      # Priority: FASTEMBED_CACHE_PATH > XDG_CACHE_HOME > ~/.cache
+      #
+      # Priority order:
+      # 1. FASTEMBED_CACHE_PATH environment variable
+      # 2. XDG_CACHE_HOME environment variable
+      # 3. ~/.cache (fallback)
+      #
+      # @return [String] Absolute path to cache directory
       def cache_dir
         @cache_dir ||= begin
           base = ENV['FASTEMBED_CACHE_PATH'] ||
@@ -29,11 +50,22 @@ module Fastembed
       end
       # Set a custom cache directory
+      # @!attribute [w] cache_dir
+      # @return [String] Path to use as cache directory
       attr_writer :cache_dir
       # Returns the path to a cached model, downloading if necessary
-      def retrieve_model(model_name, show_progress: true)
-        model_info = resolve_model_info(model_name)
+      #
+      # Downloads the model from HuggingFace if not already cached.
+      # The model directory will contain the ONNX model file and tokenizer.
+      #
+      # @param model_name [String] Name of the model (e.g., "BAAI/bge-small-en-v1.5")
+      # @param model_info [BaseModelInfo, nil] Optional pre-resolved model info
+      # @param show_progress [Boolean] Whether to print download progress
+      # @return [String] Absolute path to the model directory
+      # @raise [DownloadError] If the download fails
+      def retrieve_model(model_name, model_info: nil, show_progress: true)
+        model_info ||= resolve_model_info(model_name)
         model_dir = model_directory(model_info)
         # Check if model is already cached
@@ -45,6 +77,10 @@ module Fastembed
       end
       # Check if a model exists in cache
+      #
+      # @param model_dir [String] Path to model directory
+      # @param model_info [BaseModelInfo] Model info with required file paths
+      # @return [Boolean] True if model files exist
       def model_cached?(model_dir, model_info)
         return false unless Dir.exist?(model_dir)
@@ -56,21 +92,33 @@ module Fastembed
       end
       # Get the directory path for a model
+      #
+      # @param model_info [BaseModelInfo] Model info
+      # @return [String] Path where model should be stored
       def model_directory(model_info)
         # Create a safe directory name from the model name
         safe_name = model_info.model_name.gsub('/', '--')
         File.join(cache_dir, 'models', safe_name)
       end
-      # Resolve model name to ModelInfo
+      # Resolve model name to ModelInfo from registry
+      #
+      # Checks both built-in registry and custom model registry.
+      #
+      # @param model_name [String] Model name to look up
+      # @return [ModelInfo] The model information
+      # @raise [ArgumentError] If model is not found in any registry
       def resolve_model_info(model_name)
+        # Check built-in registry first
         model_info = SUPPORTED_MODELS[model_name]
-        unless model_info
-          raise ArgumentError,
-                "Unknown model: #{model_name}. Use TextEmbedding.list_supported_models to see available models."
-        end
+        return model_info if model_info
-        model_info
+        # Check custom registry
+        model_info = CustomModelRegistry.embedding_models[model_name]
+        return model_info if model_info
+        raise ArgumentError,
+              "Unknown model: #{model_name}. Use TextEmbedding.list_supported_models to see available models."
       end
       private
@@ -84,6 +132,11 @@ module Fastembed
         # Download model file
         download_file(repo_id, model_info.model_file, model_dir, show_progress: show_progress)
+        # Some large models store weights in a separate .onnx_data file
+        # Try to download it if it exists (not required)
+        data_file = "#{model_info.model_file}_data"
+        download_file(repo_id, data_file, model_dir, show_progress: show_progress, required: false)
         # Download tokenizer and config files
         files_to_download = REQUIRED_FILES + [model_info.tokenizer_file]
         files_to_download.uniq.each do |file|
@@ -129,23 +182,29 @@ module Fastembed
           raise DownloadError, "Invalid URL scheme: #{url}"
         end
-        Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https', read_timeout: 300,
+        # Use longer timeout for large files
+        Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https', read_timeout: 600,
                                             open_timeout: 30) do |http|
           request = Net::HTTP::Get.new(uri)
           request['User-Agent'] = "fastembed-ruby/#{VERSION}"
-          response = http.request(request)
-          case response
-          when Net::HTTPSuccess
-            File.binwrite(local_path, response.body)
-          when Net::HTTPRedirection
-            new_url = response['location']
-            # Handle relative redirects
-            new_url = "#{uri.scheme}://#{uri.host}#{new_url}" if new_url.start_with?('/')
-            download_with_redirect(new_url, local_path, show_progress: show_progress, max_redirects: max_redirects - 1)
-          else
-            raise DownloadError, "HTTP #{response.code}: #{response.message}"
+          http.request(request) do |response|
+            case response
+            when Net::HTTPSuccess
+              # Stream to file to handle large files without loading into memory
+              File.open(local_path, 'wb') do |file|
+                response.read_body do |chunk|
+                  file.write(chunk)
+                end
+              end
+            when Net::HTTPRedirection
+              new_url = response['location']
+              # Handle relative redirects
+              new_url = "#{uri.scheme}://#{uri.host}#{new_url}" if new_url.start_with?('/')
+              download_with_redirect(new_url, local_path, show_progress: show_progress, max_redirects: max_redirects - 1)
+            else
+              raise DownloadError, "HTTP #{response.code}: #{response.message}"
+            end
           end
         end
       end