RubyGems - woods - Versions diffs - 1.2.0 → 1.3.0 - Mend

woods 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (107) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +169 -0
data/README.md +20 -8
data/exe/woods-console +51 -6
data/exe/woods-console-mcp +24 -4
data/exe/woods-mcp +30 -7
data/exe/woods-mcp-http +47 -6
data/lib/generators/woods/install_generator.rb +13 -4
data/lib/generators/woods/templates/woods.rb.tt +155 -0
data/lib/tasks/woods.rake +15 -50
data/lib/woods/builder.rb +174 -9
data/lib/woods/cache/cache_middleware.rb +360 -31
data/lib/woods/chunking/semantic_chunker.rb +334 -7
data/lib/woods/console/adapters/job_adapter.rb +10 -4
data/lib/woods/console/audit_logger.rb +76 -4
data/lib/woods/console/bridge.rb +48 -15
data/lib/woods/console/bridge_protocol.rb +44 -0
data/lib/woods/console/confirmation.rb +3 -4
data/lib/woods/console/console_response_renderer.rb +56 -18
data/lib/woods/console/credential_index.rb +201 -0
data/lib/woods/console/credential_scanner.rb +302 -0
data/lib/woods/console/dispatch_pipeline.rb +138 -0
data/lib/woods/console/embedded_executor.rb +682 -35
data/lib/woods/console/eval_guard.rb +319 -0
data/lib/woods/console/model_validator.rb +1 -3
data/lib/woods/console/rack_middleware.rb +185 -29
data/lib/woods/console/redactor.rb +161 -0
data/lib/woods/console/response_context.rb +127 -0
data/lib/woods/console/safe_context.rb +220 -23
data/lib/woods/console/scope_predicate_parser.rb +131 -0
data/lib/woods/console/server.rb +417 -486
data/lib/woods/console/sql_noise_stripper.rb +87 -0
data/lib/woods/console/sql_table_scanner.rb +213 -0
data/lib/woods/console/sql_validator.rb +81 -31
data/lib/woods/console/table_gate.rb +93 -0
data/lib/woods/console/tool_specs.rb +552 -0
data/lib/woods/console/tools/tier1.rb +3 -3
data/lib/woods/console/tools/tier4.rb +7 -1
data/lib/woods/dependency_graph.rb +66 -7
data/lib/woods/embedding/indexer.rb +190 -6
data/lib/woods/embedding/openai.rb +40 -4
data/lib/woods/embedding/provider.rb +104 -8
data/lib/woods/embedding/text_preparer.rb +23 -3
data/lib/woods/embedding/token_counter.rb +133 -0
data/lib/woods/evaluation/baseline_runner.rb +20 -2
data/lib/woods/evaluation/metrics.rb +4 -1
data/lib/woods/extracted_unit.rb +1 -0
data/lib/woods/extractor.rb +7 -1
data/lib/woods/extractors/controller_extractor.rb +6 -0
data/lib/woods/extractors/mailer_extractor.rb +16 -2
data/lib/woods/extractors/model_extractor.rb +6 -1
data/lib/woods/extractors/phlex_extractor.rb +13 -4
data/lib/woods/extractors/rails_source_extractor.rb +2 -0
data/lib/woods/extractors/route_helper_resolver.rb +130 -0
data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
data/lib/woods/extractors/view_component_extractor.rb +12 -1
data/lib/woods/extractors/view_engines/base.rb +141 -0
data/lib/woods/extractors/view_engines/erb.rb +145 -0
data/lib/woods/extractors/view_template_extractor.rb +92 -133
data/lib/woods/flow_assembler.rb +23 -15
data/lib/woods/flow_precomputer.rb +21 -2
data/lib/woods/graph_analyzer.rb +3 -4
data/lib/woods/index_artifact.rb +173 -0
data/lib/woods/mcp/bearer_auth.rb +45 -0
data/lib/woods/mcp/bootstrap_state.rb +94 -0
data/lib/woods/mcp/bootstrapper.rb +337 -16
data/lib/woods/mcp/config_resolver.rb +288 -0
data/lib/woods/mcp/errors.rb +134 -0
data/lib/woods/mcp/index_reader.rb +265 -30
data/lib/woods/mcp/origin_guard.rb +132 -0
data/lib/woods/mcp/provider_probe.rb +166 -0
data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
data/lib/woods/mcp/server.rb +737 -137
data/lib/woods/model_name_cache.rb +78 -2
data/lib/woods/notion/client.rb +25 -2
data/lib/woods/notion/mappers/model_mapper.rb +36 -2
data/lib/woods/railtie.rb +55 -15
data/lib/woods/resilience/circuit_breaker.rb +9 -2
data/lib/woods/resilience/retryable_provider.rb +40 -3
data/lib/woods/resolved_config.rb +299 -0
data/lib/woods/retrieval/context_assembler.rb +112 -5
data/lib/woods/retrieval/query_classifier.rb +1 -1
data/lib/woods/retrieval/ranker.rb +55 -6
data/lib/woods/retrieval/search_executor.rb +42 -13
data/lib/woods/retriever.rb +330 -24
data/lib/woods/session_tracer/middleware.rb +35 -1
data/lib/woods/storage/graph_store.rb +39 -0
data/lib/woods/storage/inapplicable_backend.rb +14 -0
data/lib/woods/storage/metadata_store.rb +129 -1
data/lib/woods/storage/pgvector.rb +70 -8
data/lib/woods/storage/qdrant.rb +196 -5
data/lib/woods/storage/snapshotter/metadata.rb +172 -0
data/lib/woods/storage/snapshotter/vector.rb +238 -0
data/lib/woods/storage/snapshotter.rb +24 -0
data/lib/woods/storage/vector_store.rb +184 -35
data/lib/woods/tasks.rb +85 -0
data/lib/woods/temporal/snapshot_store.rb +49 -1
data/lib/woods/token_utils.rb +44 -5
data/lib/woods/unblocked/client.rb +1 -1
data/lib/woods/unblocked/document_builder.rb +35 -10
data/lib/woods/unblocked/exporter.rb +1 -1
data/lib/woods/util/host_guard.rb +61 -0
data/lib/woods/version.rb +1 -1
data/lib/woods.rb +126 -6
metadata +69 -4

data/lib/woods/dependency_graph.rb CHANGED Viewed

@@ -23,8 +23,9 @@ module Woods
   class DependencyGraph
     def initialize
       @nodes = {}      # identifier => { type:, file_path: }
-      @edges = {}      # identifier => [dependency identifiers]
+      @edges = {}      # identifier => [{ target:, via: }]
       @reverse = {}    # identifier => Set of dependent identifiers
+      @reverse_via = {} # [target, via] => Set of dependent identifiers
       @file_map = {}   # file_path => identifier
       @type_index = {} # type => Set of identifiers
       @to_h = nil
@@ -42,7 +43,7 @@ module Woods
         namespace: unit.namespace
       }
-      @edges[unit.identifier] = unit.dependencies.map { |d| d[:target] }
+      @edges[unit.identifier] = unit.dependencies.map { |d| { target: d[:target], via: d[:via] } }
       @file_map[unit.file_path] = unit.identifier if unit.file_path
       # Type index for filtering (Set-based for O(1) insert)
@@ -51,6 +52,7 @@ module Woods
       # Build reverse edges (Set-based for O(1) insert)
       unit.dependencies.each do |dep|
         (@reverse[dep[:target]] ||= Set.new).add(unit.identifier)
+        (@reverse_via[[dep[:target], dep[:via]]] ||= Set.new).add(unit.identifier)
       end
     end
@@ -107,17 +109,28 @@ module Woods
     # Get direct dependencies of a unit
     #
     # @param identifier [String] Unit identifier
+    # @param via [Symbol, Array<Symbol>, nil] Filter by relationship type(s)
     # @return [Array<String>] List of dependency identifiers
-    def dependencies_of(identifier)
-      @edges[identifier] || []
+    def dependencies_of(identifier, via: nil)
+      edges = @edges[identifier] || []
+      if via
+        via_set = Array(via)
+        edges = edges.select { |e| via_set.include?(e[:via]) }
+      end
+      edges.map { |e| e[:target] }
     end
     # Get direct dependents of a unit (what depends on it)
     #
     # @param identifier [String] Unit identifier
+    # @param via [Symbol, Array<Symbol>, nil] Filter by relationship type(s)
     # @return [Array<String>] List of dependent identifiers
-    def dependents_of(identifier)
-      @reverse.fetch(identifier, Set.new).to_a
+    def dependents_of(identifier, via: nil)
+      return @reverse.fetch(identifier, Set.new).to_a unless via
+      Array(via).each_with_object(Set.new) do |v, result|
+        @reverse_via.fetch([identifier, v], Set.new).each { |dep| result.add(dep) }
+      end.to_a
     end
     # Get all units of a specific type
@@ -204,7 +217,8 @@ module Woods
       raw_nodes = data[:nodes] || data['nodes'] || {}
       graph.instance_variable_set(:@nodes, raw_nodes.transform_values { |v| symbolize_node(v) })
-      graph.instance_variable_set(:@edges, data[:edges] || data['edges'] || {})
+      raw_edges = data[:edges] || data['edges'] || {}
+      graph.instance_variable_set(:@edges, raw_edges.transform_values { |edges| normalize_edges(edges) })
       raw_reverse = data[:reverse] || data['reverse'] || {}
       graph.instance_variable_set(:@reverse, raw_reverse.transform_values { |v| v.is_a?(Set) ? v : Set.new(v) })
@@ -216,6 +230,15 @@ module Woods
         v.is_a?(Set) ? v : Set.new(v)
       end)
+      # Rebuild reverse_via index from edges
+      reverse_via = {}
+      graph.instance_variable_get(:@edges).each do |source_id, edges|
+        edges.each do |edge|
+          (reverse_via[[edge[:target], edge[:via]]] ||= Set.new).add(source_id)
+        end
+      end
+      graph.instance_variable_set(:@reverse_via, reverse_via)
       graph
     end
@@ -232,5 +255,41 @@ module Woods
         namespace: node[:namespace] || node['namespace']
       }
     end
+    # Normalize edge data from either old format (bare strings) or new format (hashes).
+    #
+    # ROUND-TRIP INVARIANT (do not break when refactoring):
+    #   DependencyGraph#to_h -> JSON.generate -> JSON.parse -> DependencyGraph.from_h
+    # must always yield the same in-memory shape. The two normalizers that
+    # sit at either end of this round trip are INTENTIONALLY SEPARATE — do
+    # not merge them:
+    #
+    # - This method ({.normalize_edges}) runs on Ruby objects. It produces
+    #   `{ target:, via: }` with SYMBOL keys because consumers
+    #   ({DependencyGraph#dependencies_of}, {GraphAnalyzer}) key on symbols.
+    # - {Woods::MCP::IndexReader.normalize_all_edges} runs on parsed JSON,
+    #   producing `{ 'target' => ..., 'via' => ... }` with STRING keys,
+    #   because the MCP tools serialize straight through to the client and
+    #   symbol keys would become `:target` on the wire.
+    #
+    # This method also accepts OLD-format bare-string edges so graphs
+    # serialized before the `{target, via}` migration still load without
+    # explicit data conversion.
+    #
+    # @param edges [Array] Edge entries — either strings or hashes
+    # @return [Array<Hash>] Normalized edges with :target and :via keys
+    def self.normalize_edges(edges)
+      return [] unless edges.is_a?(Array)
+      edges.map do |edge|
+        if edge.is_a?(String)
+          { target: edge, via: nil }
+        elsif edge.is_a?(Hash)
+          { target: edge[:target] || edge['target'], via: (edge[:via] || edge['via'])&.to_sym }
+        else
+          { target: edge.to_s, via: nil }
+        end
+      end
+    end
   end
 end

data/lib/woods/embedding/indexer.rb CHANGED Viewed

@@ -2,27 +2,65 @@
 require 'json'
 require 'digest'
+require 'fileutils'
+require_relative '../extracted_unit'
+require_relative '../chunking/semantic_chunker'
 module Woods
   module Embedding
     # Orchestrates the indexing pipeline: reads extracted units, prepares text,
     # generates embeddings, and stores vectors. Supports full and incremental
     # modes with checkpoint-based resumability.
-    class Indexer
+    #
+    # When the vector store is an in-memory adapter (responds to +#each_entry+
+    # and +#bulk_load+) and +output_dir+ is set, a successful {#index_all} run
+    # also persists the stores to disk via the Snapshotter pair and atomically
+    # flips the +dumps/latest+ pointer. Persistent backends (pgvector, Qdrant)
+    # see zero behaviour change — no Snapshotter is invoked.
+    class Indexer # rubocop:disable Metrics/ClassLength
+      # @param chunker [Chunking::SemanticChunker, nil] Splits oversize units
+      #   into semantically coherent chunks before embedding. +nil+ disables
+      #   chunking — units go to the provider whole (useful in tests).
       # @param checkpoint_interval [Integer] Save checkpoint every N batches (default: 10)
-      def initialize(provider:, text_preparer:, vector_store:, output_dir:, batch_size: 32, checkpoint_interval: 10) # rubocop:disable Metrics/ParameterLists
+      # @param metadata_store [#each_entry, #bulk_load, nil] Optional metadata store.
+      #   When present alongside an in-memory vector store, both are persisted
+      #   at the end of a successful {#index_all} run.
+      # @param resolved_config [Woods::ResolvedConfig, nil] Captured config for
+      #   +woods.json+ — written to +output_dir+ on {#index_all} completion.
+      # @param dump_retention_count [Integer] Number of completed dump directories
+      #   to keep under +output_dir/dumps/+. Older dumps are removed after a
+      #   successful {#index_all} run (default: 3).
+      def initialize(provider:, text_preparer:, vector_store:, output_dir:, # rubocop:disable Metrics/ParameterLists
+                     chunker: Chunking::SemanticChunker.new,
+                     batch_size: 32, checkpoint_interval: 10,
+                     metadata_store: nil,
+                     resolved_config: nil,
+                     dump_retention_count: 3)
         @provider = provider
         @text_preparer = text_preparer
         @vector_store = vector_store
         @output_dir = output_dir
+        @chunker = chunker
         @batch_size = batch_size
         @checkpoint_interval = checkpoint_interval
+        @metadata_store = metadata_store
+        @resolved_config = resolved_config
+        @dump_retention_count = dump_retention_count
       end
       # Index all extracted units (full mode). Returns stats hash.
+      #
+      # When the vector store is an in-memory adapter, persists the embedded
+      # vectors (and metadata, if a metadata store was provided) to disk under
+      # +output_dir/dumps/<timestamp>/+ and atomically flips the +latest+
+      # pointer. Writes +woods.json+ when +resolved_config+ was supplied.
+      #
       # @return [Hash] Stats with :processed, :skipped, :errors counts
       def index_all
-        process_units(load_units, incremental: false)
+        stats = process_units(load_units, incremental: false)
+        persist_snapshot if persistable?
+        stats
       end
       # Index only changed units (incremental mode). Returns stats hash.
@@ -37,7 +75,11 @@ module Woods
         Dir.glob(File.join(@output_dir, '**', '*.json')).filter_map do |path|
           next if File.basename(path) == 'checkpoint.json'
-          JSON.parse(File.read(path))
+          data = JSON.parse(File.read(path))
+          # Extraction output also contains index listings (_index.json arrays) and
+          # summary files (manifest.json, dependency_graph.json, graph_analysis.json)
+          # that live alongside per-unit JSON. Filter to the unit shape.
+          data if data.is_a?(Hash) && data.key?('type') && data.key?('identifier')
         rescue JSON::ParserError
           nil
         end
@@ -62,6 +104,12 @@ module Woods
       def process_batch(batch, checkpoint, stats, incremental:)
         to_embed = batch.each_with_object([]) do |unit_data, items|
+          persist_unit_metadata(unit_data)
+          # Incremental skip uses `source_hash`, which the extractor derives
+          # from the unit's *source_code string only* (see ExtractedUnit#to_h
+          # and Extractor#dump_units). It is NOT a hash of the serialized
+          # unit_data JSON — so key ordering or whitespace in the _index.json
+          # does not invalidate checkpoints across Ruby-minor upgrades.
           if incremental && checkpoint[unit_data['identifier']] == unit_data['source_hash']
             stats[:skipped] += 1
             next
@@ -72,6 +120,20 @@ module Woods
         embed_and_store(to_embed, checkpoint, stats)
       end
+      # Persist a unit's metadata under its base identifier so retrieval can
+      # resolve vector-search hits back to their unit data. Without this,
+      # the metadata store is left empty at end of run — Snapshotter::Metadata
+      # dumps a header with record_count: 0 and every MCP +codebase_retrieve+
+      # call silently returns empty text, because ContextAssembler#find_batch
+      # misses every candidate identifier. No-op when metadata_store is nil
+      # (hosts that don't configure one). Stored under the base identifier,
+      # not the chunk-suffixed id — chunks are an embedding-side concern only.
+      def persist_unit_metadata(unit_data)
+        return unless @metadata_store
+        @metadata_store.store(unit_data['identifier'], unit_data)
+      end
       def collect_embed_items(unit_data, items)
         texts = prepare_texts(unit_data)
         identifier = unit_data['identifier']
@@ -83,9 +145,71 @@ module Woods
         end
       end
-      def prepare_texts(unit_data)
+      def prepare_texts(unit_data) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
         unit = build_unit(unit_data)
-        unit.chunks&.any? ? @text_preparer.prepare_chunks(unit) : [@text_preparer.prepare(unit)]
+        apply_chunking(unit) if @chunker && unit.chunks.empty? && needs_chunking?(unit)
+        # Extraction may have emitted chunks larger than the provider's
+        # budget (rails_source in particular). Enforce the ceiling on
+        # whatever chunks we have before handing off to the provider.
+        @chunker&.enforce_chunk_limits!(unit) if unit.chunks.any?
+        texts = unit.chunks.any? ? @text_preparer.prepare_chunks(unit) : [@text_preparer.prepare(unit)]
+        # Drop empty/whitespace-only texts — embedding providers reject
+        # them with 400 and retrying never succeeds. Unit is effectively
+        # skipped when every text is empty (zero-source unit).
+        texts.reject { |t| t.nil? || t.strip.empty? || content_portion_empty?(t, unit) }
+      end
+      # True when a prepared text is just the metadata prefix with no
+      # underlying source content (empty source_code + empty chunks).
+      # Avoids embedding prefix-only stubs that have no semantic value
+      # and would poison the vector space with identical headers.
+      def content_portion_empty?(text, unit)
+        return false unless unit.chunks.empty?
+        return false unless unit.source_code.nil? || unit.source_code.strip.empty?
+        !text.nil?
+      end
+      # Does this unit exceed the embedding provider's single-input
+      # budget? Returns false when the provider reports no budget, when
+      # the TextPreparer has no calibrated chars-per-token ratio, or when
+      # the unit's source fits.
+      #
+      # When the configured chunker carries a real tokenizer
+      # (Embedding::TokenCounter) we also consult it — dense Ruby source
+      # tokenizes hotter than chars/token averages suggest, and Ollama
+      # rejects over-budget input outright (see ollama/ollama#14186).
+      def needs_chunking?(unit)
+        budget_tokens = @provider.respond_to?(:max_input_tokens) ? @provider.max_input_tokens : nil
+        return false if budget_tokens.nil?
+        return false unless @text_preparer.respond_to?(:chars_per_token)
+        source = unit.source_code || ''
+        return true if chunker_token_oversize?(source)
+        # Subtract a small prefix allowance — the TextPreparer adds a few
+        # hundred characters of context header ([type] identifier / file /
+        # dependencies) that count toward the budget too.
+        char_budget = (budget_tokens * @text_preparer.chars_per_token).floor - PREFIX_CHAR_ALLOWANCE
+        char_budget.positive? && source.length > char_budget
+      end
+      # Ask the chunker's real tokenizer whether +source+ already exceeds
+      # the token budget. Returns false when the chunker wasn't built with
+      # one (e.g., OpenAI path), leaving the char-based check in charge.
+      def chunker_token_oversize?(source)
+        return false unless @chunker&.token_counter && @chunker.max_tokens
+        @chunker.token_counter.count(source) > @chunker.max_tokens
+      end
+      # Populate unit.chunks from the configured chunker. The chunker's
+      # own +max_chars+ safety net is what guarantees each chunk fits;
+      # no budget is passed here — the chunker receives only the unit.
+      def apply_chunking(unit)
+        unit.chunks = @chunker.chunk(unit).map do |chunk|
+          { content: chunk.content, chunk_type: chunk.chunk_type }
+        end
       end
       def build_unit(data)
@@ -98,6 +222,12 @@ module Woods
         unit
       end
+      # Character budget reserved for the TextPreparer context prefix
+      # ("[type] id / namespace / file / dependencies: …"). Typical
+      # prefixes run ~200–400 chars; 512 gives room to spare.
+      PREFIX_CHAR_ALLOWANCE = 512
+      private_constant :PREFIX_CHAR_ALLOWANCE
       def embed_and_store(items, checkpoint, stats)
         return if items.empty?
@@ -135,6 +265,60 @@ module Woods
       def save_checkpoint(checkpoint)
         File.write(File.join(@output_dir, 'checkpoint.json'), JSON.generate(checkpoint))
       end
+      # Returns true when the vector store is an in-memory adapter that supports
+      # the persistence seam (+#each_entry+ / +#bulk_load+) and output_dir is set.
+      # Persistent backends (pgvector, Qdrant) never respond to +#each_entry+.
+      def persistable?
+        @output_dir &&
+          @vector_store.respond_to?(:each_entry) &&
+          @vector_store.respond_to?(:bulk_load)
+      end
+      # Persist stores to a timestamped dump directory, write +woods.json+,
+      # flip the +latest+ pointer, then prune old dumps.
+      def persist_snapshot
+        require_relative '../index_artifact'
+        require_relative '../storage/snapshotter'
+        artifact = IndexArtifact.new(@output_dir)
+        dump_dir = artifact.new_dump_dir
+        Storage::Snapshotter::Vector.dump(@vector_store, artifact, dump_dir)
+        if @metadata_store.respond_to?(:each_entry) && @metadata_store.respond_to?(:bulk_load)
+          Storage::Snapshotter::Metadata.dump(@metadata_store, artifact, dump_dir)
+        end
+        artifact.write_config(@resolved_config) if @resolved_config
+        artifact.promote(dump_dir)
+        prune_old_dumps(artifact)
+      end
+      # Remove old dump directories beyond the retention window.
+      #
+      # Keeps the +@dump_retention_count+ newest directories by name (a UTC
+      # timestamp, so lexicographic order is chronological); no-op when the count
+      # is nil or <= 0. +latest+ is not special-cased: it survives by being newest.
+      def prune_old_dumps(artifact)
+        return if @dump_retention_count.nil? || @dump_retention_count <= 0
+        dumps_root = artifact.dumps_root
+        return unless dumps_root.exist?
+        dirs = sorted_dump_dirs(dumps_root)
+        excess = dirs.length - @dump_retention_count
+        dirs.first(excess).each { |dir| FileUtils.rm_rf(dir) } if excess.positive?
+      end
+      def sorted_dump_dirs(dumps_root)
+        dumps_root.children
+                  .select(&:directory?)
+                  .sort_by(&:basename)
+                  .map(&:to_s)
+      end
     end
   end
 end

data/lib/woods/embedding/openai.rb CHANGED Viewed

@@ -24,6 +24,12 @@ module Woods
           'text-embedding-3-small' => 1536,
           'text-embedding-3-large' => 3072
         }.freeze
+        # OpenAI embedding models share an 8191-token input cap across
+        # text-embedding-3-small / -3-large / ada-002. The chunker uses
+        # this as a hard ceiling — the actual chunk size lands well
+        # below it once chars-per-token estimation and the prefix
+        # allowance are factored in (see Builder#build_chunker).
+        MAX_INPUT_TOKENS = 8191
         # @param api_key [String] OpenAI API key
         # @param model [String] OpenAI embedding model name (default: text-embedding-3-small)
@@ -37,7 +43,10 @@ module Woods
         # @param text [String] the text to embed
         # @return [Array<Float>] the embedding vector
         # @raise [Woods::Error] if the API returns an error
+        # @raise [ArgumentError] if the text is nil or empty (OpenAI rejects these with 400)
         def embed(text)
+          raise ArgumentError, 'embed(text) requires a non-empty string' if text.nil? || text.to_s.strip.empty?
           response = post_request({ model: @model, input: text })
           response['data'].first['embedding']
         end
@@ -49,7 +58,13 @@ module Woods
         # @param texts [Array<String>] the texts to embed
         # @return [Array<Array<Float>>] array of embedding vectors
         # @raise [Woods::Error] if the API returns an error
-        def embed_batch(texts)
+        # @raise [ArgumentError] if the array is empty or any element is nil/empty
+        def embed_batch(texts) # rubocop:disable Metrics/CyclomaticComplexity
+          raise ArgumentError, 'embed_batch(texts) requires a non-empty array' if texts.nil? || texts.empty?
+          if texts.any? { |t| t.nil? || t.to_s.strip.empty? }
+            raise ArgumentError, 'embed_batch(texts) rejects nil/empty entries (OpenAI returns 400)'
+          end
           response = post_request({ model: @model, input: texts })
           response['data']
             .sort_by { |item| item['index'] }
@@ -73,14 +88,35 @@ module Woods
           @model
         end
+        # Maximum input length OpenAI will accept for a single embedding
+        # text. All current text-embedding-* models cap at ~8k tokens.
+        #
+        # @return [Integer]
+        def max_input_tokens
+          MAX_INPUT_TOKENS
+        end
         private
+        # Cap response bodies interpolated into error messages at 500
+        # characters so API error payloads (which occasionally echo request
+        # metadata, including headers) cannot flood logs or re-raised messages.
+        #
+        # @param body [String, nil]
+        # @return [String]
+        def truncate_response_body(body)
+          return '' if body.nil?
+          s = body.to_s
+          s.length > 500 ? "#{s[0, 500]}... [truncated]" : s
+        end
         # Send a POST request to the OpenAI embeddings API.
         #
         # @param body [Hash] request body
         # @return [Hash] parsed JSON response
         # @raise [Woods::Error] if the API returns a non-success status
-        def post_request(body)
+        def post_request(body) # rubocop:disable Metrics/AbcSize
           request = Net::HTTP::Post.new(ENDPOINT.path)
           request['Content-Type'] = 'application/json'
           request['Authorization'] = "Bearer #{@api_key}"
@@ -89,7 +125,7 @@ module Woods
           response = http_client.request(request)
           unless response.is_a?(Net::HTTPSuccess)
-            raise Woods::Error, "OpenAI API error: #{response.code} #{response.body}"
+            raise Woods::Error, "OpenAI API error: #{response.code} #{truncate_response_body(response.body)}"
           end
           JSON.parse(response.body)
@@ -98,7 +134,7 @@ module Woods
           @http_client = nil
           response = http_client.request(request)
           unless response.is_a?(Net::HTTPSuccess)
-            raise Woods::Error, "OpenAI API error: #{response.code} #{response.body}"
+            raise Woods::Error, "OpenAI API error: #{response.code} #{truncate_response_body(response.body)}"
           end
           JSON.parse(response.body)

data/lib/woods/embedding/provider.rb CHANGED Viewed

@@ -49,6 +49,16 @@ module Woods
         def model_name
           raise NotImplementedError
         end
+        # Return the maximum input length the provider will accept for a
+        # single text, in tokens. Used by the indexer to decide when a unit
+        # must be chunked before embedding.
+        #
+        # @return [Integer, nil] token budget, or nil if the provider has no hard cap
+        # @raise [NotImplementedError] if not implemented by the provider
+        def max_input_tokens
+          raise NotImplementedError
+        end
       end
       # Ollama adapter for local embeddings via the Ollama HTTP API.
@@ -66,11 +76,56 @@ module Woods
         DEFAULT_MODEL = 'nomic-embed-text'
         DEFAULT_HOST = 'http://localhost:11434'
-        # @param model [String] Ollama model name (default: nomic-embed-text)
+        # Ollama enforces the model's native context length on `/api/embed`
+        # regardless of the `num_ctx` override — we've validated this
+        # against 0.15.x for nomic-embed-text (rejects >2048) and bge-m3
+        # (accepts up to 8192, silently truncates above). Advertise the
+        # native ceiling so the chunker can size inputs correctly. Models
+        # outside this registry fall back to Ollama's conservative 2048
+        # default.
+        #
+        # See `docs/EMBEDDING_MODELS.md` for the tradeoff matrix and
+        # instructions for adding a new model here.
+        MODEL_CONTEXT_LENGTHS = {
+          'nomic-embed-text' => 2048,
+          'bge-m3' => 8192,
+          'mxbai-embed-large' => 512,
+          'snowflake-arctic-embed' => 512,
+          'snowflake-arctic-embed2' => 8192,
+          # all-minilm: 512 is the model's context length, NOT the 384
+          # embedding dimension and NOT the 256 some sources confuse with
+          # the dimension. With a 256-token budget the chunker formula
+          # produces a negative max_chars and silently drops every chunk.
+          'all-minilm' => 512
+        }.freeze
+        # Fallback when the configured model isn't in the registry.
+        FALLBACK_NUM_CTX = 2048
+        # Default read timeout for /api/embed. The previous 30s default
+        # was too short for batched embed calls on cold models — Ollama
+        # has to load the model on first call, and an N-item batch can
+        # easily exceed 30s on a CPU-only host. 120s leaves headroom
+        # without wedging the whole pipeline on a genuinely dead server.
+        DEFAULT_READ_TIMEOUT = 120
+        # @param model [String] Ollama model name (default: nomic-embed-text).
+        #   Use `"bge-m3"` or `"snowflake-arctic-embed2"` to get an 8192-token
+        #   context, which avoids most chunking of dense Rails units.
         # @param host [String] Ollama server URL (default: http://localhost:11434)
-        def initialize(model: DEFAULT_MODEL, host: DEFAULT_HOST)
+        # @param num_ctx [Integer, nil] Ollama context window in tokens. When
+        #   `nil` (the default), the provider picks the model's native
+        #   context from `MODEL_CONTEXT_LENGTHS`, falling back to 2048 for
+        #   unknown models. Set explicitly only if running a model with a
+        #   known-larger native context that isn't in the registry yet.
+        # @param read_timeout [Integer] HTTP read timeout in seconds.
+        #   Bump this for slow / cold-start hosts or very large batches.
+        def initialize(model: DEFAULT_MODEL, host: DEFAULT_HOST, num_ctx: nil,
+                       read_timeout: DEFAULT_READ_TIMEOUT)
           @model = model
           @host = host
+          @num_ctx = num_ctx || MODEL_CONTEXT_LENGTHS.fetch(model, FALLBACK_NUM_CTX)
+          @read_timeout = read_timeout
           @uri = URI("#{host}/api/embed")
         end
@@ -79,8 +134,11 @@ module Woods
         # @param text [String] the text to embed
         # @return [Array<Float>] the embedding vector
         # @raise [Woods::Error] if the API returns an error
+        # @raise [ArgumentError] if the text is nil or empty (avoids provider 400)
         def embed(text)
-          response = post_request({ model: @model, input: text })
+          raise ArgumentError, 'embed(text) requires a non-empty string' if text.nil? || text.to_s.strip.empty?
+          response = post_request(build_body(text))
           response['embeddings'].first
         end
@@ -89,8 +147,14 @@ module Woods
         # @param texts [Array<String>] the texts to embed
         # @return [Array<Array<Float>>] array of embedding vectors
         # @raise [Woods::Error] if the API returns an error
+        # @raise [ArgumentError] if the array is empty or any element is nil/empty
         def embed_batch(texts)
-          response = post_request({ model: @model, input: texts })
+          raise ArgumentError, 'embed_batch(texts) requires a non-empty array' if texts.nil? || texts.empty?
+          if texts.any? { |t| t.nil? || t.to_s.strip.empty? }
+            raise ArgumentError, 'embed_batch(texts) rejects nil/empty entries'
+          end
+          response = post_request(build_body(texts))
           response['embeddings']
         end
@@ -110,20 +174,52 @@ module Woods
           @model
         end
+        # Maximum input length Ollama will accept — tracks the configured
+        # context window. Always populated: the constructor resolves
+        # `num_ctx` to the model's registry entry or {FALLBACK_NUM_CTX},
+        # so this method never returns nil for an Ollama provider.
+        #
+        # @return [Integer]
+        def max_input_tokens
+          @num_ctx
+        end
         private
+        # Cap response bodies interpolated into error messages at 500
+        # characters so unexpected Ollama responses (e.g. proxied HTML error
+        # pages) cannot flood logs or re-raised error messages.
+        #
+        # @param body [String, nil]
+        # @return [String]
+        def truncate_response_body(body)
+          return '' if body.nil?
+          s = body.to_s
+          s.length > 500 ? "#{s[0, 500]}... [truncated]" : s
+        end
+        # Build the JSON body for an `/api/embed` call. Adds `options.num_ctx`
+        # (the constructor always resolves it) — without it, Ollama uses its
+        # 2048-token default and truncates, or rejects with 400, longer input.
+        def build_body(input)
+          body = { model: @model, input: input }
+          body[:options] = { num_ctx: @num_ctx } if @num_ctx
+          body
+        end
         # Send a POST request to the Ollama API.
         #
         # @param body [Hash] request body
         # @return [Hash] parsed JSON response
         # @raise [Woods::Error] if the API returns a non-success status
-        def post_request(body)
+        def post_request(body) # rubocop:disable Metrics/AbcSize
           request = Net::HTTP::Post.new(@uri.path, 'Content-Type' => 'application/json')
           request.body = body.to_json
           response = http_client.request(request)
           unless response.is_a?(Net::HTTPSuccess)
-            raise Woods::Error, "Ollama API error: #{response.code} #{response.body}"
+            raise Woods::Error, "Ollama API error: #{response.code} #{truncate_response_body(response.body)}"
           end
           JSON.parse(response.body)
@@ -136,7 +232,7 @@ module Woods
             raise Woods::Error, "Ollama API error (retry failed): #{retry_error.message}"
           end
           unless response.is_a?(Net::HTTPSuccess)
-            raise Woods::Error, "Ollama API error: #{response.code} #{response.body}"
+            raise Woods::Error, "Ollama API error: #{response.code} #{truncate_response_body(response.body)}"
           end
           JSON.parse(response.body)
@@ -151,7 +247,7 @@ module Woods
           http = Net::HTTP.new(@uri.host, @uri.port)
           http.use_ssl = @uri.scheme == 'https'
           http.open_timeout = 10
-          http.read_timeout = 30
+          http.read_timeout = @read_timeout
           http.keep_alive_timeout = 30
           http.start
           @http_client = http