woods 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +169 -0
  3. data/README.md +20 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +15 -50
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +6 -0
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +3 -4
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +737 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +1 -1
  102. data/lib/woods/unblocked/document_builder.rb +35 -10
  103. data/lib/woods/unblocked/exporter.rb +1 -1
  104. data/lib/woods/util/host_guard.rb +61 -0
  105. data/lib/woods/version.rb +1 -1
  106. data/lib/woods.rb +126 -6
  107. metadata +69 -4
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'set'
4
+
3
5
  module Woods
4
6
  module Storage
5
7
  # VectorStore provides an interface for storing and searching embedding vectors.
@@ -36,11 +38,39 @@ module Woods
36
38
  entries.each { |e| store(e[:id], e[:vector], e[:metadata] || {}) }
37
39
  end
38
40
 
41
+ # Iterate over every live entry, yielding `(id, vector, metadata)`.
42
+ #
43
+ # Persistence seam for Snapshotter and similar consumers. Default
44
+ # implementation falls through to `NotImplementedError`; adapters
45
+ # that need to support dumping must implement it. Persistent
46
+ # backends (pgvector, Qdrant) aren't expected to implement this —
47
+ # the Snapshotter only touches non-persistent stores.
48
+ #
49
+ # @yield [id, vector, metadata]
50
+ # @return [Enumerator] when no block given
51
+ def each_entry
52
+ raise NotImplementedError
53
+ end
54
+
55
+ # Bulk-load pre-computed entries. Dual of {#each_entry} — the
56
+ # Snapshotter hydrates a store by feeding this the dump contents.
57
+ #
58
+ # @param entries [Enumerable<Hash>] Each entry has :id, :vector, :metadata keys
59
+ def bulk_load(entries)
60
+ store_batch(entries.to_a)
61
+ end
62
+
39
63
  # Search for similar vectors using cosine similarity.
40
64
  #
65
+ # Filter values may be scalars (exact match) or Arrays (membership
66
+ # match — "value ∈ array"). Adapters implement the membership
67
+ # semantics natively: in-memory loops, pgvector IN (...), Qdrant
68
+ # `match: { any: [...] }`.
69
+ #
41
70
  # @param query_vector [Array<Float>] The query embedding vector
42
71
  # @param limit [Integer] Maximum number of results to return
43
- # @param filters [Hash] Optional metadata filters to apply
72
+ # @param filters [Hash] Optional metadata filters values may be
73
+ # scalars or Arrays
44
74
  # @return [Array<SearchResult>] Results sorted by descending similarity
45
75
  # @raise [NotImplementedError] if not implemented by adapter
46
76
  def search(query_vector, limit: 10, filters: {})
@@ -87,79 +117,198 @@ module Woods
87
117
  # store.search([1.0, 0.0], limit: 1)
88
118
  # # => [#<SearchResult id="doc1", score=1.0, metadata={type: "model"}>]
89
119
  #
90
- class InMemory
120
+ class InMemory # rubocop:disable Metrics/ClassLength
91
121
  include Interface
92
122
 
123
+ # Flat-buffer backing. One Array<Float> of length count*dim holds
124
+ # every vector contiguously; two parallel Arrays hold the ids and
125
+ # metadata at matching positions. Deleted entries are tombstoned
126
+ # (their index is added to @tombstones) rather than removed, so
127
+ # stored vector positions stay stable under concurrent iteration
128
+ # and dumps. Tombstones are compacted at next full-embed run.
129
+ #
130
+ # The flat buffer exists both for cache friendliness during the
131
+ # cosine kernel (all vectors live in one contiguous allocation)
132
+ # and to make dump/load via `pack("e*")` a single call rather
133
+ # than a per-vector concatenation.
93
134
  def initialize
94
- @entries = {} # id => { vector:, metadata: }
135
+ @dim = nil
136
+ @ids = [] # Array<String> (frozen)
137
+ @vectors_flat = [] # flat Array<Float>, length @ids.size * @dim
138
+ @metadata = [] # Array<Hash>, index-aligned with @ids
139
+ @id_to_index = {} # id => Integer for O(1) delete/overwrite
140
+ @tombstones = Set.new
95
141
  end
96
142
 
143
+ # @return [Integer, nil] dimension of stored vectors, nil if empty
144
+ attr_reader :dim
145
+
97
146
  # @see Interface#store
98
147
  def store(id, vector, metadata = {})
99
- @entries[id] = { vector: vector, metadata: metadata }
148
+ @dim ||= vector.length
149
+ unless vector.length == @dim
150
+ raise ArgumentError,
151
+ "Vector dimension mismatch (#{vector.length} vs #{@dim})"
152
+ end
153
+
154
+ frozen_id = id.frozen? ? id : id.dup.freeze
155
+ existing = @id_to_index[frozen_id]
156
+ if existing
157
+ overwrite(existing, vector, metadata)
158
+ else
159
+ append(frozen_id, vector, metadata)
160
+ end
161
+ end
162
+
163
+ # @see Interface#bulk_load
164
+ # Single-pass hydrate — more efficient than N store calls when
165
+ # the Snapshotter feeds a large dump at boot time.
166
+ def bulk_load(entries)
167
+ entries.each { |entry| store(entry[:id], entry[:vector], entry[:metadata] || {}) }
168
+ end
169
+
170
+ # Drop every stored entry, restoring the store to its post-+new+ state.
171
+ #
172
+ # Used by the MCP +reload+ tool to pick up a fresh embed run without
173
+ # restarting the process. A subsequent +#bulk_load+ then repopulates
174
+ # from disk. Safe on an already-empty store.
175
+ def clear!
176
+ @dim = nil
177
+ @ids = []
178
+ @vectors_flat = []
179
+ @metadata = []
180
+ @id_to_index = {}
181
+ @tombstones = Set.new
182
+ end
183
+
184
+ # @see Interface#each_entry
185
+ def each_entry(&block)
186
+ return enum_for(:each_entry) unless block
187
+
188
+ @ids.each_with_index do |id, idx|
189
+ next if @tombstones.include?(idx)
190
+
191
+ base = idx * @dim
192
+ yield(id, @vectors_flat[base, @dim], @metadata[idx])
193
+ end
100
194
  end
101
195
 
102
196
  # @see Interface#search
103
197
  def search(query_vector, limit: 10, filters: {})
104
- candidates = filter_entries(filters)
198
+ return [] if @dim.nil?
105
199
 
106
- scored = candidates.map do |id, entry|
107
- score = cosine_similarity(query_vector, entry[:vector])
108
- SearchResult.new(id: id, score: score, metadata: entry[:metadata])
200
+ unless query_vector.length == @dim
201
+ raise ArgumentError,
202
+ "Vector dimension mismatch (#{query_vector.length} vs #{@dim})"
109
203
  end
110
- scored.sort_by { |r| -r.score }.first(limit)
204
+
205
+ scored = gather_candidates(query_vector, filters)
206
+ scored.sort_by! { |r| -r.score }
207
+ scored.first(limit)
111
208
  end
112
209
 
113
210
  # @see Interface#delete
114
211
  def delete(id)
115
- @entries.delete(id)
212
+ idx = @id_to_index.delete(id)
213
+ @tombstones << idx if idx
116
214
  end
117
215
 
118
216
  # @see Interface#delete_by_filter
119
217
  def delete_by_filter(filters)
120
- @entries.reject! do |_id, entry|
121
- filters.all? { |key, value| entry[:metadata][key] == value }
218
+ @ids.each_with_index do |id, idx|
219
+ next if @tombstones.include?(idx)
220
+ next unless filters.all? { |key, value| @metadata[idx][key] == value }
221
+
222
+ @tombstones << idx
223
+ @id_to_index.delete(id)
122
224
  end
123
225
  end
124
226
 
125
227
  # @see Interface#count
126
228
  def count
127
- @entries.size
229
+ @ids.size - @tombstones.size
128
230
  end
129
231
 
130
232
  private
131
233
 
132
- # Filter entries by metadata key-value pairs.
133
- #
134
- # @param filters [Hash] Metadata filters
135
- # @return [Hash] Filtered entries
136
- def filter_entries(filters)
137
- return @entries if filters.empty?
234
+ # Match a filter value against a metadata value. Arrays are
235
+ # membership filters ("any of"); scalars are equality.
236
+ def filter_match?(filter_value, meta_value)
237
+ filter_value.is_a?(Array) ? filter_value.include?(meta_value) : filter_value == meta_value
238
+ end
239
+
240
+ # Append a new entry to the flat buffer.
241
+ def append(id, vector, metadata)
242
+ idx = @ids.size
243
+ @ids << id
244
+ @vectors_flat.concat(vector)
245
+ @metadata << metadata
246
+ @id_to_index[id] = idx
247
+ end
138
248
 
139
- @entries.select do |_id, entry|
140
- filters.all? { |key, value| entry[:metadata][key] == value }
249
+ # Overwrite an existing entry in place. Tombstones the old slot's
250
+ # deletion marker (if any) so the new vector is live again.
251
+ def overwrite(idx, vector, metadata)
252
+ base = idx * @dim
253
+ i = 0
254
+ while i < @dim
255
+ @vectors_flat[base + i] = vector[i]
256
+ i += 1
141
257
  end
258
+ @metadata[idx] = metadata
259
+ @tombstones.delete(idx)
142
260
  end
143
261
 
144
- # Compute cosine similarity between two vectors.
145
- #
146
- # @param vec_a [Array<Float>] First vector
147
- # @param vec_b [Array<Float>] Second vector
148
- # @return [Float] Cosine similarity between -1.0 and 1.0
149
- # @raise [ArgumentError] if vectors have different dimensions
150
- def cosine_similarity(vec_a, vec_b)
151
- unless vec_a.length == vec_b.length
152
- raise ArgumentError,
153
- "Vector dimension mismatch (#{vec_a.length} vs #{vec_b.length})"
262
+ # Walk every non-tombstoned index, apply filters, score survivors.
263
+ # Filter check runs BEFORE the cosine kernel — avoids computing
264
+ # 12k dot products only to discard most of them.
265
+ def gather_candidates(query_vector, filters)
266
+ scored = []
267
+ len = @ids.size
268
+ idx = 0
269
+ while idx < len
270
+ if @tombstones.include?(idx)
271
+ idx += 1
272
+ next
273
+ end
274
+ meta = @metadata[idx]
275
+ unless filters.empty? || filters.all? { |k, v| filter_match?(v, meta[k]) }
276
+ idx += 1
277
+ next
278
+ end
279
+
280
+ score = cosine_similarity_strided(query_vector, idx * @dim)
281
+ scored << SearchResult.new(id: @ids[idx], score: score, metadata: meta)
282
+ idx += 1
154
283
  end
284
+ scored
285
+ end
155
286
 
156
- dot = vec_a.zip(vec_b).sum { |x, y| x * y }
157
- mag_a = Math.sqrt(vec_a.sum { |x| x**2 })
158
- mag_b = Math.sqrt(vec_b.sum { |x| x**2 })
287
+ # Cosine similarity between a query Array<Float> and a vector
288
+ # that lives at @vectors_flat[base, @dim]. Strided access avoids
289
+ # allocating a copy of the stored vector on every comparison.
290
+ #
291
+ # See bench/vector_query_and_serialization.rb for the allocation
292
+ # story — the old Enumerable path allocated ~770 objects per pair;
293
+ # this loop allocates none inside the hot path.
294
+ def cosine_similarity_strided(query, base)
295
+ len = @dim
296
+ i = 0
297
+ dot = 0.0
298
+ mag_a = 0.0
299
+ mag_b = 0.0
300
+ while i < len
301
+ a = query[i]
302
+ b = @vectors_flat[base + i]
303
+ dot += a * b
304
+ mag_a += a * a
305
+ mag_b += b * b
306
+ i += 1
307
+ end
159
308
 
160
309
  return 0.0 if mag_a.zero? || mag_b.zero?
161
310
 
162
- dot / (mag_a * mag_b)
311
+ dot / (Math.sqrt(mag_a) * Math.sqrt(mag_b))
163
312
  end
164
313
  end
165
314
  end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'builder'
4
+ require_relative 'embedding/indexer'
5
+ require_relative 'embedding/text_preparer'
6
+ require_relative 'resolved_config'
7
+
8
+ module Woods
9
+ # Small helpers invoked from `lib/tasks/woods.rake`.
10
+ #
11
+ # Keeps rake task bodies to a couple of lines each so the real work lives in
12
+ # plain Ruby that can be unit-tested without Rake's global state.
13
+ module Tasks
14
+ module_function
15
+
16
+ # Build an {Embedding::Indexer} wired to the provider and stores described
17
+ # by {Woods.configuration}. Uses {Builder} so `config.embedding_provider`,
18
+ # `config.embedding_options`, and `config.vector_store(_options)` are all
19
+ # honoured — prior to this the rake tasks hardcoded Ollama + InMemory and
20
+ # silently ignored configuration, which was invisible until the provider
21
+ # tried to reach an unreachable default host.
22
+ #
23
+ # The TextPreparer and SemanticChunker are tuned to the selected
24
+ # provider so oversize units are split into chunks that fit the
25
+ # provider's input budget (e.g. Ollama's num_ctx, OpenAI's 8k cap).
26
+ #
27
+ # @return [Embedding::Indexer]
28
+ def build_embed_indexer
29
+ config = Woods.configuration
30
+ builder = Builder.new(config)
31
+ provider = builder.build_embedding_provider
32
+
33
+ # Wire the persistence-arc pieces (resolved_config, metadata_store,
34
+ # dump_retention_count) so Indexer#persist_snapshot can write
35
+ # woods.json, dump metadata, and honour the user's retention setting.
36
+ # Without these kwargs, embed writes vectors.bin + latest pointer but
37
+ # never writes woods.json — which breaks the standalone woods-mcp
38
+ # Shape-2 boot path entirely.
39
+ #
40
+ # metadata_store and resolved_config are nil-safe — hosts that don't
41
+ # configure metadata or that pre-date the persistence arc still work.
42
+ Embedding::Indexer.new(
43
+ provider: provider,
44
+ text_preparer: builder.build_text_preparer(provider),
45
+ vector_store: builder.build_vector_store,
46
+ metadata_store: config.metadata_store ? builder.build_metadata_store : nil,
47
+ resolved_config: build_resolved_config(config, provider: provider),
48
+ chunker: builder.build_chunker(provider),
49
+ dump_retention_count: config.dump_retention_count,
50
+ output_dir: ENV.fetch('WOODS_OUTPUT', config.output_dir)
51
+ )
52
+ end
53
+
54
+ # Build a ResolvedConfig snapshot from the live Woods::Configuration.
55
+ # Returns nil if the configuration doesn't have enough to produce one
56
+ # (pre-persistence-arc hosts) so the Indexer falls back to the legacy
57
+ # dump-without-woods.json behaviour.
58
+ #
59
+ # Passes the live +provider+ so {ResolvedConfig.from_configuration} can
60
+ # probe +provider.dimensions+ — without this, Ollama snapshots record
61
+ # +dimension: 0+ and every subsequent MCP boot fails a spurious
62
+ # dimension-mismatch check against the real stored vectors.
63
+ def build_resolved_config(config, provider: nil)
64
+ return nil unless config.embedding_provider
65
+
66
+ ResolvedConfig.from_configuration(config, provider: provider)
67
+ rescue StandardError
68
+ nil
69
+ end
70
+
71
+ # Print an indexer stats hash in the format the rake tasks have historically
72
+ # used. `mode:` only affects the header line.
73
+ #
74
+ # @param stats [Hash]
75
+ # @param mode [Symbol] :full or :incremental
76
+ def print_embed_stats(stats, mode:)
77
+ header = mode == :incremental ? 'Incremental embedding complete!' : 'Embedding complete!'
78
+ puts
79
+ puts header
80
+ puts " Processed: #{stats[:processed]}"
81
+ puts " Skipped: #{stats[:skipped]}"
82
+ puts " Errors: #{stats[:errors]}"
83
+ end
84
+ end
85
+ end
@@ -23,10 +23,58 @@ module Woods
23
23
  #
24
24
  class SnapshotStore # rubocop:disable Metrics/ClassLength
25
25
  # @param connection [Object] Database connection supporting #execute and #get_first_row
26
- def initialize(connection:)
26
+ # @param validate_schema [Boolean] If true (default), probe both required
27
+ # tables at construction time and raise a descriptive error pointing at
28
+ # migrations 004+005 when they are missing. Set false in tests that
29
+ # construct the store with a bare mock.
30
+ def initialize(connection:, validate_schema: true)
27
31
  @db = connection
32
+ validate_schema! if validate_schema
28
33
  end
29
34
 
35
+ REQUIRED_TABLES = %w[woods_snapshots woods_snapshot_units].freeze
36
+
37
+ # Probe that `woods_snapshots` and `woods_snapshot_units` exist. If
38
+ # they don't, raise with guidance to run migrations 004 + 005 —
39
+ # without this, the first call to {#capture}/{#find} raises a generic
40
+ # adapter error that doesn't tell operators why.
41
+ #
42
+ # When the connection responds to `#columns` (ActiveRecord-shaped) or
43
+ # `#table_exists?`, use that — these are hard to spoof from a test
44
+ # mock, so a partial mock can no longer silently pass. Falls back to
45
+ # the `SELECT 1 FROM t LIMIT 1` probe for minimal connections.
46
+ #
47
+ # @raise [Woods::Error]
48
+ def validate_schema!
49
+ REQUIRED_TABLES.each { |t| probe_table!(t) }
50
+ rescue Woods::Error
51
+ raise
52
+ rescue StandardError => e
53
+ raise Woods::Error, schema_error_message(e)
54
+ end
55
+
56
+ private
57
+
58
+ def probe_table!(table)
59
+ if @db.respond_to?(:table_exists?)
60
+ raise Woods::Error, schema_error_message("table `#{table}` does not exist") unless @db.table_exists?(table)
61
+ elsif @db.respond_to?(:columns)
62
+ cols = @db.columns(table)
63
+ raise Woods::Error, schema_error_message("no columns for `#{table}`") if cols.nil? || cols.empty?
64
+ else
65
+ @db.execute("SELECT 1 FROM #{table} LIMIT 1")
66
+ end
67
+ end
68
+
69
+ def schema_error_message(detail)
70
+ 'SnapshotStore requires the `woods_snapshots` and ' \
71
+ '`woods_snapshot_units` tables (migrations 004 + 005 under ' \
72
+ '`lib/woods/db/migrations/`). Run `rake woods:migrate` on the ' \
73
+ "metadata DB and retry. Underlying error: #{detail}"
74
+ end
75
+
76
+ public
77
+
30
78
  # Capture a snapshot after extraction completes.
31
79
  #
32
80
  # Stores the manifest metadata and per-unit content hashes.
@@ -1,19 +1,58 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Woods
4
- # Shared token estimation utility.
4
+ # Shared token estimation utility — the single source of truth for the
5
+ # chars-per-token ratio used across cost estimation, context assembly,
6
+ # and embedding budgeting.
5
7
  #
6
- # Uses project convention: (string.length / 4.0).ceil
7
- # See docs/TOKEN_BENCHMARK.md — conservative floor (~10.6% overestimate).
8
+ # Ratios:
9
+ # - `:openai` / default 4.0 chars/token. Benchmarked against tiktoken
10
+ # (cl100k_base) on 19 Ruby source files (mean 4.41 chars/token). We use
11
+ # 4.0 as a conservative floor (~10.6 % overestimate) so truncation never
12
+ # hands the model more tokens than it budgeted for. See
13
+ # `docs/TOKEN_BENCHMARK.md`.
14
+ # - `:ollama` — 1.5 chars/token. Matches the BERT WordPiece tokenizers
15
+ # used by nomic-embed-text and mxbai-embed-large. See
16
+ # `docs/EMBEDDING_MODELS.md` and `Woods::Builder#chars_per_token_for`.
17
+ #
18
+ # Callers should prefer {.chars_per_token_for} over hardcoding a divisor
19
+ # so future tokenizer changes propagate in one place instead of drifting
20
+ # between {ContextAssembler}, {Builder}, and cost-model components.
8
21
  module TokenUtils
22
+ CHARS_PER_TOKEN_BY_PROVIDER = {
23
+ openai: 4.0,
24
+ ollama: 1.5
25
+ }.freeze
26
+
27
+ DEFAULT_CHARS_PER_TOKEN = CHARS_PER_TOKEN_BY_PROVIDER[:openai]
28
+
9
29
  module_function
10
30
 
11
- # Estimate token count for a string.
31
+ # Chars-per-token ratio for the given embedding provider.
32
+ #
33
+ # @param provider [Symbol, String, nil] Provider identifier. Unknown or
34
+ # nil providers fall back to {DEFAULT_CHARS_PER_TOKEN}.
35
+ # @return [Float]
36
+ def chars_per_token_for(provider)
37
+ CHARS_PER_TOKEN_BY_PROVIDER.fetch(provider&.to_sym, DEFAULT_CHARS_PER_TOKEN)
38
+ end
39
+
40
+ # Estimate token count for a string using the default (OpenAI) ratio.
41
+ # Use {.estimate_tokens_for} when a specific provider is in play.
12
42
  #
13
43
  # @param text [String] Text to estimate
14
44
  # @return [Integer] Estimated token count
15
45
  def estimate_tokens(text)
16
- (text.length / 4.0).ceil
46
+ estimate_tokens_for(text, provider: nil)
47
+ end
48
+
49
+ # Estimate token count for a string using the provider's native ratio.
50
+ #
51
+ # @param text [String] Text to estimate
52
+ # @param provider [Symbol, String, nil] `:openai`, `:ollama`, or nil.
53
+ # @return [Integer] Estimated token count
54
+ def estimate_tokens_for(text, provider:)
55
+ (text.length / chars_per_token_for(provider)).ceil
17
56
  end
18
57
  end
19
58
  end
@@ -118,7 +118,7 @@ module Woods
118
118
  http.request(req)
119
119
  rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNRESET, Errno::ECONNREFUSED => e
120
120
  attempts += 1
121
- raise Woods::Error, "Network error after #{attempts} retries: #{e.message}" if attempts >= MAX_RETRIES
121
+ raise Woods::Error, "Network error after #{attempts} retries: #{e.message}" if attempts > MAX_RETRIES
122
122
 
123
123
  sleep(2**attempts)
124
124
  retry
@@ -10,7 +10,7 @@ module Woods
10
10
  # side effects, and structural complexity.
11
11
  #
12
12
  # @example
13
- # builder = DocumentBuilder.new(repo_url: "https://github.com/bigcartel/admin")
13
+ # builder = DocumentBuilder.new(repo_url: "https://github.com/acme/myapp")
14
14
  # doc = builder.build(unit_data)
15
15
  # # => { title: "Order (model)", body: "# Order (model)\n...", uri: "https://..." }
16
16
  #
@@ -46,15 +46,40 @@ module Woods
46
46
 
47
47
  def build_body(unit_data)
48
48
  type = unit_data['type']
49
- case type
50
- when 'model' then build_model_body(unit_data)
51
- when 'controller' then build_controller_body(unit_data)
52
- when 'service', 'job', 'mailer', 'manager', 'decorator', 'concern'
53
- build_generic_body(unit_data)
54
- when 'graphql', 'graphql_type', 'graphql_mutation', 'graphql_resolver', 'graphql_query'
55
- build_graphql_body(unit_data)
56
- else build_generic_body(unit_data)
57
- end
49
+ body = case type
50
+ when 'model' then build_model_body(unit_data)
51
+ when 'controller' then build_controller_body(unit_data)
52
+ when 'service', 'job', 'mailer', 'manager', 'decorator', 'concern'
53
+ build_generic_body(unit_data)
54
+ when 'graphql', 'graphql_type', 'graphql_mutation', 'graphql_resolver', 'graphql_query'
55
+ build_graphql_body(unit_data)
56
+ else build_generic_body(unit_data)
57
+ end
58
+ # Defensive credential scrub — current builders only emit structured
59
+ # metadata, but if a future formatter adds source_code or comments
60
+ # (mirroring Notion's `ModelMapper#extract_description`) the scrub
61
+ # keeps credential material from reaching Unblocked.
62
+ redact_credentials(body)
63
+ end
64
+
65
+ # Run the assembled body through CredentialScanner. Fails closed (empty
66
+ # body) if the scanner raises, so a shipping failure never leaks
67
+ # unredacted content.
68
+ #
69
+ # @param body [String]
70
+ # @return [String]
71
+ def redact_credentials(body)
72
+ return body if body.nil? || body.empty?
73
+
74
+ require 'woods/console/credential_scanner'
75
+ redacted, _counts = credential_scanner.scan(body)
76
+ redacted
77
+ rescue StandardError
78
+ ''
79
+ end
80
+
81
+ def credential_scanner
82
+ @credential_scanner ||= Woods::Console::CredentialScanner.new
58
83
  end
59
84
 
60
85
  # ── Model formatting ─────────────────────────────────────────────
@@ -51,7 +51,7 @@ module Woods
51
51
  api_token = config.unblocked_api_token
52
52
  raise ConfigurationError, 'unblocked_api_token is required' unless api_token
53
53
 
54
- budget = ENV.fetch('UNBLOCKED_DAILY_BUDGET', RateLimiter::DEFAULT_BUDGET).to_i
54
+ budget = ENV.fetch('UNBLOCKED_DAILY_BUDGET', RateLimiter::DEFAULT_BUDGET.to_s).to_i
55
55
  limiter = RateLimiter.new(daily_budget: budget)
56
56
 
57
57
  @client = client || Client.new(api_token: api_token, rate_limiter: limiter)
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Woods
4
+ module Util
5
+ # Shared host-header / URL-host canonicalization used by {MCP::OriginGuard}
6
+ # and the {Storage::VectorStore::Qdrant} URL validator.
7
+ #
8
+ # Both components need to reject numeric IPv4 notations that `URI` and
9
+ # `getaddrinfo` accept but `IPAddr` does not — hex (`0x7f000001`),
10
+ # bare integer (`2130706433`), octal (`017700000001` or
11
+ # `0177.0.0.1`), short-form (`127.1`), mixed-radix (`0x7f.0.0.1`).
12
+ # Keeping the logic in one place prevents drift between the two
13
+ # defenses (which previously had slightly different regex lists).
14
+ module HostGuard
15
+ # Non-canonical numeric IPv4 forms that legitimate clients never
16
+ # emit but `getaddrinfo` will happily resolve — rejecting the form
17
+ # is safer than trying to intuit the intended IPv4.
18
+ NUMERIC_HOST_BYPASS = Regexp.union(
19
+ /\A0x[0-9a-f]+\z/, # hex: `0x7f000001`
20
+ /\A\d+\z/, # bare integer: `2130706433`
21
+ /\A0[0-7]+\z/, # bare octal: `017700000001`
22
+ /\A\d+\.\d+\z/, # short-form two-part: `127.1`
23
+ /\A\d+\.\d+\.\d+\z/ # short-form three-part: `127.0.1`
24
+ ).freeze
25
+
26
+ # Octets inside a four-part dotted form that tag the form as
27
+ # non-canonical: leading zero (octal interpretation), or `0x`
28
+ # prefix (hex interpretation).
29
+ SUSPICIOUS_OCTET = Regexp.union(
30
+ /\A0\d+\z/, # leading-zero octal: `0177`
31
+ /\A0x[0-9a-f]+\z/ # hex octet: `0x7f`
32
+ ).freeze
33
+
34
+ module_function
35
+
36
+ # Canonicalize a host string: downcase, strip port, strip the
37
+ # FQDN trailing dot, drop IPv6 brackets. Returns a plain host.
38
+ #
39
+ # @param host [String, nil]
40
+ # @return [String] canonical host, lowercase, without port/brackets.
41
+ def canonicalize(host)
42
+ host.to_s.downcase.sub(/:\d+\z/, '').sub(/\.\z/, '').delete('[]')
43
+ end
44
+
45
+ # Does this canonicalized host smuggle a private IP via a notation
46
+ # that `IPAddr.new` won't parse? Callers should reject any match
47
+ # rather than try to resolve it.
48
+ #
49
+ # @param canonical [String] Output of {.canonicalize}.
50
+ # @return [Boolean]
51
+ def suspicious_numeric_host?(canonical)
52
+ return true if canonical.match?(NUMERIC_HOST_BYPASS)
53
+
54
+ four_octet = canonical.match(/\A(\w+)\.(\w+)\.(\w+)\.(\w+)\z/)
55
+ return false unless four_octet
56
+
57
+ four_octet.captures.any? { |octet| octet.match?(SUSPICIOUS_OCTET) }
58
+ end
59
+ end
60
+ end
61
+ end
data/lib/woods/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Woods
4
- VERSION = '1.2.0'
4
+ VERSION = '1.3.0'
5
5
  end