RubyGems - woods - Versions diffs - 1.2.0 → 1.3.0 - Mend

woods 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (107) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +169 -0
data/README.md +20 -8
data/exe/woods-console +51 -6
data/exe/woods-console-mcp +24 -4
data/exe/woods-mcp +30 -7
data/exe/woods-mcp-http +47 -6
data/lib/generators/woods/install_generator.rb +13 -4
data/lib/generators/woods/templates/woods.rb.tt +155 -0
data/lib/tasks/woods.rake +15 -50
data/lib/woods/builder.rb +174 -9
data/lib/woods/cache/cache_middleware.rb +360 -31
data/lib/woods/chunking/semantic_chunker.rb +334 -7
data/lib/woods/console/adapters/job_adapter.rb +10 -4
data/lib/woods/console/audit_logger.rb +76 -4
data/lib/woods/console/bridge.rb +48 -15
data/lib/woods/console/bridge_protocol.rb +44 -0
data/lib/woods/console/confirmation.rb +3 -4
data/lib/woods/console/console_response_renderer.rb +56 -18
data/lib/woods/console/credential_index.rb +201 -0
data/lib/woods/console/credential_scanner.rb +302 -0
data/lib/woods/console/dispatch_pipeline.rb +138 -0
data/lib/woods/console/embedded_executor.rb +682 -35
data/lib/woods/console/eval_guard.rb +319 -0
data/lib/woods/console/model_validator.rb +1 -3
data/lib/woods/console/rack_middleware.rb +185 -29
data/lib/woods/console/redactor.rb +161 -0
data/lib/woods/console/response_context.rb +127 -0
data/lib/woods/console/safe_context.rb +220 -23
data/lib/woods/console/scope_predicate_parser.rb +131 -0
data/lib/woods/console/server.rb +417 -486
data/lib/woods/console/sql_noise_stripper.rb +87 -0
data/lib/woods/console/sql_table_scanner.rb +213 -0
data/lib/woods/console/sql_validator.rb +81 -31
data/lib/woods/console/table_gate.rb +93 -0
data/lib/woods/console/tool_specs.rb +552 -0
data/lib/woods/console/tools/tier1.rb +3 -3
data/lib/woods/console/tools/tier4.rb +7 -1
data/lib/woods/dependency_graph.rb +66 -7
data/lib/woods/embedding/indexer.rb +190 -6
data/lib/woods/embedding/openai.rb +40 -4
data/lib/woods/embedding/provider.rb +104 -8
data/lib/woods/embedding/text_preparer.rb +23 -3
data/lib/woods/embedding/token_counter.rb +133 -0
data/lib/woods/evaluation/baseline_runner.rb +20 -2
data/lib/woods/evaluation/metrics.rb +4 -1
data/lib/woods/extracted_unit.rb +1 -0
data/lib/woods/extractor.rb +7 -1
data/lib/woods/extractors/controller_extractor.rb +6 -0
data/lib/woods/extractors/mailer_extractor.rb +16 -2
data/lib/woods/extractors/model_extractor.rb +6 -1
data/lib/woods/extractors/phlex_extractor.rb +13 -4
data/lib/woods/extractors/rails_source_extractor.rb +2 -0
data/lib/woods/extractors/route_helper_resolver.rb +130 -0
data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
data/lib/woods/extractors/view_component_extractor.rb +12 -1
data/lib/woods/extractors/view_engines/base.rb +141 -0
data/lib/woods/extractors/view_engines/erb.rb +145 -0
data/lib/woods/extractors/view_template_extractor.rb +92 -133
data/lib/woods/flow_assembler.rb +23 -15
data/lib/woods/flow_precomputer.rb +21 -2
data/lib/woods/graph_analyzer.rb +3 -4
data/lib/woods/index_artifact.rb +173 -0
data/lib/woods/mcp/bearer_auth.rb +45 -0
data/lib/woods/mcp/bootstrap_state.rb +94 -0
data/lib/woods/mcp/bootstrapper.rb +337 -16
data/lib/woods/mcp/config_resolver.rb +288 -0
data/lib/woods/mcp/errors.rb +134 -0
data/lib/woods/mcp/index_reader.rb +265 -30
data/lib/woods/mcp/origin_guard.rb +132 -0
data/lib/woods/mcp/provider_probe.rb +166 -0
data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
data/lib/woods/mcp/server.rb +737 -137
data/lib/woods/model_name_cache.rb +78 -2
data/lib/woods/notion/client.rb +25 -2
data/lib/woods/notion/mappers/model_mapper.rb +36 -2
data/lib/woods/railtie.rb +55 -15
data/lib/woods/resilience/circuit_breaker.rb +9 -2
data/lib/woods/resilience/retryable_provider.rb +40 -3
data/lib/woods/resolved_config.rb +299 -0
data/lib/woods/retrieval/context_assembler.rb +112 -5
data/lib/woods/retrieval/query_classifier.rb +1 -1
data/lib/woods/retrieval/ranker.rb +55 -6
data/lib/woods/retrieval/search_executor.rb +42 -13
data/lib/woods/retriever.rb +330 -24
data/lib/woods/session_tracer/middleware.rb +35 -1
data/lib/woods/storage/graph_store.rb +39 -0
data/lib/woods/storage/inapplicable_backend.rb +14 -0
data/lib/woods/storage/metadata_store.rb +129 -1
data/lib/woods/storage/pgvector.rb +70 -8
data/lib/woods/storage/qdrant.rb +196 -5
data/lib/woods/storage/snapshotter/metadata.rb +172 -0
data/lib/woods/storage/snapshotter/vector.rb +238 -0
data/lib/woods/storage/snapshotter.rb +24 -0
data/lib/woods/storage/vector_store.rb +184 -35
data/lib/woods/tasks.rb +85 -0
data/lib/woods/temporal/snapshot_store.rb +49 -1
data/lib/woods/token_utils.rb +44 -5
data/lib/woods/unblocked/client.rb +1 -1
data/lib/woods/unblocked/document_builder.rb +35 -10
data/lib/woods/unblocked/exporter.rb +1 -1
data/lib/woods/util/host_guard.rb +61 -0
data/lib/woods/version.rb +1 -1
data/lib/woods.rb +126 -6
metadata +69 -4

data/lib/woods/resolved_config.rb ADDED Viewed

@@ -0,0 +1,299 @@
+# frozen_string_literal: true
+require 'json'
+require 'time'
+require_relative 'mcp/errors'
+module Woods
+  # Immutable Whole Value representing the resolved embedding configuration
+  # captured at embed time and read back by the MCP server at boot.
+  #
+  # This is NOT a declared-config bag of fields — it records what was
+  # *actually used* during embedding (provider class, model, host, dimension,
+  # store types). The MCP server compares a stored {ResolvedConfig} against
+  # the current host config to detect incompatible re-deployments.
+  #
+  # Build via {.from_hash} (parses +woods.json+) or {.from_configuration}
+  # (captures the current {Woods::Configuration} at embed time).
+  #
+  # @example Parsing woods.json
+  #   config = Woods::ResolvedConfig.from_hash(JSON.parse(File.read("woods.json")))
+  #   config.dimension          # => 768
+  #   config.provider_signature # => "Ollama/nomic-embed-text@http://host.docker.internal:11434"
+  #
+  # @example Asserting compatibility before hydrating stores
+  #   stored = Woods::ResolvedConfig.from_hash(snapshot)
+  #   live   = Woods::ResolvedConfig.from_configuration(Woods.configuration)
+  #   live.assert_compatible!(stored)
+  class ResolvedConfig # rubocop:disable Metrics/ClassLength
+    # The highest schema version this gem release can read, and the version it writes.
+    SCHEMA_VERSION_SUPPORTED = 1
+    # @return [Integer]
+    attr_reader :schema_version
+    # @return [String] Gem version at embed time (e.g. "1.2.0")
+    attr_reader :gem_version
+    # @return [Time]
+    attr_reader :created_at
+    # @return [Hash] Provider details — :class, :model, :host, :num_ctx, :read_timeout, :dimension
+    attr_reader :embedding_provider
+    # @return [Hash] Store types — :vector_store, :metadata_store, :graph_store (Symbols)
+    attr_reader :stores
+    # Parse a +woods.json+ hash into a {ResolvedConfig}.
+    #
+    # @param raw [Hash] Parsed JSON hash (string or symbol keys)
+    # @return [ResolvedConfig]
+    # @raise [Woods::MCP::UnsupportedArtifact] if schema_version is not supported
+    def self.from_hash(raw)
+      data = normalize_keys(raw)
+      validate_schema_version!(data[:schema_version].to_i)
+      new(
+        schema_version: data[:schema_version].to_i,
+        gem_version: data[:gem_version].to_s,
+        created_at: parse_time(data[:created_at]),
+        embedding_provider: parse_provider(data[:embedding_provider] || {}),
+        stores: parse_stores(data[:stores] || {})
+      )
+    end
+    # Capture the current {Woods::Configuration} as a {ResolvedConfig}.
+    #
+    # The +provider:+ kwarg lets callers pass a live embedding provider so
+    # the dimension is discovered at runtime instead of being read from a
+    # declared-only field. This matters for Ollama — dimensions come from
+    # the model, not the config — and doesn't hurt OpenAI, whose provider
+    # exposes the same +#dimensions+ interface.
+    #
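+    # A declared +config.embedding_options[:dimension]+ takes precedence over
+    # the live provider, which is probed only when no dimension is declared.
+    # Omitting +provider:+ is therefore fine for specs and for offline
+    # ResolvedConfig construction where no provider exists; with neither a
+    # declared dimension nor a provider, the captured dimension is 0.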
+    #
+    # @param config [Woods::Configuration]
+    # @param gem_version [String] Defaults to {Woods::VERSION}
+    # @param provider [#dimensions, nil] Optional live provider to probe
+    #   for dimension when +config.embedding_options[:dimension]+ is absent.
+    # @return [ResolvedConfig]
+    def self.from_configuration(config, gem_version: nil, provider: nil) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
+      require_relative 'version'
+      opts = config.embedding_options || {}
+      declared_dim = opts[:dimension] || opts['dimension']
+      dim = declared_dim || (provider.respond_to?(:dimensions) ? provider.dimensions : nil)
+      provider_hash = {
+        class: resolve_provider_class(config.embedding_provider),
+        model: (opts[:model] || opts['model'] || config.embedding_model).to_s,
+        dimension: dim.to_i,
+        host: opts[:host] || opts['host'],
+        num_ctx: opts[:num_ctx] || opts['num_ctx'],
+        read_timeout: opts[:read_timeout] || opts['read_timeout']
+      }.compact
+      new(
+        schema_version: SCHEMA_VERSION_SUPPORTED,
+        gem_version: (gem_version || Woods::VERSION).to_s,
+        created_at: Time.now.utc,
+        embedding_provider: provider_hash,
+        stores: {
+          vector_store: config.vector_store,
+          metadata_store: config.metadata_store,
+          graph_store: config.graph_store
+        }
+      )
+    end
+    # @param schema_version [Integer]
+    # @param gem_version [String]
+    # @param created_at [Time]
+    # @param embedding_provider [Hash]
+    # @param stores [Hash]
+    def initialize(schema_version:, gem_version:, created_at:, embedding_provider:, stores:)
+      @schema_version = schema_version
+      @gem_version = gem_version.to_s.freeze
+      @created_at = created_at
+      @embedding_provider = deep_freeze(embedding_provider)
+      @stores = deep_freeze(stores)
+      freeze
+    end
+    # @return [Integer] Embedding dimension declared by the provider
+    def dimension
+      embedding_provider[:dimension].to_i
+    end
+    # Short string identifying this provider configuration, useful for log
+    # messages and {ConfigMismatch} error text.
+    #
+    # @return [String] e.g. "Ollama/nomic-embed-text@http://host.docker.internal:11434"
+    def provider_signature
+      klass = embedding_provider[:class].to_s.split('::').last
+      model = embedding_provider[:model]
+      host  = embedding_provider[:host]
+      host ? "#{klass}/#{model}@#{host}" : "#{klass}/#{model}"
+    end
+    # Returns +true+ if +other+ uses the same provider class, model, dimension,
+    # and store types. Ignores schema version, gem version, host, read_timeout,
+    # num_ctx, and created_at.
+    #
+    # @param other [ResolvedConfig]
+    # @return [Boolean]
+    def matches?(other)
+      embedding_provider[:class] == other.embedding_provider[:class] &&
+        embedding_provider[:model] == other.embedding_provider[:model] &&
+        dimension == other.dimension &&
+        stores[:vector_store] == other.stores[:vector_store] &&
+        stores[:metadata_store] == other.stores[:metadata_store] &&
+        stores[:graph_store] == other.stores[:graph_store]
+    end
+    # Assert that +stored_config+ (the config captured at embed time) is
+    # compatible with +self+ (the live host config). Raises typed errors
+    # so the operator can diagnose the mismatch without reading source.
+    #
+    # @param stored_config [ResolvedConfig]
+    # @raise [Woods::MCP::DimensionMismatch] if dimensions differ
+    # @raise [Woods::MCP::ConfigMismatch] if provider class or model differs
+    # @return [void]
+    def assert_compatible!(stored_config)
+      assert_dimensions_match!(stored_config) # checked first: a dimension mismatch is reported before a provider mismatch
+      assert_provider_matches!(stored_config)
+    end
+    # Serialize to a Hash suitable for +JSON.generate+ and round-trippable
+    # through {.from_hash}.
+    #
+    # @return [Hash]
+    def to_snapshot_json
+      {
+        'schema_version' => schema_version,
+        'gem_version' => gem_version,
+        'created_at' => created_at.iso8601,
+        'embedding_provider' => embedding_provider.transform_keys(&:to_s),
+        'stores' => stores.transform_keys(&:to_s).transform_values(&:to_s)
+      }
+    end
+    # @return [Hash]
+    def to_h
+      to_snapshot_json.freeze # shallow freeze: only the outer Hash is frozen here
+    end
+    private
+    # Recursively freeze a Hash and every Hash/Array/String it transitively
+    # holds. The previous shallow `.freeze` left nested Hash values mutable
+    # — a caller reaching `config.embedding_provider[:options][:foo] = …`
+    # could mutate the supposedly-immutable snapshot. Public ResolvedConfig
+    # is documented as a frozen Whole Value; this enforces it.
+    def deep_freeze(obj) # rubocop:disable Metrics/CyclomaticComplexity
+      case obj
+      when Hash
+        obj.each_pair do |k, v|
+          deep_freeze(k)
+          deep_freeze(v)
+        end
+        obj.frozen? ? obj : obj.freeze
+      when Array
+        obj.each { |v| deep_freeze(v) }
+        obj.frozen? ? obj : obj.freeze
+      when String
+        obj.frozen? ? obj : obj.dup.freeze
+      else
+        obj
+      end
+    end
+    def assert_dimensions_match!(stored_config)
+      return if dimension == stored_config.dimension
+      raise Woods::MCP::DimensionMismatch.new(
+        "Provider dimension #{dimension} does not match stored dimension #{stored_config.dimension}. " \
+        'Re-run `rake woods:embed` to rebuild the index.',
+        details: {
+          expected: stored_config.dimension,
+          actual: dimension,
+          stored_at: stored_config.created_at.iso8601
+        }
+      )
+    end
+    def assert_provider_matches!(stored_config)
+      return if embedding_provider[:class] == stored_config.embedding_provider[:class] &&
+                embedding_provider[:model] == stored_config.embedding_provider[:model]
+      raise Woods::MCP::ConfigMismatch.new(
+        "Host provider #{provider_signature} does not match stored provider #{stored_config.provider_signature}. " \
+        'Re-run `rake woods:embed` or align host configuration.',
+        details: {
+          host: provider_signature,
+          stored: stored_config.provider_signature,
+          stored_at: stored_config.created_at.iso8601
+        }
+      )
+    end
+    class << self
+      private
+      def validate_schema_version!(version)
+        # Forwards-compatibility rule: accept any version at or below the
+        # supported ceiling. An older dump (schema_version 1) must still
+        # load cleanly on a newer gem (schema_version 2), matching the
+        # behaviour of the binary snapshotters (vectors.bin, metadata.msgpack)
+        # which both use `<=`.
+        return if version.positive? && version <= SCHEMA_VERSION_SUPPORTED
+        raise Woods::MCP::UnsupportedArtifact.new(
+          "woods.json schema_version #{version} is not supported (supported: #{SCHEMA_VERSION_SUPPORTED})",
+          details: { found: version, supported: SCHEMA_VERSION_SUPPORTED }
+        )
+      end
+      def parse_provider(raw)
+        data = normalize_keys(raw)
+        {
+          class: data[:class].to_s,
+          model: data[:model].to_s,
+          dimension: data[:dimension].to_i,
+          host: data[:host],
+          num_ctx: data[:num_ctx],
+          read_timeout: data[:read_timeout]
+        }.compact
+      end
+      def parse_stores(raw)
+        data = normalize_keys(raw)
+        {
+          vector_store: data[:vector_store]&.to_sym,
+          metadata_store: data[:metadata_store]&.to_sym,
+          graph_store: data[:graph_store]&.to_sym
+        }
+      end
+      def parse_time(value)
+        value ? Time.parse(value.to_s) : Time.now.utc
+      end
+      def normalize_keys(hash)
+        hash.transform_keys(&:to_sym) # top level only: nested hashes keep their original keys
+      end
+      def resolve_provider_class(provider)
+        case provider
+        when :openai then 'Woods::Embedding::Provider::OpenAI'
+        when :ollama then 'Woods::Embedding::Provider::Ollama'
+        when String  then provider
+        when Class   then provider.name
+        when nil     then ''
+        else provider.to_s
+        end
+      end
+    end
+  end
+end

data/lib/woods/retrieval/context_assembler.rb CHANGED Viewed

@@ -1,5 +1,8 @@
 # frozen_string_literal: true
+require_relative 'search_executor'
+require_relative '../token_utils'
 module Woods
   module Retrieval
     # Transforms ranked search candidates into a token-budgeted context string
@@ -34,13 +37,48 @@ module Woods
       # Minimum token count for a section to be worth including.
       MIN_USEFUL_TOKENS = 200
+      # Default chars-per-token ratio. Delegates to {Woods::TokenUtils} —
+      # the single source of truth — which uses 4.0 (OpenAI / tiktoken
+      # cl100k_base average for Ruby source; see docs/TOKEN_BENCHMARK.md).
+      # Callers embedding with BERT/WordPiece tokenizers (nomic-embed-text,
+      # bge-*) should pass the tighter ratio from their TextPreparer
+      # (~1.5–2.5) so truncation stays honest for that provider — or use
+      # {TokenUtils.chars_per_token_for(:ollama)} for the shipped default.
+      DEFAULT_CHARS_PER_TOKEN = TokenUtils::DEFAULT_CHARS_PER_TOKEN
       # @param metadata_store [#find] Store that resolves identifiers to unit data
       # @param budget [Integer] Total token budget
-      def initialize(metadata_store:, budget: DEFAULT_BUDGET)
+      # @param chars_per_token [Float] Tokenizer-calibrated char/token ratio used
+      #   for truncation sizing. Match this to the embedding provider in use —
+      #   {Woods::Embedding::TextPreparer#chars_per_token} exposes the live
+      #   value from the indexing-time preparer.
+      # @param token_counter [#count, nil] Optional exact tokenizer (typically
+      #   {Woods::Embedding::TokenCounter}). When provided, token estimation
+      #   uses the model's real WordPiece/BPE output instead of the
+      #   `chars / chars_per_token` heuristic, which matters most for the
+      #   Ollama path (ratios vary widely across Rails source, 1.5–2.5).
+      #   The heuristic remains the fallback when the counter is nil or the
+      #   tokenizer gem isn't installed.
+      def initialize(metadata_store:, budget: DEFAULT_BUDGET,
+                     chars_per_token: DEFAULT_CHARS_PER_TOKEN,
+                     token_counter: nil)
         @metadata_store = metadata_store
         @budget = budget
+        # Guard against 0 / negative / NaN ratios — any of those would make
+        # `estimate_tokens` div-by-zero or return a negative budget, which
+        # would silently truncate every section to empty. Fall back to the
+        # default ratio rather than propagate the bogus input.
+        ratio = chars_per_token.to_f
+        @chars_per_token = ratio.positive? ? ratio : DEFAULT_CHARS_PER_TOKEN
+        @token_counter = token_counter
       end
+      # @return [Float] the configured chars-per-token ratio
+      attr_reader :chars_per_token
+      # @return [#count, nil] the exact tokenizer, if one was injected
+      attr_reader :token_counter
       # Assemble context from ranked candidates within token budget.
       #
       # @param candidates [Array<Candidate>] Ranked search candidates
@@ -54,6 +92,13 @@ module Woods
         sources = []
         tokens_used = 0
+        # Collapse +User#chunk_0+, +User#chunk_1+, … back to their base unit
+        # BEFORE metadata lookup and section assembly. Chunk IDs are an
+        # embedding-side concern — the metadata store is keyed by the base
+        # identifier, and callers don't want the same unit formatted twice
+        # just because multiple chunks matched the query.
+        candidates = collapse_chunk_candidates(candidates)
         # Pre-fetch all candidate metadata in one batch query
         @unit_cache = @metadata_store.find_batch(candidates.map(&:identifier))
@@ -78,6 +123,46 @@ module Woods
       private
+      # Suffix the Indexer appends when a single unit is split into multiple
+      # embedding vectors (rails_source and other large units). Separator
+      # is +#+ so it can never collide with a Ruby constant (+::+) or a
+      # method ref (+#instance_method+) in an identifier.
+      CHUNK_SUFFIX_PATTERN = /#chunk_\d+\z/
+      private_constant :CHUNK_SUFFIX_PATTERN
+      # Strip the +#chunk_N+ suffix from an identifier, if present.
+      # +User#chunk_3+ → +User+; +User+ stays +User+.
+      def base_identifier(identifier)
+        identifier.sub(CHUNK_SUFFIX_PATTERN, '') # sub returns a new String; the argument is not modified
+      end
+      # Rewrite every candidate to point at its base identifier and keep only
+      # the highest-scoring candidate per base unit. Preserves original score
+      # ordering on the output so downstream +sort_by(-score)+ gets the same
+      # input it would on an unchunked corpus.
+      def collapse_chunk_candidates(candidates)
+        best = {}
+        candidates.each do |c|
+          base = base_identifier(c.identifier)
+          rewritten = c.identifier == base ? c : rewrite_identifier(c, base)
+          best[base] = rewritten if best[base].nil? || rewritten.score > best[base].score
+        end
+        best.values
+      end
+      # Return a clone of +candidate+ with its identifier replaced. Kept as
+      # its own method so the Candidate struct shape is referenced in exactly
+      # one place — if SearchExecutor::Candidate grows fields, this is the
+      # only spot to update.
+      def rewrite_identifier(candidate, new_identifier)
+        SearchExecutor::Candidate.new(
+          identifier: new_identifier,
+          score: candidate.score,
+          source: candidate.source,
+          metadata: candidate.metadata
+        )
+      end
       # Add structural context section if provided.
       #
       # @return [Integer] Updated tokens_used count
@@ -224,17 +309,39 @@ module Woods
       def truncate_to_budget(text, token_budget)
         return text if estimate_tokens(text) <= token_budget
-        # Estimate target character count with 10% safety margin
-        target_chars = (token_budget * 4.0 * 0.9).to_i
+        # Target-char sizing uses the effective ratio: the provider's live
+        # ratio when we have an exact counter, otherwise @chars_per_token.
+        # 10 % safety margin keeps us below the budget after the imprecise
+        # tokenizer runs again on the truncated output.
+        target_chars = (token_budget * effective_chars_per_token * 0.9).to_i
         "#{text[0...target_chars]}\n... [truncated]"
       end
-      # Estimate token count using the project convention.
+      # Estimate token count. Prefers the injected {TokenCounter} — which
+      # loads the provider's real tokenizer and returns exact counts — and
+      # falls back to the configured chars-per-token ratio when no counter
+      # is wired.
       #
       # @param text [String]
       # @return [Integer]
       def estimate_tokens(text)
-        (text.length / 4.0).ceil
+        return 0 if text.nil? || text.empty?
+        return @token_counter.count(text) if @token_counter
+        (text.length / @chars_per_token).ceil
+      end
+      # Effective chars-per-token for chunk-size sizing. When an exact
+      # counter is present, prefer its native ratio (e.g. 1.2 for
+      # nomic-embed-text) so truncation and estimation agree. Falls back
+      # to the configured ratio if the counter reports 0 or a non-positive
+      # value (which would make truncation target zero chars).
+      def effective_chars_per_token
+        if @token_counter.respond_to?(:chars_per_token) && @token_counter.chars_per_token
+          ratio = @token_counter.chars_per_token.to_f
+          return ratio if ratio.positive?
+        end
+        @chars_per_token
       end
       # Build the final AssembledContext result.

data/lib/woods/retrieval/query_classifier.rb CHANGED Viewed

@@ -31,7 +31,7 @@ module Woods
         implement: /\b(implement|add|create|build|write|make|generate)\b/i,
         compare: /\b(compare|difference|vs|versus|between|contrast)\b/i,
         # rubocop:disable Layout/LineLength
-        framework: /\b(how does rails|what does rails|rails .+ work|work.+\brails\b|in rails\b|activerecord|actioncontroller|activejob)\b/i,
+        framework: /\b(how does rails|what does rails|rails .+ work|work.+\brails\b|in rails\b|activerecord|actioncontroller|activejob|actionmailer|actioncable|actiontext|activestorage|solid_queue|solid_cache|solid_cable|kamal|propshaft|importmap|hotwire|turbo|stimulus|zeitwerk)\b/i,
         # rubocop:enable Layout/LineLength
         reference: /\b(show me|what is|what are|list|options for|api|interface|signature)\b/i,
         understand: /\b(how|why|explain|understand|what happens|describe|overview)\b/i

data/lib/woods/retrieval/ranker.rb CHANGED Viewed

@@ -35,8 +35,11 @@ module Woods
       RRF_K = 60
       # @param metadata_store [#find] Store that resolves identifiers to unit metadata
-      def initialize(metadata_store:)
+      # @param graph_store [#pagerank, nil] Optional graph store exposing PageRank scores.
+      #   When present, PageRank rank-percentile replaces the bucketed importance signal.
+      def initialize(metadata_store:, graph_store: nil)
         @metadata_store = metadata_store
+        @graph_store = graph_store
       end
       # Rank candidates by weighted signal scoring with diversity adjustment.
@@ -89,8 +92,9 @@ module Woods
         candidates.group_by(&:source).each_value do |source_candidates|
           ranked = source_candidates.sort_by { |c| -c.score }
-          ranked.each_with_index do |candidate, rank|
-            rrf_scores[candidate.identifier] += 1.0 / (RRF_K + rank)
+          ranked.each_with_index do |candidate, idx|
+            # RRF is 1-based (Cormack et al., 2009): top-ranked doc uses rank 1, not 0.
+            rrf_scores[candidate.identifier] += 1.0 / (RRF_K + idx + 1)
             metadata_map[candidate.identifier] ||= candidate.metadata
           end
         end
@@ -102,7 +106,14 @@ module Woods
       #
       # @return [Array<Candidate>]
       def rebuild_rrf_candidates(candidates, rrf_scores, metadata_map)
-        original_by_id = candidates.index_by(&:identifier)
+        # Plain-Ruby `index_by` substitute — the ActiveSupport version
+        # isn't loaded when the gem runs outside a Rails boot. Preserve
+        # last-wins semantics to match ActiveSupport's `Enumerable#index_by`
+        # so the merged candidate's `source` continues to reflect the
+        # final source a given identifier appeared in (relevant when
+        # observability/debug tools read `.source` on an RRF result).
+        original_by_id = {}
+        candidates.each { |c| original_by_id[c.identifier] = c }
         rrf_scores.sort_by { |_id, score| -score }.map do |identifier, score|
           original = original_by_id[identifier]
           build_candidate(
@@ -133,7 +144,7 @@ module Woods
               semantic: candidate.score.to_f,
               keyword: keyword_score(candidate),
               recency: recency_score(unit),
-              importance: importance_score(unit),
+              importance: importance_score(unit, candidate.identifier),
               type_match: type_match_score(unit, classification),
               diversity: 1.0 # Adjusted after initial sort
             }
@@ -184,9 +195,18 @@ module Woods
       # Importance score based on PageRank / structural importance.
       #
+      # Prefers live PageRank from the graph store (rank-percentile 0.0–1.0) when
+      # available. Falls back to bucketed importance metadata (`:high`/`:medium`/`:low`)
+      # when there is no graph store, the PageRank map is empty, or the identifier
+      # is not yet indexed (e.g., a new unit since the last extraction).
+      #
       # @param unit [Hash, nil] Unit metadata from store
+      # @param identifier [String] Candidate identifier (matched against PageRank keys)
       # @return [Float] 0.0 to 1.0
-      def importance_score(unit)
+      def importance_score(unit, identifier)
+        pagerank = pagerank_importance_map[identifier]
+        return pagerank if pagerank
         return 0.5 unless unit
         importance = dig_metadata(unit, :importance)
@@ -198,6 +218,35 @@ module Woods
         end
       end
+      # Lazily-computed rank-percentile map derived from the graph store's PageRank.
+      #
+      # Top-ranked identifier gets 1.0, bottom-ranked gets 1/n. Identifiers absent
+      # from PageRank (new units, ephemeral candidates) return nil and fall back
+      # to the bucketed importance signal.
+      #
+      # @return [Hash{String => Float}]
+      def pagerank_importance_map
+        @pagerank_importance_map ||= compute_pagerank_importance_map # an empty Hash is truthy, so an empty result is memoized too
+      end
+      # Compute rank-percentile scores from the graph store's PageRank hash.
+      #
+      # @return [Hash{String => Float}] Empty hash when no graph store or no scores.
+      def compute_pagerank_importance_map
+        return {} unless @graph_store.respond_to?(:pagerank)
+        scores = @graph_store.pagerank
+        return {} if scores.nil? || scores.empty?
+        ranked = scores.sort_by { |_id, score| -score }
+        total = ranked.size.to_f
+        ranked.each_with_index.to_h do |(identifier, _score), rank|
+          [identifier, 1.0 - (rank / total)]
+        end
+      rescue StandardError
+        {}
+      end
       # Type match score — bonus when result type matches query target_type.
       #
       # @param unit [Hash, nil] Unit metadata from store

data/lib/woods/retrieval/search_executor.rb CHANGED Viewed

@@ -56,10 +56,27 @@ module Woods
       # @param query [String] The original query text
       # @param classification [QueryClassifier::Classification] Classified query
       # @param limit [Integer] Maximum candidates to return
+      # @param type_filter [Array<String>, nil] When set, vector and hybrid
+      #   strategies push this down into the vector store's metadata
+      #   filter — used by {Retriever#retrieve} to rank-within-type when
+      #   the unfiltered global top-K had no candidate of the requested type.
+      #   Overrides the classifier-derived +target_type+ in filter construction.
+      # @param strategy [Symbol, nil] Override the classifier-selected strategy.
+      #   {Retriever#within_type_fallback} passes +:vector+ here because the
+      #   vector path is the only one that honors +type_filter+; if the
+      #   classifier picked +:keyword+ / +:graph+ / +:direct+ the fallback
+      #   would otherwise silently re-run the same strategy, get filtered to
+      #   empty, and violate the "never empty when units exist" contract.
       # @return [ExecutionResult] Candidates with strategy metadata
-      def execute(query:, classification:, limit: 20)
-        strategy = select_strategy(classification)
-        candidates = run_strategy(strategy, query: query, classification: classification, limit: limit)
+      def execute(query:, classification:, limit: 20, type_filter: nil, strategy: nil)
+        strategy ||= select_strategy(classification)
+        candidates = run_strategy(
+          strategy,
+          query: query,
+          classification: classification,
+          limit: limit,
+          type_filter: type_filter
+        )
         ExecutionResult.new(
           candidates: candidates.first(limit),
@@ -104,17 +121,18 @@ module Woods
       # @param query [String] Original query text
       # @param classification [QueryClassifier::Classification]
       # @param limit [Integer] Max results
+      # @param type_filter [Array<String>, nil] Pushed into vector filters
       # @return [Array<Candidate>]
-      def run_strategy(strategy, query:, classification:, limit:)
+      def run_strategy(strategy, query:, classification:, limit:, type_filter: nil)
         case strategy
         when :vector
-          execute_vector(query, classification: classification, limit: limit)
+          execute_vector(query, classification: classification, limit: limit, type_filter: type_filter)
         when :keyword
           execute_keyword(classification: classification, limit: limit)
         when :graph
           execute_graph(classification: classification, limit: limit)
         when :hybrid
-          execute_hybrid(query, classification: classification, limit: limit)
+          execute_hybrid(query, classification: classification, limit: limit, type_filter: type_filter)
         when :direct
           execute_direct(classification: classification, limit: limit)
         end
@@ -123,9 +141,9 @@ module Woods
       # Vector strategy: embed the query and search by similarity.
       #
       # @return [Array<Candidate>]
-      def execute_vector(query, classification:, limit:)
+      def execute_vector(query, classification:, limit:, type_filter: nil)
         query_vector = @embedding_provider.embed(query)
-        filters = build_vector_filters(classification)
+        filters = build_vector_filters(classification, type_filter: type_filter)
         results = @vector_store.search(query_vector, limit: limit, filters: filters)
         results.map do |r|
@@ -209,9 +227,10 @@ module Woods
       # Hybrid strategy: combine vector, keyword, and graph expansion.
       #
       # @return [Array<Candidate>]
-      def execute_hybrid(query, classification:, limit:)
+      def execute_hybrid(query, classification:, limit:, type_filter: nil)
         # Gather from all three sources
-        vector_candidates = execute_vector(query, classification: classification, limit: limit)
+        vector_candidates = execute_vector(query, classification: classification, limit: limit,
+                                                  type_filter: type_filter)
         keyword_candidates = execute_keyword(classification: classification, limit: limit)
         # Graph expansion on top vector results
@@ -266,13 +285,23 @@ module Woods
         candidates
       end
-      # Build metadata filters for vector search based on classification.
+      # Build metadata filters for vector search based on classification
+      # and an optional explicit type filter from the caller.
+      #
+      # The caller's explicit +type_filter+ overrides classifier-derived
+      # +target_type+ when both are present — the caller opted into a
+      # specific set of types and that intent beats a heuristic.
       #
       # @param classification [QueryClassifier::Classification]
+      # @param type_filter [Array<String>, nil]
       # @return [Hash]
-      def build_vector_filters(classification)
+      def build_vector_filters(classification, type_filter: nil)
         filters = {}
-        filters[:type] = classification.target_type.to_s if classification.target_type
+        if type_filter && !type_filter.empty?
+          filters[:type] = type_filter.map(&:to_s)
+        elsif classification.target_type
+          filters[:type] = classification.target_type.to_s
+        end
         filters
       end