woods 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +169 -0
  3. data/README.md +20 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +15 -50
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +6 -0
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +3 -4
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +737 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +1 -1
  102. data/lib/woods/unblocked/document_builder.rb +35 -10
  103. data/lib/woods/unblocked/exporter.rb +1 -1
  104. data/lib/woods/util/host_guard.rb +61 -0
  105. data/lib/woods/version.rb +1 -1
  106. data/lib/woods.rb +126 -6
  107. metadata +69 -4
@@ -0,0 +1,299 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'time'
5
+ require_relative 'mcp/errors'
6
+
7
+ module Woods
8
+ # Immutable Whole Value representing the resolved embedding configuration
9
+ # captured at embed time and read back by the MCP server at boot.
10
+ #
11
+ # This is NOT a declared-config bag of fields — it records what was
12
+ # *actually used* during embedding (provider class, model, host, dimension,
13
+ # store types). The MCP server compares a stored {ResolvedConfig} against
14
+ # the current host config to detect incompatible re-deployments.
15
+ #
16
+ # Build via {.from_hash} (parses +woods.json+) or {.from_configuration}
17
+ # (captures the current {Woods::Configuration} at embed time).
18
+ #
19
+ # @example Parsing woods.json
20
+ # config = Woods::ResolvedConfig.from_hash(JSON.parse(File.read("woods.json")))
21
+ # config.dimension # => 768
22
+ # config.provider_signature # => "Ollama/nomic-embed-text@http://host.docker.internal:11434"
23
+ #
24
+ # @example Asserting compatibility before hydrating stores
25
+ # stored = Woods::ResolvedConfig.from_hash(snapshot)
26
+ # live = Woods::ResolvedConfig.from_configuration(Woods.configuration)
27
+ # live.assert_compatible!(stored)
28
+ class ResolvedConfig # rubocop:disable Metrics/ClassLength
29
+ # The only schema version this gem release can read or write.
30
+ SCHEMA_VERSION_SUPPORTED = 1
31
+
32
+ # @return [Integer]
33
+ attr_reader :schema_version
34
+
35
+ # @return [String] Gem version at embed time (e.g. "1.2.0")
36
+ attr_reader :gem_version
37
+
38
+ # @return [Time]
39
+ attr_reader :created_at
40
+
41
+ # @return [Hash] Provider details — :class, :model, :host, :num_ctx, :read_timeout, :dimension
42
+ attr_reader :embedding_provider
43
+
44
+ # @return [Hash] Store types — :vector_store, :metadata_store, :graph_store (Symbols)
45
+ attr_reader :stores
46
+
47
+ # Parse a +woods.json+ hash into a {ResolvedConfig}.
48
+ #
49
+ # @param raw [Hash] Parsed JSON hash (string or symbol keys)
50
+ # @return [ResolvedConfig]
51
+ # @raise [Woods::MCP::UnsupportedArtifact] if schema_version is not supported
52
+ def self.from_hash(raw)
53
+ data = normalize_keys(raw)
54
+ validate_schema_version!(data[:schema_version].to_i)
55
+
56
+ new(
57
+ schema_version: data[:schema_version].to_i,
58
+ gem_version: data[:gem_version].to_s,
59
+ created_at: parse_time(data[:created_at]),
60
+ embedding_provider: parse_provider(data[:embedding_provider] || {}),
61
+ stores: parse_stores(data[:stores] || {})
62
+ )
63
+ end
64
+
65
+ # Capture the current {Woods::Configuration} as a {ResolvedConfig}.
66
+ #
67
+ # The +provider:+ kwarg lets callers pass a live embedding provider so
68
+ # the dimension is discovered at runtime instead of being read from a
69
+ # declared-only field. This matters for Ollama — dimensions come from
70
+ # the model, not the config — and doesn't hurt OpenAI, whose provider
71
+ # exposes the same +#dimensions+ interface.
72
+ #
73
+ # When +provider:+ is omitted, dimension falls back to
74
+ # +config.embedding_options[:dimension]+ (useful for specs and for
75
+ # offline ResolvedConfig construction where no provider exists).
76
+ #
77
+ # @param config [Woods::Configuration]
78
+ # @param gem_version [String] Defaults to {Woods::VERSION}
79
+ # @param provider [#dimensions, nil] Optional live provider to probe
80
+ # for dimension when +config.embedding_options[:dimension]+ is absent.
81
+ # @return [ResolvedConfig]
82
+ def self.from_configuration(config, gem_version: nil, provider: nil) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
83
+ require_relative 'version'
84
+
85
+ opts = config.embedding_options || {}
86
+ declared_dim = opts[:dimension] || opts['dimension']
87
+ dim = declared_dim || (provider.respond_to?(:dimensions) ? provider.dimensions : nil)
88
+
89
+ provider_hash = {
90
+ class: resolve_provider_class(config.embedding_provider),
91
+ model: (opts[:model] || opts['model'] || config.embedding_model).to_s,
92
+ dimension: dim.to_i,
93
+ host: opts[:host] || opts['host'],
94
+ num_ctx: opts[:num_ctx] || opts['num_ctx'],
95
+ read_timeout: opts[:read_timeout] || opts['read_timeout']
96
+ }.compact
97
+
98
+ new(
99
+ schema_version: SCHEMA_VERSION_SUPPORTED,
100
+ gem_version: (gem_version || Woods::VERSION).to_s,
101
+ created_at: Time.now.utc,
102
+ embedding_provider: provider_hash,
103
+ stores: {
104
+ vector_store: config.vector_store,
105
+ metadata_store: config.metadata_store,
106
+ graph_store: config.graph_store
107
+ }
108
+ )
109
+ end
110
+
111
+ # @param schema_version [Integer]
112
+ # @param gem_version [String]
113
+ # @param created_at [Time]
114
+ # @param embedding_provider [Hash]
115
+ # @param stores [Hash]
116
+ def initialize(schema_version:, gem_version:, created_at:, embedding_provider:, stores:)
117
+ @schema_version = schema_version
118
+ @gem_version = gem_version.to_s.freeze
119
+ @created_at = created_at
120
+ @embedding_provider = deep_freeze(embedding_provider)
121
+ @stores = deep_freeze(stores)
122
+ freeze
123
+ end
124
+
125
+ # @return [Integer] Embedding dimension declared by the provider
126
+ def dimension
127
+ embedding_provider[:dimension].to_i
128
+ end
129
+
130
+ # Short string identifying this provider configuration, useful for log
131
+ # messages and {ConfigMismatch} error text.
132
+ #
133
+ # @return [String] e.g. "Ollama/nomic-embed-text@http://host.docker.internal:11434"
134
+ def provider_signature
135
+ klass = embedding_provider[:class].to_s.split('::').last
136
+ model = embedding_provider[:model]
137
+ host = embedding_provider[:host]
138
+ host ? "#{klass}/#{model}@#{host}" : "#{klass}/#{model}"
139
+ end
140
+
141
+ # Returns +true+ if +other+ uses the same provider class, model, dimension,
142
+ # and store types. Ignores gem version, read_timeout, num_ctx, and created_at.
143
+ #
144
+ # @param other [ResolvedConfig]
145
+ # @return [Boolean]
146
+ def matches?(other)
147
+ embedding_provider[:class] == other.embedding_provider[:class] &&
148
+ embedding_provider[:model] == other.embedding_provider[:model] &&
149
+ dimension == other.dimension &&
150
+ stores[:vector_store] == other.stores[:vector_store] &&
151
+ stores[:metadata_store] == other.stores[:metadata_store] &&
152
+ stores[:graph_store] == other.stores[:graph_store]
153
+ end
154
+
155
+ # Assert that +stored_config+ (the config captured at embed time) is
156
+ # compatible with +self+ (the live host config). Raises typed errors
157
+ # so the operator can diagnose the mismatch without reading source.
158
+ #
159
+ # @param stored_config [ResolvedConfig]
160
+ # @raise [Woods::MCP::DimensionMismatch] if dimensions differ
161
+ # @raise [Woods::MCP::ConfigMismatch] if provider class or model differs
162
+ # @return [void]
163
+ def assert_compatible!(stored_config)
164
+ assert_dimensions_match!(stored_config)
165
+ assert_provider_matches!(stored_config)
166
+ end
167
+
168
+ # Serialize to a Hash suitable for +JSON.generate+ and round-trippable
169
+ # through {.from_hash}.
170
+ #
171
+ # @return [Hash]
172
+ def to_snapshot_json
173
+ {
174
+ 'schema_version' => schema_version,
175
+ 'gem_version' => gem_version,
176
+ 'created_at' => created_at.iso8601,
177
+ 'embedding_provider' => embedding_provider.transform_keys(&:to_s),
178
+ 'stores' => stores.transform_keys(&:to_s).transform_values(&:to_s)
179
+ }
180
+ end
181
+
182
+ # @return [Hash]
183
+ def to_h
184
+ to_snapshot_json.freeze
185
+ end
186
+
187
+ private
188
+
189
+ # Recursively freeze a Hash and every Hash/Array/String it transitively
190
+ # holds. The previous shallow `.freeze` left nested Hash values mutable
191
+ # — a caller reaching `config.embedding_provider[:options][:foo] = …`
192
+ # could mutate the supposedly-immutable snapshot. Public ResolvedConfig
193
+ # is documented as a frozen Whole Value; this enforces it.
194
+ def deep_freeze(obj) # rubocop:disable Metrics/CyclomaticComplexity
195
+ case obj
196
+ when Hash
197
+ obj.each_pair do |k, v|
198
+ deep_freeze(k)
199
+ deep_freeze(v)
200
+ end
201
+ obj.frozen? ? obj : obj.freeze
202
+ when Array
203
+ obj.each { |v| deep_freeze(v) }
204
+ obj.frozen? ? obj : obj.freeze
205
+ when String
206
+ obj.frozen? ? obj : obj.dup.freeze
207
+ else
208
+ obj
209
+ end
210
+ end
211
+
212
+ def assert_dimensions_match!(stored_config)
213
+ return if dimension == stored_config.dimension
214
+
215
+ raise Woods::MCP::DimensionMismatch.new(
216
+ "Provider dimension #{dimension} does not match stored dimension #{stored_config.dimension}. " \
217
+ 'Re-run `rake woods:embed` to rebuild the index.',
218
+ details: {
219
+ expected: stored_config.dimension,
220
+ actual: dimension,
221
+ stored_at: stored_config.created_at.iso8601
222
+ }
223
+ )
224
+ end
225
+
226
+ def assert_provider_matches!(stored_config)
227
+ return if embedding_provider[:class] == stored_config.embedding_provider[:class] &&
228
+ embedding_provider[:model] == stored_config.embedding_provider[:model]
229
+
230
+ raise Woods::MCP::ConfigMismatch.new(
231
+ "Host provider #{provider_signature} does not match stored provider #{stored_config.provider_signature}. " \
232
+ 'Re-run `rake woods:embed` or align host configuration.',
233
+ details: {
234
+ host: provider_signature,
235
+ stored: stored_config.provider_signature,
236
+ stored_at: stored_config.created_at.iso8601
237
+ }
238
+ )
239
+ end
240
+
241
+ class << self
242
+ private
243
+
244
+ def validate_schema_version!(version)
245
+ # Forwards-compatibility rule: accept any version at or below the
246
+ # supported ceiling. An older dump (schema_version 1) must still
247
+ # load cleanly on a newer gem (schema_version 2), matching the
248
+ # behaviour of the binary snapshotters (vectors.bin, metadata.msgpack)
249
+ # which both use `<=`.
250
+ return if version.positive? && version <= SCHEMA_VERSION_SUPPORTED
251
+
252
+ raise Woods::MCP::UnsupportedArtifact.new(
253
+ "woods.json schema_version #{version} is not supported (supported: #{SCHEMA_VERSION_SUPPORTED})",
254
+ details: { found: version, supported: SCHEMA_VERSION_SUPPORTED }
255
+ )
256
+ end
257
+
258
+ def parse_provider(raw)
259
+ data = normalize_keys(raw)
260
+ {
261
+ class: data[:class].to_s,
262
+ model: data[:model].to_s,
263
+ dimension: data[:dimension].to_i,
264
+ host: data[:host],
265
+ num_ctx: data[:num_ctx],
266
+ read_timeout: data[:read_timeout]
267
+ }.compact
268
+ end
269
+
270
+ def parse_stores(raw)
271
+ data = normalize_keys(raw)
272
+ {
273
+ vector_store: data[:vector_store]&.to_sym,
274
+ metadata_store: data[:metadata_store]&.to_sym,
275
+ graph_store: data[:graph_store]&.to_sym
276
+ }
277
+ end
278
+
279
+ def parse_time(value)
280
+ value ? Time.parse(value.to_s) : Time.now.utc
281
+ end
282
+
283
+ def normalize_keys(hash)
284
+ hash.transform_keys(&:to_sym)
285
+ end
286
+
287
+ def resolve_provider_class(provider)
288
+ case provider
289
+ when :openai then 'Woods::Embedding::Provider::OpenAI'
290
+ when :ollama then 'Woods::Embedding::Provider::Ollama'
291
+ when String then provider
292
+ when Class then provider.name
293
+ when nil then ''
294
+ else provider.to_s
295
+ end
296
+ end
297
+ end
298
+ end
299
+ end
@@ -1,5 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative 'search_executor'
4
+ require_relative '../token_utils'
5
+
3
6
  module Woods
4
7
  module Retrieval
5
8
  # Transforms ranked search candidates into a token-budgeted context string
@@ -34,13 +37,48 @@ module Woods
34
37
  # Minimum token count for a section to be worth including.
35
38
  MIN_USEFUL_TOKENS = 200
36
39
 
40
+ # Default chars-per-token ratio. Delegates to {Woods::TokenUtils} —
41
+ # the single source of truth — which uses 4.0 (OpenAI / tiktoken
42
+ # cl100k_base average for Ruby source; see docs/TOKEN_BENCHMARK.md).
43
+ # Callers embedding with BERT/WordPiece tokenizers (nomic-embed-text,
44
+ # bge-*) should pass the tighter ratio from their TextPreparer
45
+ # (~1.5–2.5) so truncation stays honest for that provider — or use
46
+ # {TokenUtils.chars_per_token_for(:ollama)} for the shipped default.
47
+ DEFAULT_CHARS_PER_TOKEN = TokenUtils::DEFAULT_CHARS_PER_TOKEN
48
+
37
49
  # @param metadata_store [#find] Store that resolves identifiers to unit data
38
50
  # @param budget [Integer] Total token budget
39
- def initialize(metadata_store:, budget: DEFAULT_BUDGET)
51
+ # @param chars_per_token [Float] Tokenizer-calibrated char/token ratio used
52
+ # for truncation sizing. Match this to the embedding provider in use —
53
+ # {Woods::Embedding::TextPreparer#chars_per_token} exposes the live
54
+ # value from the indexing-time preparer.
55
+ # @param token_counter [#count, nil] Optional exact tokenizer (typically
56
+ # {Woods::Embedding::TokenCounter}). When provided, token estimation
57
+ # uses the model's real WordPiece/BPE output instead of the
58
+ # `chars / chars_per_token` heuristic, which matters most for the
59
+ # Ollama path (ratios vary widely across Rails source, 1.5–2.5).
60
+ # The heuristic remains the fallback when the counter is nil or the
61
+ # tokenizer gem isn't installed.
62
+ def initialize(metadata_store:, budget: DEFAULT_BUDGET,
63
+ chars_per_token: DEFAULT_CHARS_PER_TOKEN,
64
+ token_counter: nil)
40
65
  @metadata_store = metadata_store
41
66
  @budget = budget
67
+ # Guard against 0 / negative / NaN ratios — any of those would make
68
+ # `estimate_tokens` div-by-zero or return a negative budget, which
69
+ # would silently truncate every section to empty. Fall back to the
70
+ # default ratio rather than propagate the bogus input.
71
+ ratio = chars_per_token.to_f
72
+ @chars_per_token = ratio.positive? ? ratio : DEFAULT_CHARS_PER_TOKEN
73
+ @token_counter = token_counter
42
74
  end
43
75
 
76
+ # @return [Float] the configured chars-per-token ratio
77
+ attr_reader :chars_per_token
78
+
79
+ # @return [#count, nil] the exact tokenizer, if one was injected
80
+ attr_reader :token_counter
81
+
44
82
  # Assemble context from ranked candidates within token budget.
45
83
  #
46
84
  # @param candidates [Array<Candidate>] Ranked search candidates
@@ -54,6 +92,13 @@ module Woods
54
92
  sources = []
55
93
  tokens_used = 0
56
94
 
95
+ # Collapse +User#chunk_0+, +User#chunk_1+, … back to their base unit
96
+ # BEFORE metadata lookup and section assembly. Chunk IDs are an
97
+ # embedding-side concern — the metadata store is keyed by the base
98
+ # identifier, and callers don't want the same unit formatted twice
99
+ # just because multiple chunks matched the query.
100
+ candidates = collapse_chunk_candidates(candidates)
101
+
57
102
  # Pre-fetch all candidate metadata in one batch query
58
103
  @unit_cache = @metadata_store.find_batch(candidates.map(&:identifier))
59
104
 
@@ -78,6 +123,46 @@ module Woods
78
123
 
79
124
  private
80
125
 
126
+ # Suffix the Indexer appends when a single unit is split into multiple
127
+ # embedding vectors (rails_source and other large units). Separator
128
+ # is +#+ so it can never collide with a Ruby constant (+::+) or a
129
+ # method ref (+#instance_method+) in an identifier.
130
+ CHUNK_SUFFIX_PATTERN = /#chunk_\d+\z/
131
+ private_constant :CHUNK_SUFFIX_PATTERN
132
+
133
+ # Strip the +#chunk_N+ suffix from an identifier, if present.
134
+ # +User#chunk_3+ → +User+; +User+ stays +User+.
135
+ def base_identifier(identifier)
136
+ identifier.sub(CHUNK_SUFFIX_PATTERN, '')
137
+ end
138
+
139
+ # Rewrite every candidate to point at its base identifier and keep only
140
+ # the highest-scoring candidate per base unit. Preserves original score
141
+ # ordering on the output so downstream +sort_by(-score)+ gets the same
142
+ # input it would on an unchunked corpus.
143
+ def collapse_chunk_candidates(candidates)
144
+ best = {}
145
+ candidates.each do |c|
146
+ base = base_identifier(c.identifier)
147
+ rewritten = c.identifier == base ? c : rewrite_identifier(c, base)
148
+ best[base] = rewritten if best[base].nil? || rewritten.score > best[base].score
149
+ end
150
+ best.values
151
+ end
152
+
153
+ # Return a clone of +candidate+ with its identifier replaced. Kept as
154
+ # its own method so the Candidate struct shape is referenced in exactly
155
+ # one place — if SearchExecutor::Candidate grows fields, this is the
156
+ # only spot to update.
157
+ def rewrite_identifier(candidate, new_identifier)
158
+ SearchExecutor::Candidate.new(
159
+ identifier: new_identifier,
160
+ score: candidate.score,
161
+ source: candidate.source,
162
+ metadata: candidate.metadata
163
+ )
164
+ end
165
+
81
166
  # Add structural context section if provided.
82
167
  #
83
168
  # @return [Integer] Updated tokens_used count
@@ -224,17 +309,39 @@ module Woods
224
309
  def truncate_to_budget(text, token_budget)
225
310
  return text if estimate_tokens(text) <= token_budget
226
311
 
227
- # Estimate target character count with 10% safety margin
228
- target_chars = (token_budget * 4.0 * 0.9).to_i
312
+ # Target-char sizing uses the effective ratio: the provider's live
313
+ # ratio when we have an exact counter, otherwise @chars_per_token.
314
+ # 10% safety margin keeps us below the budget after the imprecise
315
+ # tokenizer runs again on the truncated output.
316
+ target_chars = (token_budget * effective_chars_per_token * 0.9).to_i
229
317
  "#{text[0...target_chars]}\n... [truncated]"
230
318
  end
231
319
 
232
- # Estimate token count using the project convention.
320
+ # Estimate token count. Prefers the injected {TokenCounter} — which
321
+ # loads the provider's real tokenizer and returns exact counts — and
322
+ # falls back to the configured chars-per-token ratio when no counter
323
+ # is wired.
233
324
  #
234
325
  # @param text [String]
235
326
  # @return [Integer]
236
327
  def estimate_tokens(text)
237
- (text.length / 4.0).ceil
328
+ return 0 if text.nil? || text.empty?
329
+ return @token_counter.count(text) if @token_counter
330
+
331
+ (text.length / @chars_per_token).ceil
332
+ end
333
+
334
+ # Effective chars-per-token for chunk-size sizing. When an exact
335
+ # counter is present, prefer its native ratio (e.g. 1.2 for
336
+ # nomic-embed-text) so truncation and estimation agree. Falls back
337
+ # to the configured ratio if the counter reports 0 or a non-positive
338
+ # value (which would make truncation target zero chars).
339
+ def effective_chars_per_token
340
+ if @token_counter.respond_to?(:chars_per_token) && @token_counter.chars_per_token
341
+ ratio = @token_counter.chars_per_token.to_f
342
+ return ratio if ratio.positive?
343
+ end
344
+ @chars_per_token
238
345
  end
239
346
 
240
347
  # Build the final AssembledContext result.
@@ -31,7 +31,7 @@ module Woods
31
31
  implement: /\b(implement|add|create|build|write|make|generate)\b/i,
32
32
  compare: /\b(compare|difference|vs|versus|between|contrast)\b/i,
33
33
  # rubocop:disable Layout/LineLength
34
- framework: /\b(how does rails|what does rails|rails .+ work|work.+\brails\b|in rails\b|activerecord|actioncontroller|activejob)\b/i,
34
+ framework: /\b(how does rails|what does rails|rails .+ work|work.+\brails\b|in rails\b|activerecord|actioncontroller|activejob|actionmailer|actioncable|actiontext|activestorage|solid_queue|solid_cache|solid_cable|kamal|propshaft|importmap|hotwire|turbo|stimulus|zeitwerk)\b/i,
35
35
  # rubocop:enable Layout/LineLength
36
36
  reference: /\b(show me|what is|what are|list|options for|api|interface|signature)\b/i,
37
37
  understand: /\b(how|why|explain|understand|what happens|describe|overview)\b/i
@@ -35,8 +35,11 @@ module Woods
35
35
  RRF_K = 60
36
36
 
37
37
  # @param metadata_store [#find] Store that resolves identifiers to unit metadata
38
- def initialize(metadata_store:)
38
+ # @param graph_store [#pagerank, nil] Optional graph store exposing PageRank scores.
39
+ # When present, PageRank rank-percentile replaces the bucketed importance signal.
40
+ def initialize(metadata_store:, graph_store: nil)
39
41
  @metadata_store = metadata_store
42
+ @graph_store = graph_store
40
43
  end
41
44
 
42
45
  # Rank candidates by weighted signal scoring with diversity adjustment.
@@ -89,8 +92,9 @@ module Woods
89
92
 
90
93
  candidates.group_by(&:source).each_value do |source_candidates|
91
94
  ranked = source_candidates.sort_by { |c| -c.score }
92
- ranked.each_with_index do |candidate, rank|
93
- rrf_scores[candidate.identifier] += 1.0 / (RRF_K + rank)
95
+ ranked.each_with_index do |candidate, idx|
96
+ # RRF is 1-based (Cormack et al., 2009): top-ranked doc uses rank 1, not 0.
97
+ rrf_scores[candidate.identifier] += 1.0 / (RRF_K + idx + 1)
94
98
  metadata_map[candidate.identifier] ||= candidate.metadata
95
99
  end
96
100
  end
@@ -102,7 +106,14 @@ module Woods
102
106
  #
103
107
  # @return [Array<Candidate>]
104
108
  def rebuild_rrf_candidates(candidates, rrf_scores, metadata_map)
105
- original_by_id = candidates.index_by(&:identifier)
109
+ # Plain-Ruby `index_by` substitute — the ActiveSupport version
110
+ # isn't loaded when the gem runs outside a Rails boot. Preserve
111
+ # last-wins semantics to match ActiveSupport's `Enumerable#index_by`
112
+ # so the merged candidate's `source` continues to reflect the
113
+ # final source a given identifier appeared in (relevant when
114
+ # observability/debug tools read `.source` on an RRF result).
115
+ original_by_id = {}
116
+ candidates.each { |c| original_by_id[c.identifier] = c }
106
117
  rrf_scores.sort_by { |_id, score| -score }.map do |identifier, score|
107
118
  original = original_by_id[identifier]
108
119
  build_candidate(
@@ -133,7 +144,7 @@ module Woods
133
144
  semantic: candidate.score.to_f,
134
145
  keyword: keyword_score(candidate),
135
146
  recency: recency_score(unit),
136
- importance: importance_score(unit),
147
+ importance: importance_score(unit, candidate.identifier),
137
148
  type_match: type_match_score(unit, classification),
138
149
  diversity: 1.0 # Adjusted after initial sort
139
150
  }
@@ -184,9 +195,18 @@ module Woods
184
195
 
185
196
  # Importance score based on PageRank / structural importance.
186
197
  #
198
+ # Prefers live PageRank from the graph store (rank-percentile 0.0–1.0) when
199
+ # available. Falls back to bucketed importance metadata (`:high`/`:medium`/`:low`)
200
+ # when there is no graph store, the PageRank map is empty, or the identifier
201
+ # is not yet indexed (e.g., a new unit since the last extraction).
202
+ #
187
203
  # @param unit [Hash, nil] Unit metadata from store
204
+ # @param identifier [String] Candidate identifier (matched against PageRank keys)
188
205
  # @return [Float] 0.0 to 1.0
189
- def importance_score(unit)
206
+ def importance_score(unit, identifier)
207
+ pagerank = pagerank_importance_map[identifier]
208
+ return pagerank if pagerank
209
+
190
210
  return 0.5 unless unit
191
211
 
192
212
  importance = dig_metadata(unit, :importance)
@@ -198,6 +218,35 @@ module Woods
198
218
  end
199
219
  end
200
220
 
221
+ # Lazily-computed rank-percentile map derived from the graph store's PageRank.
222
+ #
223
+ # Top-ranked identifier gets 1.0, bottom-ranked gets 1/n. Identifiers absent
224
+ # from PageRank (new units, ephemeral candidates) return nil and fall back
225
+ # to the bucketed importance signal.
226
+ #
227
+ # @return [Hash{String => Float}]
228
+ def pagerank_importance_map
229
+ @pagerank_importance_map ||= compute_pagerank_importance_map
230
+ end
231
+
232
+ # Compute rank-percentile scores from the graph store's PageRank hash.
233
+ #
234
+ # @return [Hash{String => Float}] Empty hash when no graph store or no scores.
235
+ def compute_pagerank_importance_map
236
+ return {} unless @graph_store.respond_to?(:pagerank)
237
+
238
+ scores = @graph_store.pagerank
239
+ return {} if scores.nil? || scores.empty?
240
+
241
+ ranked = scores.sort_by { |_id, score| -score }
242
+ total = ranked.size.to_f
243
+ ranked.each_with_index.to_h do |(identifier, _score), rank|
244
+ [identifier, 1.0 - (rank / total)]
245
+ end
246
+ rescue StandardError
247
+ {}
248
+ end
249
+
201
250
  # Type match score — bonus when result type matches query target_type.
202
251
  #
203
252
  # @param unit [Hash, nil] Unit metadata from store
@@ -56,10 +56,27 @@ module Woods
56
56
  # @param query [String] The original query text
57
57
  # @param classification [QueryClassifier::Classification] Classified query
58
58
  # @param limit [Integer] Maximum candidates to return
59
+ # @param type_filter [Array<String>, nil] When set, vector and hybrid
60
+ # strategies push this down into the vector store's metadata
61
+ # filter — used by {Retriever#retrieve} to rank-within-type when
62
+ # the unfiltered global top-K had no candidate of the requested type.
63
+ # Overrides the classifier-derived +target_type+ in filter construction.
64
+ # @param strategy [Symbol, nil] Override the classifier-selected strategy.
65
+ # {Retriever#within_type_fallback} passes +:vector+ here because the
66
+ # vector path is the only one that honors +type_filter+; if the
67
+ # classifier picked +:keyword+ / +:graph+ / +:direct+ the fallback
68
+ # would otherwise silently re-run the same strategy, get filtered to
69
+ # empty, and violate the "never empty when units exist" contract.
59
70
  # @return [ExecutionResult] Candidates with strategy metadata
60
- def execute(query:, classification:, limit: 20)
61
- strategy = select_strategy(classification)
62
- candidates = run_strategy(strategy, query: query, classification: classification, limit: limit)
71
+ def execute(query:, classification:, limit: 20, type_filter: nil, strategy: nil)
72
+ strategy ||= select_strategy(classification)
73
+ candidates = run_strategy(
74
+ strategy,
75
+ query: query,
76
+ classification: classification,
77
+ limit: limit,
78
+ type_filter: type_filter
79
+ )
63
80
 
64
81
  ExecutionResult.new(
65
82
  candidates: candidates.first(limit),
@@ -104,17 +121,18 @@ module Woods
104
121
  # @param query [String] Original query text
105
122
  # @param classification [QueryClassifier::Classification]
106
123
  # @param limit [Integer] Max results
124
+ # @param type_filter [Array<String>, nil] Pushed into vector filters
107
125
  # @return [Array<Candidate>]
108
- def run_strategy(strategy, query:, classification:, limit:)
126
+ def run_strategy(strategy, query:, classification:, limit:, type_filter: nil)
109
127
  case strategy
110
128
  when :vector
111
- execute_vector(query, classification: classification, limit: limit)
129
+ execute_vector(query, classification: classification, limit: limit, type_filter: type_filter)
112
130
  when :keyword
113
131
  execute_keyword(classification: classification, limit: limit)
114
132
  when :graph
115
133
  execute_graph(classification: classification, limit: limit)
116
134
  when :hybrid
117
- execute_hybrid(query, classification: classification, limit: limit)
135
+ execute_hybrid(query, classification: classification, limit: limit, type_filter: type_filter)
118
136
  when :direct
119
137
  execute_direct(classification: classification, limit: limit)
120
138
  end
@@ -123,9 +141,9 @@ module Woods
123
141
  # Vector strategy: embed the query and search by similarity.
124
142
  #
125
143
  # @return [Array<Candidate>]
126
- def execute_vector(query, classification:, limit:)
144
+ def execute_vector(query, classification:, limit:, type_filter: nil)
127
145
  query_vector = @embedding_provider.embed(query)
128
- filters = build_vector_filters(classification)
146
+ filters = build_vector_filters(classification, type_filter: type_filter)
129
147
 
130
148
  results = @vector_store.search(query_vector, limit: limit, filters: filters)
131
149
  results.map do |r|
@@ -209,9 +227,10 @@ module Woods
209
227
  # Hybrid strategy: combine vector, keyword, and graph expansion.
210
228
  #
211
229
  # @return [Array<Candidate>]
212
- def execute_hybrid(query, classification:, limit:)
230
+ def execute_hybrid(query, classification:, limit:, type_filter: nil)
213
231
  # Gather from all three sources
214
- vector_candidates = execute_vector(query, classification: classification, limit: limit)
232
+ vector_candidates = execute_vector(query, classification: classification, limit: limit,
233
+ type_filter: type_filter)
215
234
  keyword_candidates = execute_keyword(classification: classification, limit: limit)
216
235
 
217
236
  # Graph expansion on top vector results
@@ -266,13 +285,23 @@ module Woods
266
285
  candidates
267
286
  end
268
287
 
269
- # Build metadata filters for vector search based on classification.
288
+ # Build metadata filters for vector search based on classification
289
+ # and an optional explicit type filter from the caller.
290
+ #
291
+ # The caller's explicit +type_filter+ overrides classifier-derived
292
+ # +target_type+ when both are present — the caller opted into a
293
+ # specific set of types and that intent beats a heuristic.
270
294
  #
271
295
  # @param classification [QueryClassifier::Classification]
296
+ # @param type_filter [Array<String>, nil]
272
297
  # @return [Hash]
273
- def build_vector_filters(classification)
298
+ def build_vector_filters(classification, type_filter: nil)
274
299
  filters = {}
275
- filters[:type] = classification.target_type.to_s if classification.target_type
300
+ if type_filter && !type_filter.empty?
301
+ filters[:type] = type_filter.map(&:to_s)
302
+ elsif classification.target_type
303
+ filters[:type] = classification.target_type.to_s
304
+ end
276
305
  filters
277
306
  end
278
307