woods 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +169 -0
  3. data/README.md +20 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +15 -50
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +6 -0
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +3 -4
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +737 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +1 -1
  102. data/lib/woods/unblocked/document_builder.rb +35 -10
  103. data/lib/woods/unblocked/exporter.rb +1 -1
  104. data/lib/woods/util/host_guard.rb +61 -0
  105. data/lib/woods/version.rb +1 -1
  106. data/lib/woods.rb +126 -6
  107. metadata +69 -4
@@ -1,9 +1,15 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # Ruby 3.2 autoloads Set, but the gem supports >= 3.0 — make the require
4
+ # explicit so +filter_by_type+ works on the whole supported range.
5
+ require 'set'
6
+
3
7
  require_relative 'retrieval/query_classifier'
4
8
  require_relative 'retrieval/search_executor'
5
9
  require_relative 'retrieval/ranker'
6
10
  require_relative 'retrieval/context_assembler'
11
+ require_relative 'embedding/token_counter'
12
+ require_relative 'token_utils'
7
13
 
8
14
  module Woods
9
15
  # Retriever orchestrates the full retrieval pipeline: classify, execute,
@@ -30,26 +36,77 @@ module Woods
30
36
  # result.strategy # => :vector
31
37
  # result.tokens_used # => 4200
32
38
  #
33
- class Retriever
39
+ class Retriever # rubocop:disable Metrics/ClassLength
40
+ # BERT / WordPiece-family embedders Ollama commonly serves. Matched
41
+ # against `provider.model_name` to decide whether to use the 1.5
42
+ # chars/token ratio and wire in an exact {Woods::Embedding::TokenCounter}.
43
+ # Extend this list when new WordPiece-family models become popular —
44
+ # the tiktoken 4.0 default remains the safe fallback for unknowns.
45
+ OLLAMA_EMBEDDING_MODELS = Regexp.union(
46
+ /\Anomic-embed/, /\Abge-/, /\Amxbai-embed/,
47
+ /\Asnowflake-arctic/, /\Aall-minilm/, /\Aparaphrase-/,
48
+ /\Ae5-/, /\Agte-/, /\Astella/,
49
+ /\Agranite-embedding/, /\Ajina-embeddings/
50
+ ).freeze
51
+
34
52
  # Diagnostic trace for retrieval quality analysis.
35
53
  RetrievalTrace = Struct.new(:classification, :strategy, :candidate_count,
36
54
  :ranked_count, :tokens_used, :elapsed_ms,
37
55
  keyword_init: true)
38
56
 
39
57
  # The result of a retrieval operation.
58
+ #
59
+ # When the caller passed +types:+ to +#retrieve+, +type_rank_context+
60
+ # is a Hash keyed by requested type name with one entry per type:
61
+ #
62
+ # {
63
+ # "controller" => {
64
+ # source: :in_top_k, # see enum below
65
+ # top_of_type_global_rank: 3, # 1-based rank in unfiltered ranked, or nil
66
+ # global_k: 20, # size of the unfiltered ranked list
67
+ # total_of_type: 183 # total units of that type in the index
68
+ # }
69
+ # }
70
+ #
71
+ # +:source+ tells the caller which bucket the type landed in without
72
+ # forcing them to infer it from nil ranks:
73
+ # :in_top_k — type present in the unfiltered ranked list;
74
+ # strong match.
75
+ # :within_type_fallback — type NOT in the unfiltered ranked list, but
76
+ # the fallback vector search returned
77
+ # candidates of this type. Weak match.
78
+ # :outside_top_k — type NOT in the unfiltered ranked list, has
79
+ # units in the index, but the fallback did
80
+ # not run (other requested types filled the
81
+ # result). No results of this type.
82
+ # :absent — type has zero units in the index.
83
+ #
84
+ # Nil for unfiltered queries.
40
85
  RetrievalResult = Struct.new(:context, :sources, :classification, :strategy, :tokens_used, :budget, :trace,
41
- keyword_init: true)
86
+ :type_rank_context, keyword_init: true)
42
87
 
43
88
  # Unit types queried for the structural context overview.
44
89
  STRUCTURAL_TYPES = %w[model controller service job mailer component graphql].freeze
45
90
 
91
+ # Direct handles to the injected stores. The sub-components
92
+ # ({Retrieval::SearchExecutor}, {Retrieval::Ranker},
93
+ # {Retrieval::ContextAssembler}) hold their own references too, but those
94
+ # are implementation details — callers that want to mutate store contents
95
+ # (e.g. the MCP +reload+ tool) read through these accessors. All three
96
+ # refer to the same Ruby objects the sub-components were initialised with,
97
+ # so in-place +#clear!+ + +#bulk_load+ propagates through the entire
98
+ # pipeline without re-instantiating sub-components.
99
+ attr_reader :vector_store, :metadata_store, :graph_store
100
+
46
101
  # @param vector_store [Storage::VectorStore::Interface] Vector store adapter
47
102
  # @param metadata_store [Storage::MetadataStore::Interface] Metadata store adapter
48
103
  # @param graph_store [Storage::GraphStore::Interface] Graph store adapter
49
104
  # @param embedding_provider [Embedding::Provider::Interface] Embedding provider
50
105
  # @param formatter [#call, nil] Optional callable to post-process the context string
51
106
  def initialize(vector_store:, metadata_store:, graph_store:, embedding_provider:, formatter: nil)
107
+ @vector_store = vector_store
52
108
  @metadata_store = metadata_store
109
+ @graph_store = graph_store
53
110
  @formatter = formatter
54
111
 
55
112
  @classifier = Retrieval::QueryClassifier.new
@@ -59,41 +116,164 @@ module Woods
59
116
  graph_store: graph_store,
60
117
  embedding_provider: embedding_provider
61
118
  )
62
- @ranker = Retrieval::Ranker.new(metadata_store: metadata_store)
63
- @assembler = Retrieval::ContextAssembler.new(metadata_store: metadata_store)
119
+ @ranker = Retrieval::Ranker.new(metadata_store: metadata_store, graph_store: graph_store)
120
+ # Match truncation sizing to the embedding provider's tokenizer so
121
+ # Ollama-indexed corpora (ratio ~1.5) don't get over-truncated by
122
+ # an OpenAI-sized default (4.0). Unknown/missing providers fall
123
+ # back to the OpenAI-friendly default.
124
+ chars_per_token = infer_chars_per_token(embedding_provider)
125
+ @assembler = Retrieval::ContextAssembler.new(
126
+ metadata_store: metadata_store,
127
+ chars_per_token: chars_per_token,
128
+ token_counter: infer_token_counter(embedding_provider)
129
+ )
64
130
  end
65
131
 
132
+ # Infer the chars-per-token ratio from an embedding provider's model.
133
+ # Ollama WordPiece-style tokenizers (nomic-embed-text, bge-*,
134
+ # mxbai-embed-*, snowflake-arctic-*) run hotter on Ruby source than
135
+ # tiktoken; 1.5 is the project's calibrated value — see
136
+ # {Woods::Builder#chars_per_token_for} and docs/EMBEDDING_MODELS.md.
137
+ #
138
+ # @param provider [Object, nil]
139
+ # @return [Float]
140
+ def infer_chars_per_token(provider)
141
+ return Retrieval::ContextAssembler::DEFAULT_CHARS_PER_TOKEN unless provider.respond_to?(:model_name)
142
+
143
+ model = provider.model_name.to_s
144
+ ollama_patterns = OLLAMA_EMBEDDING_MODELS
145
+ model.match?(ollama_patterns) ? TokenUtils.chars_per_token_for(:ollama) : Retrieval::ContextAssembler::DEFAULT_CHARS_PER_TOKEN
146
+ end
147
+ private :infer_chars_per_token
148
+
149
+ # Build an exact TokenCounter for the Ollama path — where WordPiece
150
+ # ratios vary widely across Rails source, so an exact tokenizer is the
151
+ # only way to keep context-budget truncation honest. For OpenAI (and
152
+ # unknown providers) tiktoken's 4.0 ratio is stable enough that the
153
+ # heuristic fallback is fine; we skip the counter there so we don't
154
+ # pull in the optional `tokenizers` gem or warn about it at boot.
155
+ #
156
+ # @param provider [Object, nil]
157
+ # @return [Woods::Embedding::TokenCounter, nil]
158
+ def infer_token_counter(provider)
159
+ return nil unless provider.respond_to?(:model_name)
160
+
161
+ model = provider.model_name.to_s
162
+ ollama_patterns = OLLAMA_EMBEDDING_MODELS
163
+ return nil unless model.match?(ollama_patterns)
164
+
165
+ Embedding::TokenCounter.new
166
+ end
167
+ private :infer_token_counter
168
+
169
+ # Unit types excluded from retrieval by default. +test_mapping+ units
170
+ # make up ~33% of a typical index and lexically dominate semantic rank
171
+ # for production queries ("stripe webhook" often surfaces
172
+ # stripe_webhook_spec.rb above the actual controller). Callers can
173
+ # override by passing +types:+ (include-only) or an explicit +exclude_types:+.
174
+ DEFAULT_EXCLUDE_TYPES = %w[test_mapping].freeze
175
+
176
+ # Suffix the Indexer appends when a single unit is split into multiple
177
+ # embedding vectors — see {Embedding::Indexer#collect_embed_items}. The
178
+ # metadata store is keyed by the base identifier only, so the fallback
179
+ # lookup in +candidate_type+ strips this before probing. Mirrors the
180
+ # constant in {Retrieval::ContextAssembler}; kept as a local copy so the
181
+ # two consumers can evolve independently if the chunk format ever
182
+ # changes on one side of the pipeline.
183
+ CHUNK_SUFFIX_PATTERN = /#chunk_\d+\z/
184
+ private_constant :CHUNK_SUFFIX_PATTERN
185
+
66
186
  # Execute the full retrieval pipeline for a natural language query.
67
187
  #
68
- # Pipeline: classify -> execute -> rank -> assemble -> format
188
+ # Pipeline: classify -> execute -> rank -> filter -> (fallback within-type
189
+ # when filter emptied everything) -> assemble -> format.
190
+ #
191
+ # When +types:+ is set, the response carries +type_rank_context+ —
192
+ # per-type rank metadata the caller uses to tell a strong match from
193
+ # a weak one without Woods imposing a score threshold.
69
194
  #
70
195
  # @param query [String] Natural language query
71
196
  # @param budget [Integer] Token budget for context assembly
197
+ # @param types [Array<String, Symbol>, nil] If set, restrict results to these
198
+ # unit types (overrides DEFAULT_EXCLUDE_TYPES).
199
+ # @param exclude_types [Array<String, Symbol>, nil] Additional types to
200
+ # exclude. Applied on top of DEFAULT_EXCLUDE_TYPES unless +types:+ is set.
72
201
  # @return [RetrievalResult] Complete retrieval result
73
- def retrieve(query, budget: 8000)
202
+ def retrieve(query, budget: 8000, types: nil, exclude_types: nil)
74
203
  start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
75
-
76
204
  classification = @classifier.classify(query)
77
205
  execution_result = @executor.execute(query: query, classification: classification)
78
206
  ranked = @ranker.rank(execution_result.candidates, classification: classification)
79
- assembled = assemble_context(ranked, classification, budget)
80
-
81
- elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round(1)
82
207
 
83
- trace = RetrievalTrace.new(
84
- classification: classification,
85
- strategy: execution_result.strategy,
86
- candidate_count: execution_result.candidates.size,
87
- ranked_count: ranked.size,
88
- tokens_used: assembled.tokens_used,
89
- elapsed_ms: elapsed_ms
208
+ type_list = normalize_type_list(types)
209
+ filtered, fallback_ran = apply_type_filter(
210
+ ranked, query, classification, types: types, type_list: type_list, exclude_types: exclude_types
90
211
  )
212
+ type_rank_context = type_list ? build_type_rank_context(ranked, type_list, fallback_ran: fallback_ran) : nil
91
213
 
92
- build_result(assembled, classification, execution_result.strategy, budget, trace)
214
+ assembled = assemble_context(filtered, classification, budget)
215
+ trace = build_trace(classification, execution_result, filtered, assembled, start_time)
216
+
217
+ build_result(
218
+ assembled: assembled, classification: classification, strategy: execution_result.strategy,
219
+ budget: budget, trace: trace, type_rank_context: type_rank_context
220
+ )
93
221
  end
94
222
 
95
223
  private
96
224
 
225
+ # Filter ranked candidates by type, using an include-list when +types+
226
+ # is set and an exclude-list otherwise (default: +DEFAULT_EXCLUDE_TYPES+,
227
+ # extended by any +exclude_types+ the caller adds).
228
+ #
229
+ # Candidate type comes from either the metadata store (when populated)
230
+ # or the candidate's inline +metadata+ hash — both are probed so the
231
+ # filter still works on graph-expansion candidates that carry no
232
+ # vector-store metadata.
233
+ #
234
+ # @param candidates [Array<Candidate>]
235
+ # @param types [Array<String, Symbol>, nil]
236
+ # @param exclude_types [Array<String, Symbol>, nil]
237
+ # @return [Array<Candidate>]
238
+ def filter_by_type(candidates, types:, exclude_types:)
239
+ allowed = normalize_type_list(types)
240
+ return candidates.select { |c| allowed.include?(candidate_type(c)) } if allowed
241
+
242
+ excluded = (normalize_type_list(exclude_types) || Set.new) | DEFAULT_EXCLUDE_TYPES.to_set
243
+ return candidates if excluded.empty?
244
+
245
+ candidates.reject { |c| excluded.include?(candidate_type(c)) }
246
+ end
247
+
248
+ def normalize_type_list(list)
249
+ return nil if list.nil? || list.empty?
250
+
251
+ list.to_set(&:to_s)
252
+ end
253
+
254
+ def candidate_type(candidate)
255
+ inline = type_from_hash(candidate.metadata)
256
+ return inline if inline
257
+
258
+ # Fall back to the metadata store lookup so graph-expansion candidates
259
+ # (which come in with metadata: {}) still get type-filtered. Strip the
260
+ # chunk suffix first: chunked vector hits arrive with +Foo#chunk_0+
261
+ # but the store is keyed by the base identifier +Foo+ only, and a
262
+ # missed lookup would let the candidate past the default-exclude
263
+ # (type resolves to '', which +excluded+ never contains).
264
+ lookup_id = candidate.identifier.to_s.sub(CHUNK_SUFFIX_PATTERN, '')
265
+ type_from_hash(@metadata_store.find(lookup_id)) || ''
266
+ rescue StandardError
267
+ ''
268
+ end
269
+
270
+ def type_from_hash(hash)
271
+ return nil unless hash
272
+
273
+ value = hash[:type] || hash['type']
274
+ value&.to_s
275
+ end
276
+
97
277
  # Assemble token-budgeted context from ranked candidates.
98
278
  #
99
279
  # @param ranked [Array<Candidate>] Ranked search candidates
@@ -115,8 +295,9 @@ module Woods
115
295
  # @param strategy [Symbol] Search strategy used
116
296
  # @param budget [Integer] Token budget
117
297
  # @return [RetrievalResult]
118
- def build_result(assembled, classification, strategy, budget, trace = nil)
298
+ def build_result(assembled:, classification:, strategy:, budget:, trace: nil, type_rank_context: nil)
119
299
  context = @formatter ? @formatter.call(assembled.context) : assembled.context
300
+ context = append_type_rank_context(context, type_rank_context) if type_rank_context
120
301
 
121
302
  RetrievalResult.new(
122
303
  context: context,
@@ -125,14 +306,138 @@ module Woods
125
306
  strategy: strategy,
126
307
  tokens_used: assembled.tokens_used,
127
308
  budget: budget,
128
- trace: trace
309
+ trace: trace,
310
+ type_rank_context: type_rank_context
129
311
  )
130
312
  end
131
313
 
314
+ # Post-rank reject, with rank-within-type fallback when the caller
315
+ # passed a type filter and the global top-K had no candidate of the
316
+ # requested type(s). Returns +[filtered, fallback_ran]+ — the second
317
+ # element drives the :source field on type_rank_context.
318
+ def apply_type_filter(ranked, query, classification, types:, type_list:, exclude_types:)
319
+ filtered = filter_by_type(ranked, types: types, exclude_types: exclude_types)
320
+ return [filtered, false] unless type_list && filtered.empty?
321
+
322
+ [within_type_fallback(query, classification, type_list, exclude_types), true]
323
+ end
324
+
325
+ # Rank-within-type fallback query. Pushes the explicit type filter
326
+ # into the executor so the vector store only scores candidates of
327
+ # that type. Used when the global top-K had none of the requested
328
+ # types but the index may still contain them.
329
+ #
330
+ # Forces +strategy: :vector+. Only the vector path honors
331
+ # +type_filter+ — on a keyword/graph/direct-classified query the
332
+ # default strategy would ignore the filter, return the same
333
+ # candidates, and silently leave +filtered+ empty. Vector search
334
+ # works for any classification because we always have the raw
335
+ # query text.
336
+ #
337
+ # Short-circuits to an empty Array when every requested type has
338
+ # zero units in the index — there is nothing for the fallback to
339
+ # find, so we skip the extra vector search.
340
+ def within_type_fallback(query, classification, type_list, exclude_types)
341
+ type_array = type_list.to_a
342
+ return [] if type_array.all? { |t| total_of_type(t).to_i.zero? }
343
+
344
+ fallback = @executor.execute(
345
+ query: query, classification: classification,
346
+ type_filter: type_array, strategy: :vector
347
+ )
348
+ ranked = @ranker.rank(fallback.candidates, classification: classification)
349
+ filter_by_type(ranked, types: type_array, exclude_types: exclude_types)
350
+ end
351
+
352
+ def build_trace(classification, execution_result, filtered, assembled, start_time)
353
+ elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round(1)
354
+ RetrievalTrace.new(
355
+ classification: classification,
356
+ strategy: execution_result.strategy,
357
+ candidate_count: execution_result.candidates.size,
358
+ ranked_count: filtered.size,
359
+ tokens_used: assembled.tokens_used,
360
+ elapsed_ms: elapsed_ms
361
+ )
362
+ end
363
+
364
+ # Build per-type rank metadata from the unfiltered global ranked list.
365
+ #
366
+ # +top_of_type_global_rank+ is the 1-based position of the first
367
+ # candidate of that type in the ranked list, or nil when no candidate
368
+ # of that type survived ranking. +total_of_type+ is the canonical
369
+ # count from the metadata store — answers "does this type exist in the
370
+ # index at all?" independent of query match. +source+ labels the bucket
371
+ # the type landed in so the caller doesn't infer it from a nil rank;
372
+ # see the RetrievalResult docstring for the four-value enum.
373
+ #
374
+ # @param ranked [Array<Candidate>]
375
+ # @param type_list [Set<String>]
376
+ # @param fallback_ran [Boolean] Whether rank-within-type fallback ran
377
+ # @return [Hash{String => Hash}]
378
+ def build_type_rank_context(ranked, type_list, fallback_ran:)
379
+ global_k = ranked.size
380
+ type_list.to_h do |type|
381
+ match_index = ranked.index { |c| candidate_type(c) == type }
382
+ top_rank = match_index ? match_index + 1 : nil
383
+ total = total_of_type(type)
384
+ [
385
+ type,
386
+ {
387
+ source: type_source(top_rank, total, fallback_ran: fallback_ran),
388
+ top_of_type_global_rank: top_rank,
389
+ global_k: global_k,
390
+ total_of_type: total
391
+ }
392
+ ]
393
+ end
394
+ end
395
+
396
+ # Pick the :source enum value for a single type based on where its
397
+ # candidate ended up. See RetrievalResult's docstring for the enum.
398
+ def type_source(top_rank, total, fallback_ran:)
399
+ return :in_top_k if top_rank
400
+ return :absent if total.to_i.zero?
401
+ return :within_type_fallback if fallback_ran
402
+
403
+ :outside_top_k
404
+ end
405
+
406
+ def total_of_type(type)
407
+ @metadata_store.find_by_type(type).size
408
+ rescue StandardError
409
+ nil
410
+ end
411
+
412
+ # Append a compact markdown summary of +type_rank_context+ to the
413
+ # assembled context string. Machine-readable enough for agents to
414
+ # parse without a structured response channel. :source is the first
415
+ # column so the common "strong match" case (in_top_k) is visible at
416
+ # a glance without needing to reason about rank vs global_k.
417
+ def append_type_rank_context(context, type_rank_context)
418
+ return context if type_rank_context.empty?
419
+
420
+ lines = ['', '### Type rank context', '',
421
+ '| Type | Source | Rank in unfiltered top-K | Global K | Total in index |',
422
+ '|------|--------|--------------------------|----------|----------------|']
423
+ type_rank_context.each do |type, info|
424
+ rank = info[:top_of_type_global_rank] || '—'
425
+ total = info[:total_of_type].nil? ? '?' : info[:total_of_type]
426
+ lines << "| #{type} | #{info[:source]} | #{rank} | #{info[:global_k]} | #{total} |"
427
+ end
428
+ "#{context}\n#{lines.join("\n")}\n"
429
+ end
430
+
132
431
  # Build a structural context overview from the metadata store.
133
432
  #
134
- # Queries the metadata store for total unit count and counts per type,
135
- # producing a summary like "Codebase: 42 units (10 models, 5 controllers, ...)".
433
+ # Reports +searchable_entries+ (the retriever's native denominator:
434
+ # one row per vector, including per-chunk rows for long units) rather
435
+ # than +units_indexed+. The two differ because chunking duplicates
436
+ # units; see the `structure` tool's glossary for the full picture.
437
+ #
438
+ # The banner ends with a pointer to `structure` so operators who
439
+ # spot the searchable-entries vs unit-count discrepancy know which
440
+ # tool carries the canonical unit totals (issue #105).
136
441
  #
137
442
  # @return [String, nil] Overview string, or nil if the store is empty or on error
138
443
  def build_structural_context
@@ -141,10 +446,11 @@ module Woods
141
446
 
142
447
  type_counts = STRUCTURAL_TYPES.filter_map do |type|
143
448
  count = @metadata_store.find_by_type(type).size
144
- "#{count} #{type}s" if count.positive?
449
+ "#{count} #{type} entries" if count.positive?
145
450
  end
146
451
 
147
- "Codebase: #{total} units (#{type_counts.join(', ')})"
452
+ "Codebase: #{total} searchable entries (#{type_counts.join(', ')}). " \
453
+ 'Entries include per-chunk rows for chunked units; see `structure` for canonical unit counts.'
148
454
  rescue StandardError
149
455
  nil
150
456
  end
@@ -17,11 +17,45 @@ module Woods
17
17
  # Woods::SessionTracer::Middleware
18
18
  #
19
19
  class Middleware
20
+ # Full Store interface every backend (FileStore, RedisStore,
21
+ # SolidCacheStore) implements. Middleware itself only calls
22
+ # {#record} — the read-side methods are used by the session_trace
23
+ # MCP tool and other consumers. Surfaced as a constant so operators
24
+ # can assert the full interface eagerly when they want:
25
+ #
26
+ # missing = Woods::SessionTracer::Middleware::FULL_STORE_INTERFACE
27
+ # .reject { |m| store.respond_to?(m) }
28
+ # raise "incomplete store: #{missing}" unless missing.empty?
29
+ FULL_STORE_INTERFACE = %i[record read sessions clear clear_all].freeze
30
+
31
+ # Methods the middleware actually calls at request time. Validated
32
+ # at init so a half-configured store fails loudly at boot.
33
+ REQUIRED_STORE_METHODS = %i[record].freeze
34
+
20
35
  # @param app [#call] The downstream Rack application
21
- # @param store [Store] Session trace store backend
36
+ # @param store [Store] Session trace store backend. Must respond to
37
+ # `#record` (called by this middleware). Consumers that use the
38
+ # read-side ({FULL_STORE_INTERFACE}) should assert on their own
39
+ # contract; middleware does not enforce it to stay backward-
40
+ # compatible with minimal `#record`-only implementations.
22
41
  # @param session_id_proc [Proc, nil] Custom session ID extraction (receives env)
23
42
  # @param exclude_paths [Array<String>] Path prefixes to skip
43
+ # @raise [ArgumentError] if the store is nil or does not implement
44
+ # `:record`. Boot-time validation is preferable to the fire-and-
45
+ # forget rescue in {#call} silently swallowing every request trace
46
+ # when the store has the wrong shape.
24
47
  def initialize(app, store:, session_id_proc: nil, exclude_paths: [])
48
+ raise ArgumentError, 'session tracer middleware requires a store' if store.nil?
49
+
50
+ missing = REQUIRED_STORE_METHODS.reject { |m| store.respond_to?(m) }
51
+ unless missing.empty?
52
+ raise ArgumentError,
53
+ 'session tracer store is missing required methods ' \
54
+ "#{missing.inspect} (got #{store.class}). " \
55
+ "Required: #{REQUIRED_STORE_METHODS.inspect}. " \
56
+ "Full interface: #{FULL_STORE_INTERFACE.inspect}."
57
+ end
58
+
25
59
  @app = app
26
60
  @store = store
27
61
  @session_id_proc = session_id_proc
@@ -63,6 +63,22 @@ module Woods
63
63
  def pagerank(damping: 0.85, iterations: 20)
64
64
  raise NotImplementedError
65
65
  end
66
+
67
+ # Returns true iff this store is the authoritative write target for
68
+ # graph edges and survives process restart.
69
+ #
70
+ # Adapter authors must override this — the default raises so a
71
+ # write-through cache or a partially-persistent adapter can't be
72
+ # misclassified as ephemeral by omission. Boot-time rehydration
73
+ # from +dependency_graph.json+ is only valid when this returns
74
+ # +false+; durable backends own their own persistence and must be
75
+ # populated by the extraction/embed write path.
76
+ #
77
+ # @return [Boolean]
78
+ # @raise [NotImplementedError] if the adapter doesn't declare its durability
79
+ def durable?
80
+ raise NotImplementedError
81
+ end
66
82
  end
67
83
 
68
84
  # In-memory graph store wrapping the existing DependencyGraph.
@@ -78,6 +94,11 @@ module Woods
78
94
  class Memory
79
95
  include Interface
80
96
 
97
+ # The wrapped graph. Exposed so reload paths can peel the raw
98
+ # graph out of a freshly-hydrated wrapper and {#replace_graph}
99
+ # it into the live one.
100
+ attr_reader :graph
101
+
81
102
  # @param graph [DependencyGraph, nil] Existing graph to wrap, or nil to create a new one
82
103
  def initialize(graph = nil)
83
104
  @graph = graph || DependencyGraph.new
@@ -90,6 +111,17 @@ module Woods
90
111
  @graph.register(unit)
91
112
  end
92
113
 
114
+ # Replace the wrapped graph in place. Used by the MCP +reload+ tool
115
+ # so tool closures that captured this wrapper see a fresh graph
116
+ # without needing to re-instantiate the wrapper (and break the
117
+ # closure references).
118
+ #
119
+ # @param graph [DependencyGraph]
120
+ # @return [void]
121
+ def replace_graph(graph)
122
+ @graph = graph
123
+ end
124
+
93
125
  # @see Interface#dependencies_of
94
126
  def dependencies_of(identifier)
95
127
  @graph.dependencies_of(identifier)
@@ -114,6 +146,13 @@ module Woods
114
146
  def pagerank(damping: 0.85, iterations: 20)
115
147
  @graph.pagerank(damping: damping, iterations: iterations)
116
148
  end
149
+
150
+ # @see Interface#durable?
151
+ # @return [Boolean] always +false+ — the in-memory adapter is rebuilt
152
+ # on every process boot and owns none of its state across restarts.
153
+ def durable?
154
+ false
155
+ end
117
156
  end
118
157
  end
119
158
  end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Woods
4
+ class Error < StandardError; end unless defined?(Woods::Error)
5
+
6
+ module Storage
7
+ # Raised when a Snapshotter is applied to a persistent backend adapter
8
+ # (e.g. pgvector, Qdrant, SQLite). Snapshotters only operate on in-memory
9
+ # stores; persistent adapters manage their own durability.
10
+ #
11
+ # Named so that tests can assert it rather than catching a bare {Woods::Error}.
12
+ class InapplicableBackend < Woods::Error; end
13
+ end
14
+ end