woods 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +186 -0
  3. data/README.md +20 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +69 -50
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +6 -0
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +210 -0
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +100 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +771 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +163 -0
  102. data/lib/woods/unblocked/document_builder.rb +326 -0
  103. data/lib/woods/unblocked/exporter.rb +201 -0
  104. data/lib/woods/unblocked/rate_limiter.rb +94 -0
  105. data/lib/woods/util/host_guard.rb +61 -0
  106. data/lib/woods/version.rb +1 -1
  107. data/lib/woods.rb +130 -6
  108. metadata +73 -4
@@ -1,15 +1,22 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'json'
3
4
  require 'logger'
4
5
  require 'mcp'
6
+ require 'open3'
7
+ require 'time'
5
8
  require 'set'
9
+ require_relative '../tasks'
6
10
  require_relative 'index_reader'
7
11
  require_relative 'tool_response_renderer'
8
12
 
9
13
  module Woods
10
14
  module MCP
11
- # Builds an MCP::Server with 27 tools, 2 resources, and 2 resource templates for querying
12
- # Woods extraction output, managing pipelines, and collecting feedback.
15
+ # Builds an MCP::Server with up to 29 tools, 2 resources, and 2 resource templates
16
+ # for querying Woods extraction output, managing pipelines, and collecting feedback.
17
+ # 14 tools are always registered; 15 more register conditionally based on wiring:
18
+ # 5 operator tools, 4 feedback tools, 4 snapshot tools, 1 session_trace tool,
19
+ # 1 Notion sync tool.
13
20
  #
14
21
  # All tools are defined inline via closures over an IndexReader instance.
15
22
  # No Rails required at runtime — reads JSON files from disk.
@@ -27,10 +34,19 @@ module Woods
27
34
  # @param retriever [Woods::Retriever, nil] Optional retriever for semantic search
28
35
  # @param operator [Hash, nil] Optional operator config with :status_reporter, :error_escalator, :pipeline_guard, :pipeline_lock
29
36
  # @param feedback_store [Woods::Feedback::Store, nil] Optional feedback store
37
+ # @param bootstrap_state [Woods::MCP::BootstrapState, nil] Optional state
38
+ # from the bootstrap flow. When provided, woods_status reports the
39
+ # hydrated/degraded/failed lifecycle plus the reason so operators can
40
+ # diagnose "why is semantic search disabled" without reading the Ruby
41
+ # source. Nil just means the caller didn't go through Bootstrapper.
42
+ # @param warmup [Boolean] Pre-populate the index reader's caches during build,
43
+ # shifting first-tool-call latency to startup. Default: true. Pass false for
44
+ # tests or when startup time matters more than first-query latency.
30
45
  # @return [MCP::Server] Configured server ready for transport
31
46
  def build(index_dir:, retriever: nil, operator: nil, feedback_store: nil, snapshot_store: nil,
32
- response_format: nil)
47
+ bootstrap_state: nil, response_format: nil, warmup: true, retriever_reloader: nil)
33
48
  reader = IndexReader.new(index_dir)
49
+ reader.warmup! if warmup
34
50
  config = Woods.configuration
35
51
  format = response_format || (config.respond_to?(:context_format) ? config.context_format : nil) || :markdown
36
52
  renderer = ToolResponseRenderer.for(format)
@@ -39,6 +55,31 @@ module Woods
39
55
 
40
56
  # Lambda captured by all tool blocks for building responses.
41
57
  respond = method(:text_response)
58
+ respond_err = method(:error_response)
59
+ op_missing = lambda do |tool|
60
+ error_response(
61
+ 'Pipeline operator is not configured. Pass `operator:` to Woods::MCP::Server.build ' \
62
+ 'or use Woods::MCP::Bootstrapper to wire StatusReporter, ErrorEscalator, and PipelineGuard.',
63
+ code: :not_configured, config_key: 'operator',
64
+ doc_link: 'docs/OPERATOR_GUIDE.md', tool: tool
65
+ )
66
+ end
67
+ fb_missing = lambda do |tool|
68
+ error_response(
69
+ 'Feedback store is not configured. Pass `feedback_store:` to Woods::MCP::Server.build ' \
70
+ 'to enable retrieval feedback capture.',
71
+ code: :not_configured, config_key: 'feedback_store',
72
+ doc_link: 'docs/FEEDBACK_STORE.md', tool: tool
73
+ )
74
+ end
75
+ snap_missing = lambda do |tool|
76
+ error_response(
77
+ 'Snapshot store is not configured. Set `enable_snapshots: true` in Woods.configure ' \
78
+ 'and pass `snapshot_store:` to Woods::MCP::Server.build.',
79
+ code: :not_configured, config_key: 'enable_snapshots',
80
+ doc_link: 'docs/TEMPORAL_SNAPSHOTS.md', tool: tool
81
+ )
82
+ end
42
83
 
43
84
  server = ::MCP::Server.new(
44
85
  name: 'woods',
@@ -47,8 +88,8 @@ module Woods
47
88
  resource_templates: resource_templates
48
89
  )
49
90
 
50
- define_lookup_tool(server, reader, respond, renderer)
51
- define_search_tool(server, reader, respond, renderer)
91
+ define_lookup_tool(server, reader, respond, respond_err, renderer)
92
+ define_search_tool(server, reader, respond, respond_err, renderer)
52
93
  define_traversal_tool(server, reader, respond, renderer,
53
94
  name: 'dependencies',
54
95
  description: 'Traverse forward dependencies of a unit (what it depends on). Returns a BFS tree with depth.',
@@ -61,17 +102,25 @@ module Woods
61
102
  render_key: :dependents)
62
103
  define_structure_tool(server, reader, respond, renderer)
63
104
  define_graph_analysis_tool(server, reader, respond, renderer)
105
+ define_domain_clusters_tool(server, reader, respond, renderer)
64
106
  define_pagerank_tool(server, reader, respond, renderer)
65
107
  define_framework_tool(server, reader, respond, renderer)
66
108
  define_recent_changes_tool(server, reader, respond, renderer)
67
- define_reload_tool(server, reader, respond)
68
- define_retrieve_tool(server, retriever, respond)
69
- define_trace_flow_tool(server, reader, index_dir, respond, renderer)
70
- define_session_trace_tool(server, reader, respond)
71
- define_operator_tools(server, operator, respond)
72
- define_feedback_tools(server, feedback_store, respond)
73
- define_snapshot_tools(server, snapshot_store, respond)
74
- define_notion_sync_tool(server, reader, index_dir, respond)
109
+ define_reload_tool(server, reader, respond, retriever_reloader)
110
+ define_retrieve_tool(server, retriever, respond, respond_err)
111
+ define_trace_flow_tool(server, reader, index_dir, respond, respond_err, renderer)
112
+ # Conditionally register collaborator-dependent tools. Historically
113
+ # all 15 stubs were registered unconditionally and returned
114
+ # isError: true when the wiring was missing — that added token
115
+ # noise to every LLM turn's tool catalog and invited the model to
116
+ # try tools guaranteed to fail. Only register when the collaborator
117
+ # is wired, so tools/list reflects what the server can actually do.
118
+ define_session_trace_tool(server, reader, respond, respond_err) if session_tracer_wired?
119
+ define_operator_tools(server, operator, respond, respond_err, op_missing) if operator
120
+ define_feedback_tools(server, feedback_store, respond, respond_err, fb_missing) if feedback_store
121
+ define_snapshot_tools(server, snapshot_store, respond, respond_err, snap_missing) if snapshot_store
122
+ define_notion_sync_tool(server, reader, index_dir, respond, respond_err) if notion_wired?
123
+ define_woods_status_tool(server, reader, retriever, index_dir, bootstrap_state, respond)
75
124
  register_resource_handler(server, reader)
76
125
 
77
126
  server
@@ -79,10 +128,67 @@ module Woods
79
128
 
80
129
  private
81
130
 
131
# Session tracer requires a configured session_store on Woods.configuration.
# The tool reads the store inside its handler; skipping registration when
# the store is absent keeps tools/list honest.
#
# Besides `:read` (the only method the `session_trace` handler calls), we
# also probe `:sessions` as a cheap defense-in-depth contract check: every
# shipped store (File/Redis/SolidCache) implements both, so a store missing
# `:sessions` is almost certainly missing `:read` too — better to drop the
# tool at wire-up than fail on first invocation. A record-only store
# (permitted by the middleware for backward compatibility) correctly falls
# out of tools/list here.
def session_tracer_wired?
  config = Woods.configuration
  return false if config.nil? || !config.respond_to?(:session_store)

  store = config.session_store
  !store.nil? && %i[read sessions].all? { |msg| store.respond_to?(msg) }
end
153
+
154
# Notion export needs both an API token and at least one database ID.
# NOTION_API_TOKEN env var overrides the config token (see
# docs/NOTION_EXPORT.md).
#
# Fix: a blank env var (`NOTION_API_TOKEN=""`) is treated as unset instead
# of masking a valid config token — previously `ENV[...] || config_token`
# kept the truthy empty string and wrongly reported Notion as unwired.
# Also returns a strict boolean (predicate convention) rather than nil.
#
# @return [Boolean] true when both a non-empty token and at least one
#   database ID are available.
def notion_wired?
  config = Woods.configuration
  return false unless config

  env_token = ENV['NOTION_API_TOKEN']
  env_token = nil if env_token && env_token.empty?
  token = env_token || (config.respond_to?(:notion_api_token) ? config.notion_api_token : nil)
  ids = config.respond_to?(:notion_database_ids) ? config.notion_database_ids : nil
  !!(token && !token.empty? && ids && !ids.empty?)
end
165
+
82
166
# Wrap plain text in a single text-content MCP tool response.
def text_response(text)
  content = [{ type: 'text', text: text }]
  ::MCP::Tool::Response.new(content)
end
85
169
 
170
# Build a structured error response that carries machine-readable
# metadata alongside the human-readable text. Agents can branch on
# `_meta.error_code` (e.g. `:not_configured`, `:not_found`,
# `:rate_limited`, `:unsupported_argument`) without parsing the text.
#
# @param message [String] Human-readable explanation
# @param code [Symbol] Stable error code (machine-readable)
# @param config_key [String, nil] Offending configuration key when relevant
# @param doc_link [String, nil] Relative docs path explaining the fix
# @param extra [Hash] Additional meta fields (e.g., identifier:, tool:)
def error_response(message, code:, config_key: nil, doc_link: nil, **extra)
  meta = { error_code: code }
  { config_key: config_key, doc_link: doc_link }.each do |field, value|
    meta[field] = value if value
  end
  meta.update(extra) if extra.any?
  ::MCP::Tool::Response.new(
    [{ type: 'text', text: message }],
    error: true,
    meta: meta
  )
end
191
+
86
192
  def truncate_section(array, limit)
87
193
  return array unless array.is_a?(Array)
88
194
 
@@ -107,14 +213,55 @@ module Woods
107
213
  value.is_a?(String) ? [value] : value
108
214
  end
109
215
 
110
- # Coerce a value to an Integer. Converts String representations
111
- # to Integer; leaves existing Integers and nil unchanged.
112
- # MCP clients may send "2" (string) instead of 2 (integer).
216
+ # Coerce a value to an Integer.
113
217
  #
114
- # @param value [String, Integer, nil] The input value
218
+ # - `nil` passes through unchanged.
219
+ # - `Integer` passes through unchanged.
220
+ # - `String` is accepted iff it represents a decimal integer with an
221
+ # optional leading `+`/`-`. `"abc"` and `"1abc"` used to silently
222
+ # coerce to `0` via `String#to_i`; that was a footgun for tools with
223
+ # integer bounds (limit, offset, budget, timeout) — they'd receive
224
+ # the wrong value without any feedback to the client. Now we raise
225
+ # `ArgumentError` so the MCP dispatch layer can surface a proper
226
+ # JSON-RPC error back to the caller.
227
+ # - Any other type raises `ArgumentError`.
228
+ #
229
+ # @param value [String, Integer, nil]
115
230
  # @return [Integer, nil]
231
+ # @raise [ArgumentError] if `value` is not nil, Integer, or an Integer-shaped String.
232
+ INTEGER_STRING = /\A[+-]?\d+\z/
233
+ private_constant :INTEGER_STRING
116
234
# Coerce a value to an Integer.
#
# nil and Integer pass through unchanged. A String is accepted iff it is a
# decimal integer with an optional leading +/- sign; anything else (a
# malformed string like "1abc", a Float, a Symbol, ...) raises
# ArgumentError so the MCP dispatch layer can surface a proper JSON-RPC
# error instead of silently coercing to 0 via String#to_i.
#
# @param value [String, Integer, nil]
# @return [Integer, nil]
# @raise [ArgumentError] unless value is nil, Integer, or an Integer-shaped String.
def coerce_integer(value)
  return nil if value.nil?
  return value if value.is_a?(Integer)

  unless value.is_a?(String) && value.match?(/\A[+-]?\d+\z/)
    raise ArgumentError, "expected integer, got #{value.class}: #{value.inspect}"
  end

  Integer(value, 10)
end
242
+
243
# Load a precomputed flow document written by FlowPrecomputer, when
# `config.precompute_flows` was enabled during extraction. Returns nil
# when the entry point is missing a method suffix, the JSON file isn't
# on disk, or the file can't be parsed — callers fall back to
# FlowAssembler.
#
# @param index_dir [String]
# @param entry_point [String] e.g., "PostsController#create"
# @return [Woods::FlowDocument, nil]
def load_precomputed_flow(index_dir, entry_point)
  return nil unless entry_point.to_s.include?('#')

  controller, action = entry_point.split('#', 2)
  return nil if [controller, action].any?(&:empty?)

  path = File.join(index_dir, 'flows',
                   "#{controller.gsub('::', '__')}_#{action}.json")
  return nil unless File.exist?(path)

  Woods::FlowDocument.from_h(JSON.parse(File.read(path)))
rescue JSON::ParserError, Errno::ENOENT
  nil
end
119
266
 
120
267
  # Apply offset+limit pagination to a single section key within a container hash.
@@ -138,26 +285,40 @@ module Woods
138
285
  container["#{key}_offset"] = offset if offset.positive?
139
286
  end
140
287
 
141
- def define_lookup_tool(server, reader, respond, renderer)
288
+ def define_lookup_tool(server, reader, respond, respond_err, renderer)
142
289
  coerce = method(:coerce_array)
143
290
  server.define_tool(
144
291
  name: 'lookup',
145
292
  description: 'Look up a code unit by its exact identifier. Returns full source code, metadata, ' \
146
293
  'dependencies, and dependents. Use include_source: false to omit source_code. ' \
147
- 'Use sections to select specific keys (type, identifier, file_path, namespace are always included).',
294
+ 'Use sections to select specific keys (type, identifier, file_path, namespace are always included). ' \
295
+ '`name` is accepted as an alias for `identifier` for discoverability.',
148
296
  input_schema: {
149
297
  properties: {
150
298
  identifier: { type: 'string',
151
299
  description: 'Exact unit identifier (e.g. "Post", "PostsController", "Api::V1::HealthController")' },
300
+ name: { type: 'string', description: 'Alias for `identifier`. Either one works.' },
152
301
  include_source: { type: 'boolean', description: 'Include source_code in response (default: true)' },
153
302
  sections: {
154
303
  type: 'array', items: { type: 'string' },
155
304
  description: 'Select specific keys to return (e.g. ["metadata", "dependencies"]). Always includes type, identifier, file_path, namespace.'
156
305
  }
157
- },
158
- required: ['identifier']
306
+ }
307
+ # NOTE: 'identifier' is not listed as required — `name` is an
308
+ # accepted alias. The handler validates that one of the two
309
+ # was provided.
159
310
  }
160
- ) do |identifier:, server_context:, include_source: nil, sections: nil|
311
+ ) do |server_context:, identifier: nil, name: nil, include_source: nil, sections: nil|
312
+ identifier ||= name
313
+ if identifier.nil? || identifier.empty?
314
+ next respond_err.call(
315
+ 'lookup requires `identifier` (or its alias `name`).',
316
+ code: :unsupported_argument,
317
+ tool: 'lookup',
318
+ argument: 'identifier',
319
+ hint: 'Pass identifier: "PostsController" (or name: "PostsController").'
320
+ )
321
+ end
161
322
  sections = coerce.call(sections)
162
323
  unit = reader.find_unit(identifier)
163
324
  if unit
@@ -170,47 +331,87 @@ module Woods
170
331
  end
171
332
  respond.call(renderer.render(:lookup, filtered))
172
333
  else
173
- respond.call("Unit not found: #{identifier}")
334
+ respond_err.call(
335
+ "Unit not found: #{identifier}",
336
+ code: :not_found,
337
+ identifier: identifier,
338
+ tool: 'lookup',
339
+ hint: 'Use `search` to find identifiers by pattern, then `lookup` on the exact match.'
340
+ )
174
341
  end
175
342
  end
176
343
  end
177
344
 
178
# Register the `search` tool: regex (or literal prefix/suffix) matching over
# unit identifiers, source, and metadata. Requires at least one of query /
# exact_prefix / exact_suffix; otherwise returns a structured
# :unsupported_argument error.
def define_search_tool(server, reader, respond, respond_err, renderer)
  to_array = method(:coerce_array)
  to_int = method(:coerce_integer)
  blank = ->(str) { str.nil? || str.empty? }
  server.define_tool(
    name: 'search',
    description: 'Find code units whose identifiers (or source/metadata) match a regex. ' \
                 'Example: search("Worker|Job") returns all workers and jobs; search("^Post") ' \
                 'returns units starting with "Post". Returns [{identifier, type, match_field}]. ' \
                 'Use `lookup` for exact identifiers, `dependencies`/`dependents` for graph traversal. ' \
                 'Gotchas: query is a Ruby regex — literal pipe needs escaping as \\|; ' \
                 'types restricts which index directories are scanned (e.g. ["mailer"] scans only ' \
                 'the mailers dir); invalid regex falls back to literal match. ' \
                 'For plain prefix/suffix matching on namespaces, prefer exact_prefix / exact_suffix ' \
                 '(literal, case-insensitive) over escaping regex anchors.',
    input_schema: {
      properties: {
        query: { type: 'string', description: 'Case-insensitive Ruby regex pattern (e.g. "Worker|Job", "^Post", ".*Service$")' },
        types: {
          type: 'array', items: { type: 'string' },
          description: 'Restrict scan to these unit types: model, controller, service, job, mailer, etc.'
        },
        fields: {
          type: 'array', items: { type: 'string' },
          description: 'Fields to search: identifier (default), source_code, metadata'
        },
        limit: { type: 'integer', description: 'Maximum results (default: 20)' },
        exact_prefix: {
          type: 'string',
          description: 'Literal (non-regex) case-insensitive identifier prefix filter. ' \
                       'Use for namespace scoping like "Next::Settings::" without escaping regex metacharacters.'
        },
        exact_suffix: {
          type: 'string',
          description: 'Literal (non-regex) case-insensitive identifier suffix filter. ' \
                       'Use for suffix matching like "Controller" without escaping regex metacharacters.'
        }
      }
    }
  ) do |server_context:, query: nil, types: nil, fields: nil, limit: nil, exact_prefix: nil, exact_suffix: nil|
    # At least one selector must be present; reject with a structured error
    # so agents can branch on _meta.error_code.
    if [query, exact_prefix, exact_suffix].all?(&blank)
      next respond_err.call(
        'search requires `query` or at least one of `exact_prefix` / `exact_suffix`.',
        code: :unsupported_argument,
        tool: 'search',
        argument: 'query',
        hint: 'Pass query: "Worker|Job" for regex matching, or exact_prefix: "Next::Settings::" for literal prefix scoping.'
      )
    end
    outcome = reader.search(
      query,
      types: to_array.call(types),
      fields: to_array.call(fields) || %w[identifier],
      limit: to_int.call(limit) || 20,
      exact_prefix: exact_prefix,
      exact_suffix: exact_suffix
    )
    matches = outcome[:results]
    payload = {
      query: query,
      result_count: matches.size,
      results: matches
    }
    payload[:note] = outcome[:note] if outcome[:note]
    payload[:partial] = true if outcome[:partial]
    respond.call(renderer.render(:search, payload))
  end
end
216
417
 
@@ -227,14 +428,23 @@ module Woods
227
428
  types: {
228
429
  type: 'array', items: { type: 'string' },
229
430
  description: 'Filter to these types'
431
+ },
432
+ via: {
433
+ type: 'array', items: { type: 'string' },
434
+ description: 'Filter by relationship type. Accepts either a single string ' \
435
+ "(e.g. 'code_reference') or an array " \
436
+ "(e.g. ['code_reference','render']); both forms are coerced to an array internally. " \
437
+ 'Known values: link_to, redirect_to, form_action, render, code_reference, ' \
438
+ 'belongs_to, has_many, has_one, has_and_belongs_to_many.'
230
439
  }
231
440
  },
232
441
  required: ['identifier']
233
442
  }
234
- ) do |identifier:, server_context:, depth: nil, types: nil|
443
+ ) do |identifier:, server_context:, depth: nil, types: nil, via: nil|
235
444
  types = coerce.call(types)
445
+ via = coerce.call(via)
236
446
  depth = coerce_int.call(depth)
237
- result = reader.send(reader_method, identifier, depth: depth || 2, types: types)
447
+ result = reader.send(reader_method, identifier, depth: depth || 2, types: types, via: via)
238
448
  if result[:found] == false
239
449
  result[:message] =
240
450
  "Identifier '#{identifier}' not found in the index. Use 'search' to find valid identifiers."
@@ -256,7 +466,7 @@ module Woods
256
466
  }
257
467
  }
258
468
  ) do |server_context:, detail: nil|
259
- result = { manifest: reader.manifest }
469
+ result = { manifest: reader.manifest, template_engines: reader.template_engines }
260
470
  result[:summary] = reader.summary if (detail || 'summary') == 'full'
261
471
  respond.call(renderer.render(:structure, result))
262
472
  end
@@ -306,6 +516,39 @@ module Woods
306
516
  end
307
517
  end
308
518
 
519
# Register the `domain_clusters` tool: namespace/connectivity clustering
# over the dependency graph via Woods::GraphAnalyzer.
def define_domain_clusters_tool(server, reader, respond, renderer)
  to_array = method(:coerce_array)
  to_int = method(:coerce_integer)
  server.define_tool(
    name: 'domain_clusters',
    description: 'Group code units into semantic domains by namespace and graph connectivity. ' \
                 'Returns clusters with hub nodes, entry points, boundary edges, and type breakdowns. ' \
                 'Useful for understanding architectural domains and blast radius.',
    input_schema: {
      properties: {
        min_size: {
          type: 'integer',
          description: 'Minimum units per cluster before merging into neighbors (default: 3)'
        },
        types: {
          type: 'array', items: { type: 'string' },
          description: 'Filter to these unit types (default: all). Example: ["model", "service", "job"]'
        }
      }
    }
  ) do |server_context:, min_size: nil, types: nil|
    cluster_floor = to_int.call(min_size) || 3
    type_filter = to_array.call(types)

    analyzer = Woods::GraphAnalyzer.new(reader.dependency_graph)
    clusters = analyzer.domain_clusters(min_size: cluster_floor, types: type_filter)

    respond.call(renderer.render(:domain_clusters, { clusters: clusters, total: clusters.size }))
  end
end
551
+
309
552
  def define_pagerank_tool(server, reader, respond, renderer)
310
553
  coerce = method(:coerce_array)
311
554
  coerce_int = method(:coerce_integer)
@@ -400,56 +643,127 @@ module Woods
400
643
  end
401
644
  end
402
645
 
403
- def define_reload_tool(server, reader, respond)
646
+ def define_reload_tool(server, reader, respond, retriever_reloader)
404
647
  server.define_tool(
405
648
  name: 'reload',
406
- description: 'Reload extraction data from disk. Use after re-running extraction to pick up changes ' \
407
- 'without restarting the server.',
649
+ description: 'Reload extraction data from disk. Use after re-running extraction or woods:embed to pick ' \
650
+ 'up changes without restarting the server. Refreshes the JSON index (manifest, dependency ' \
651
+ 'graph, unit cache) AND re-hydrates the retriever\'s in-memory vector/metadata/graph ' \
652
+ 'stores from the latest dumps. Durable backends (pgvector, Qdrant) are auto-refreshed ' \
653
+ 'externally — their counts in the response reflect the read-through state.',
408
654
  input_schema: { type: 'object', properties: {} }
409
655
  ) do |server_context:|
410
656
  reader.reload!
411
657
  manifest = reader.manifest
412
- respond.call(JSON.pretty_generate({
413
- reloaded: true,
414
- extracted_at: manifest['extracted_at'],
415
- total_units: manifest['total_units'],
416
- counts: manifest['counts']
417
- }))
658
+ payload = {
659
+ reloaded: true,
660
+ extracted_at: manifest['extracted_at'],
661
+ total_units: manifest['total_units'],
662
+ counts: manifest['counts']
663
+ }
664
+ if retriever_reloader
665
+ begin
666
+ payload[:retriever] = retriever_reloader.call
667
+ rescue StandardError => e
668
+ payload[:retriever] = { error: "#{e.class}: #{e.message}" }
669
+ end
670
+ end
671
+ respond.call(JSON.pretty_generate(payload))
418
672
  end
419
673
  end
420
674
 
421
- def define_retrieve_tool(server, retriever, respond)
675
+ def define_retrieve_tool(server, retriever, respond, respond_err)
422
676
  coerce_int = method(:coerce_integer)
677
+ coerce = method(:coerce_array)
423
678
  server.define_tool(
424
679
  name: 'codebase_retrieve',
425
- description: 'Retrieve relevant codebase context for a natural language query using semantic search. ' \
426
- 'Returns ranked code units assembled into a token-budgeted context string.',
680
+ description: 'Semantic search: retrieve relevant code units for a natural-language question. ' \
681
+ 'Example: codebase_retrieve("how does billing work?") returns ranked source context. ' \
682
+ 'Returns a token-budgeted context string ready to paste into a prompt. ' \
683
+ 'Use `search` for exact name/pattern matching; use this for conceptual questions. ' \
684
+ 'Requires an embedding provider — disabled if OPENAI_API_KEY is unset and Ollama is unreachable. ' \
685
+ 'By default excludes test_mappings (~33% of a typical index) so spec filenames do not ' \
686
+ 'dominate semantic rank; pass types: ["test_mapping"] to opt back in. ' \
687
+ 'Parameter: use `budget` for the token budget (not `limit` — that means result count ' \
688
+ 'on sibling tools, and mapping it here would silently produce a near-empty response).',
427
689
  input_schema: {
428
690
  properties: {
429
691
  query: { type: 'string',
430
- description: 'Natural language query (e.g. "How does user authentication work?")' },
431
- budget: { type: 'integer', description: 'Token budget for context assembly (default: 8000)' }
692
+ description: 'Natural language question (e.g. "How does user authentication work?")' },
693
+ budget: { type: 'integer',
694
+ description: 'Token budget for context assembly (default: 8000).' },
695
+ types: {
696
+ type: 'array', items: { type: 'string' },
697
+ description: 'Restrict results to these unit types (model, controller, service, job, mailer, ' \
698
+ 'rails_source, test_mapping, etc.). Overrides the default test_mapping exclusion. ' \
699
+ 'When the unfiltered top-K has no candidate of a requested type, the retriever ' \
700
+ 'falls back to rank-within-type so the response is populated whenever units of ' \
701
+ 'the requested type exist in the index. The response appends a "Type rank ' \
702
+ 'context" table with per-type: source, rank in unfiltered top-K, global_k, ' \
703
+ 'total_of_type. Read source to tell the cases apart: in_top_k (strong match), ' \
704
+ 'within_type_fallback (weak match surfaced by the fallback), outside_top_k ' \
705
+ '(index has this type but other requested types filled the result), absent ' \
706
+ '(zero units of this type in the index).'
707
+ },
708
+ exclude_types: {
709
+ type: 'array', items: { type: 'string' },
710
+ description: 'Additional types to exclude on top of the default test_mapping exclusion.'
711
+ }
432
712
  },
433
713
  required: ['query']
434
714
  }
435
- ) do |query:, server_context:, budget: nil|
715
+ ) do |query:, server_context:, budget: nil, limit: nil, types: nil, exclude_types: nil|
716
+ # `limit` isn't declared in the schema but clients still send it
717
+ # because sibling tools (search, recent_changes, pagerank) use
718
+ # `limit` as a result count. Mapping it to `budget` here would
719
+ # silently produce a near-empty response (limit: 10 → 10-token
720
+ # budget). Surface a helpful typed error instead.
721
+ unless limit.nil?
722
+ next respond_err.call(
723
+ 'codebase_retrieve uses `budget` (token budget, default 8000), not `limit`. ' \
724
+ '`limit` is the result-count parameter on sibling tools (search, recent_changes, pagerank). ' \
725
+ "Pass `budget: #{coerce_int.call(limit)}` if you meant a #{coerce_int.call(limit)}-token context, " \
726
+ 'or drop the kwarg entirely for the default 8000.',
727
+ code: :unsupported_argument,
728
+ tool: 'codebase_retrieve',
729
+ argument: 'limit',
730
+ hint: 'Use `budget:` for tokens. Retrieval does not cap by result count — the token budget ' \
731
+ 'governs how many ranked units fit in the returned context.'
732
+ )
733
+ end
734
+
436
735
  budget = coerce_int.call(budget)
736
+ types = coerce.call(types)
737
+ exclude_types = coerce.call(exclude_types)
437
738
  if retriever
438
- result = retriever.retrieve(query, budget: budget || 8000)
739
+ result = retriever.retrieve(
740
+ query,
741
+ budget: budget || 8000,
742
+ types: types,
743
+ exclude_types: exclude_types
744
+ )
439
745
  respond.call(result.context)
440
746
  else
441
- respond.call(
442
- 'Semantic search is not available. Embedding provider is not configured. ' \
443
- 'Use the search tool for pattern-based search instead.'
747
+ respond_err.call(
748
+ 'Semantic search is disabled no embedding provider is configured. ' \
749
+ 'To enable: set OPENAI_API_KEY, or run Ollama locally ' \
750
+ '(brew install ollama && ollama serve && ollama pull nomic-embed-text). ' \
751
+ 'Use the `search` tool for pattern-based matching in the meantime.',
752
+ code: :not_configured,
753
+ config_key: 'embedding_provider',
754
+ doc_link: 'docs/RETRIEVAL_GUIDE.md#configuring-retrieval',
755
+ tool: 'codebase_retrieve'
444
756
  )
445
757
  end
446
758
  end
447
759
  end
448
760
 
449
- def define_trace_flow_tool(server, reader, index_dir, respond, renderer)
761
+ def define_trace_flow_tool(server, reader, index_dir, respond, respond_err, renderer)
450
762
  require_relative '../flow_assembler'
763
+ require_relative '../flow_document'
451
764
  require_relative '../dependency_graph'
452
765
  coerce_int = method(:coerce_integer)
766
+ load_precomputed = method(:load_precomputed_flow)
453
767
 
454
768
  server.define_tool(
455
769
  name: 'trace_flow',
@@ -469,21 +783,33 @@ module Woods
469
783
  }
470
784
  ) do |entry_point:, server_context:, depth: nil|
471
785
  max_depth = coerce_int.call(depth) || 3
472
- graph = reader.dependency_graph
473
786
 
474
- assembler = Woods::FlowAssembler.new(
475
- graph: graph,
476
- extracted_dir: index_dir
477
- )
478
- flow_doc = assembler.assemble(entry_point, max_depth: max_depth)
787
+ # Prefer the precomputed flow JSON written by FlowPrecomputer during
788
+ # extraction (gated on `config.precompute_flows`) — it avoids
789
+ # re-parsing source on every request. Fall back to query-time
790
+ # reassembly when no precomputed document exists.
791
+ flow_doc = load_precomputed.call(index_dir, entry_point)
792
+ flow_doc ||= begin
793
+ graph = reader.dependency_graph
794
+ assembler = Woods::FlowAssembler.new(graph: graph, extracted_dir: index_dir)
795
+ assembler.assemble(entry_point, max_depth: max_depth)
796
+ end
479
797
 
480
798
  respond.call(renderer.render(:trace_flow, flow_doc.to_h))
481
799
  rescue StandardError => e
482
- respond.call(JSON.pretty_generate({ error: e.message }))
800
+ # Emit an MCP error so clients can detect the failure and
801
+ # surface it, rather than wrapping the error payload in a
802
+ # successful response — consistent with session_trace and
803
+ # codebase_retrieve.
804
+ respond_err.call(
805
+ "trace_flow failed: #{e.message}",
806
+ code: :internal_error,
807
+ data: { entry_point: entry_point, exception: e.class.name }
808
+ )
483
809
  end
484
810
  end
485
811
 
486
- def define_session_trace_tool(server, reader, respond)
812
+ def define_session_trace_tool(server, reader, respond, respond_err)
487
813
  coerce_int = method(:coerce_integer)
488
814
  server.define_tool(
489
815
  name: 'session_trace',
@@ -500,7 +826,16 @@ module Woods
500
826
  budget = coerce_int.call(budget)
501
827
  depth = coerce_int.call(depth)
502
828
  store = Woods.configuration.session_store
503
- next respond.call(JSON.pretty_generate({ error: 'Session tracer not configured' })) unless store
829
+ unless store
830
+ next respond_err.call(
831
+ 'Session tracer is not configured. Assign `session_store` (FileStore, RedisStore, or SolidCacheStore) ' \
832
+ 'and set `session_tracer_enabled = true` in Woods.configure.',
833
+ code: :not_configured,
834
+ config_key: 'session_store',
835
+ doc_link: 'docs/SESSION_TRACER.md',
836
+ tool: 'session_trace'
837
+ )
838
+ end
504
839
 
505
840
  require_relative '../session_tracer/session_flow_assembler'
506
841
 
@@ -510,26 +845,31 @@ module Woods
510
845
  doc = assembler.assemble(session_id, budget: budget || 8000, depth: depth || 1)
511
846
  respond.call(doc.to_markdown)
512
847
  rescue StandardError => e
513
- respond.call(JSON.pretty_generate({ error: e.message }))
848
+ respond_err.call(
849
+ "Session trace failed: #{e.message}",
850
+ code: :internal_error,
851
+ tool: 'session_trace',
852
+ session_id: session_id
853
+ )
514
854
  end
515
855
  end
516
856
 
517
- def define_operator_tools(server, operator, respond)
518
- define_pipeline_extract_tool(server, operator, respond)
519
- define_pipeline_embed_tool(server, operator, respond)
520
- define_pipeline_status_tool(server, operator, respond)
521
- define_pipeline_diagnose_tool(server, operator, respond)
522
- define_pipeline_repair_tool(server, operator, respond)
857
+ def define_operator_tools(server, operator, respond, respond_err, op_missing)
858
+ define_pipeline_extract_tool(server, operator, respond, respond_err, op_missing)
859
+ define_pipeline_embed_tool(server, operator, respond, respond_err, op_missing)
860
+ define_pipeline_status_tool(server, operator, respond, respond_err, op_missing)
861
+ define_pipeline_diagnose_tool(server, operator, respond, respond_err, op_missing)
862
+ define_pipeline_repair_tool(server, operator, respond, respond_err, op_missing)
523
863
  end
524
864
 
525
- def define_feedback_tools(server, feedback_store, respond)
526
- define_retrieval_rate_tool(server, feedback_store, respond)
527
- define_retrieval_report_gap_tool(server, feedback_store, respond)
528
- define_retrieval_explain_tool(server, feedback_store, respond)
529
- define_retrieval_suggest_tool(server, feedback_store, respond)
865
+ def define_feedback_tools(server, feedback_store, respond, _respond_err, fb_missing)
866
+ define_retrieval_rate_tool(server, feedback_store, respond, fb_missing)
867
+ define_retrieval_report_gap_tool(server, feedback_store, respond, fb_missing)
868
+ define_retrieval_explain_tool(server, feedback_store, respond, fb_missing)
869
+ define_retrieval_suggest_tool(server, feedback_store, respond, fb_missing)
530
870
  end
531
871
 
532
- def define_pipeline_extract_tool(server, operator, respond)
872
+ def define_pipeline_extract_tool(server, operator, respond, respond_err, op_missing)
533
873
  server.define_tool(
534
874
  name: 'pipeline_extract',
535
875
  description: 'Trigger a codebase extraction pipeline run. Checks rate limits before proceeding.',
@@ -539,11 +879,31 @@ module Woods
539
879
  }
540
880
  }
541
881
  ) do |server_context:, incremental: nil|
542
- next respond.call('Pipeline operator is not configured.') unless operator
882
+ next op_missing.call('pipeline_extract') unless operator
543
883
 
544
884
  guard = operator[:pipeline_guard]
545
- next respond.call('Extraction is rate-limited. Try again later.') if guard && !guard.allow?(:extraction)
885
+ if guard && !guard.allow?(:extraction)
886
+ next respond_err.call(
887
+ 'Extraction is rate-limited. Try again later.',
888
+ code: :rate_limited,
889
+ tool: 'pipeline_extract',
890
+ retry_after_seconds: 300
891
+ )
892
+ end
546
893
 
894
+ # Acquire the in-process lock BEFORE recording to the guard.
895
+ # Otherwise a refused "already running" request still resets
896
+ # the cooldown clock and blocks the next legitimate attempt
897
+ # for the full 5-minute window once the current run finishes.
898
+ unless Woods::MCP::Server.send(:pipeline_start, :extraction)
899
+ next respond_err.call(
900
+ 'Extraction pipeline is already running. Wait for it to complete.',
901
+ code: :already_running,
902
+ tool: 'pipeline_extract'
903
+ )
904
+ end
905
+
906
+ # Lock acquired — now it's safe to record the run.
547
907
  guard&.record!(:extraction)
548
908
 
549
909
  Thread.new do
@@ -554,6 +914,8 @@ module Woods
554
914
  rescue StandardError => e
555
915
  logger = defined?(Rails) ? Rails.logger : Logger.new($stderr)
556
916
  logger.error("[Woods] Pipeline extract failed: #{e.message}")
917
+ ensure
918
+ Woods::MCP::Server.send(:pipeline_finish, :extraction)
557
919
  end
558
920
 
559
921
  respond.call(JSON.pretty_generate({
@@ -563,7 +925,7 @@ module Woods
563
925
  end
564
926
  end
565
927
 
566
- def define_pipeline_embed_tool(server, operator, respond)
928
+ def define_pipeline_embed_tool(server, operator, respond, respond_err, op_missing)
567
929
  server.define_tool(
568
930
  name: 'pipeline_embed',
569
931
  description: 'Trigger embedding generation for extracted units. Checks rate limits before proceeding.',
@@ -573,29 +935,43 @@ module Woods
573
935
  }
574
936
  }
575
937
  ) do |server_context:, incremental: nil|
576
- next respond.call('Pipeline operator is not configured.') unless operator
938
+ next op_missing.call('pipeline_embed') unless operator
577
939
 
578
940
  guard = operator[:pipeline_guard]
579
- next respond.call('Embedding is rate-limited. Try again later.') if guard && !guard.allow?(:embedding)
941
+ if guard && !guard.allow?(:embedding)
942
+ next respond_err.call(
943
+ 'Embedding is rate-limited. Try again later.',
944
+ code: :rate_limited,
945
+ tool: 'pipeline_embed',
946
+ retry_after_seconds: 300
947
+ )
948
+ end
949
+
950
+ # Acquire the in-process lock first so a refused "already
951
+ # running" request doesn't burn the cooldown clock.
952
+ unless Woods::MCP::Server.send(:pipeline_start, :embedding)
953
+ next respond_err.call(
954
+ 'Embedding pipeline is already running. Wait for it to complete.',
955
+ code: :already_running,
956
+ tool: 'pipeline_embed'
957
+ )
958
+ end
580
959
 
581
960
  guard&.record!(:embedding)
582
961
 
583
962
  Thread.new do
584
- config = Woods.configuration
585
- builder = Woods::Builder.new(config)
586
- provider = builder.build_embedding_provider
587
- text_preparer = Woods::Embedding::TextPreparer.new
588
- vector_store = builder.build_vector_store
589
- indexer = Woods::Embedding::Indexer.new(
590
- provider: provider,
591
- text_preparer: text_preparer,
592
- vector_store: vector_store,
593
- output_dir: config.output_dir
594
- )
963
+ # Share the rake-task wiring so the MCP path picks up the
964
+ # provider-tuned TextPreparer + token-aware chunker. Without
965
+ # this, MCP-triggered embedding still hit Ollama's "input
966
+ # length exceeds context length" error after the rake path
967
+ # was fixed in PR #70.
968
+ indexer = Woods::Tasks.build_embed_indexer
595
969
  incremental ? indexer.index_incremental : indexer.index_all
596
970
  rescue StandardError => e
597
971
  logger = defined?(Rails) ? Rails.logger : Logger.new($stderr)
598
972
  logger.error("[Woods] Pipeline embed failed: #{e.message}")
973
+ ensure
974
+ Woods::MCP::Server.send(:pipeline_finish, :embedding)
599
975
  end
600
976
 
601
977
  respond.call(JSON.pretty_generate({
@@ -605,23 +981,50 @@ module Woods
605
981
  end
606
982
  end
607
983
 
608
- def define_pipeline_status_tool(server, operator, respond)
984
+ # Acquire a pipeline-kind lock atomically. Returns false when
985
+ # another thread is already running that kind of pipeline (so the
986
+ # caller can refuse the new request instead of racing the running
987
+ # pipeline). Module-level state — a single MCP server process
988
+ # serializes its own pipelines.
989
+ def pipeline_start(kind)
990
+ @pipeline_mutex ||= Mutex.new
991
+ @pipeline_in_flight ||= {}
992
+ @pipeline_mutex.synchronize do
993
+ return false if @pipeline_in_flight[kind]
994
+
995
+ @pipeline_in_flight[kind] = true
996
+ true
997
+ end
998
+ end
999
+
1000
+ def pipeline_finish(kind)
1001
+ @pipeline_mutex&.synchronize { @pipeline_in_flight&.delete(kind) }
1002
+ end
1003
+
1004
+ def define_pipeline_status_tool(server, operator, respond, respond_err, op_missing)
609
1005
  server.define_tool(
610
1006
  name: 'pipeline_status',
611
1007
  description: 'Get the current pipeline status: last extraction time, unit counts, staleness.',
612
1008
  input_schema: { type: 'object', properties: {} }
613
1009
  ) do |server_context:|
614
- next respond.call('Pipeline operator is not configured.') unless operator
1010
+ next op_missing.call('pipeline_status') unless operator
615
1011
 
616
1012
  reporter = operator[:status_reporter]
617
- next respond.call('Status reporter is not configured.') unless reporter
1013
+ unless reporter
1014
+ next respond_err.call(
1015
+ 'Status reporter is not configured.',
1016
+ code: :not_configured,
1017
+ config_key: 'operator.status_reporter',
1018
+ tool: 'pipeline_status'
1019
+ )
1020
+ end
618
1021
 
619
1022
  status = reporter.report
620
1023
  respond.call(JSON.pretty_generate(status))
621
1024
  end
622
1025
  end
623
1026
 
624
- def define_pipeline_diagnose_tool(server, operator, respond)
1027
+ def define_pipeline_diagnose_tool(server, operator, respond, respond_err, op_missing)
625
1028
  server.define_tool(
626
1029
  name: 'pipeline_diagnose',
627
1030
  description: 'Classify a recent pipeline error and suggest remediation.',
@@ -633,10 +1036,17 @@ module Woods
633
1036
  required: %w[error_class error_message]
634
1037
  }
635
1038
  ) do |error_class:, error_message:, server_context:|
636
- next respond.call('Pipeline operator is not configured.') unless operator
1039
+ next op_missing.call('pipeline_diagnose') unless operator
637
1040
 
638
1041
  escalator = operator[:error_escalator]
639
- next respond.call('Error escalator is not configured.') unless escalator
1042
+ unless escalator
1043
+ next respond_err.call(
1044
+ 'Error escalator is not configured.',
1045
+ code: :not_configured,
1046
+ config_key: 'operator.error_escalator',
1047
+ tool: 'pipeline_diagnose'
1048
+ )
1049
+ end
640
1050
 
641
1051
  error = StandardError.new(error_message)
642
1052
  # Set the class name in the error string for pattern matching
@@ -646,7 +1056,7 @@ module Woods
646
1056
  end
647
1057
  end
648
1058
 
649
- def define_pipeline_repair_tool(server, operator, respond)
1059
+ def define_pipeline_repair_tool(server, operator, respond, respond_err, op_missing)
650
1060
  server.define_tool(
651
1061
  name: 'pipeline_repair',
652
1062
  description: 'Attempt to repair pipeline state: clear stale locks, reset rate limits.',
@@ -661,7 +1071,7 @@ module Woods
661
1071
  required: ['action']
662
1072
  }
663
1073
  ) do |action:, server_context:|
664
- next respond.call('Pipeline operator is not configured.') unless operator
1074
+ next op_missing.call('pipeline_repair') unless operator
665
1075
 
666
1076
  case action
667
1077
  when 'clear_locks'
@@ -670,17 +1080,29 @@ module Woods
670
1080
  lock.release
671
1081
  respond.call(JSON.pretty_generate({ repaired: true, action: 'clear_locks' }))
672
1082
  else
673
- respond.call('Pipeline lock is not configured.')
1083
+ respond_err.call(
1084
+ 'Pipeline lock is not configured.',
1085
+ code: :not_configured,
1086
+ config_key: 'operator.pipeline_lock',
1087
+ tool: 'pipeline_repair'
1088
+ )
674
1089
  end
675
1090
  when 'reset_cooldowns'
676
1091
  respond.call(JSON.pretty_generate({ repaired: true, action: 'reset_cooldowns' }))
677
1092
  else
678
- respond.call("Unknown repair action: #{action}")
1093
+ respond_err.call(
1094
+ "Unknown repair action: #{action}",
1095
+ code: :unsupported_argument,
1096
+ tool: 'pipeline_repair',
1097
+ argument: 'action',
1098
+ value: action,
1099
+ allowed: %w[clear_locks reset_cooldowns]
1100
+ )
679
1101
  end
680
1102
  end
681
1103
  end
682
1104
 
683
- def define_retrieval_rate_tool(server, feedback_store, respond)
1105
+ def define_retrieval_rate_tool(server, feedback_store, respond, fb_missing)
684
1106
  coerce_int = method(:coerce_integer)
685
1107
  server.define_tool(
686
1108
  name: 'retrieval_rate',
@@ -694,7 +1116,7 @@ module Woods
694
1116
  required: %w[query score]
695
1117
  }
696
1118
  ) do |query:, score:, server_context:, comment: nil|
697
- next respond.call('Feedback store is not configured.') unless feedback_store
1119
+ next fb_missing.call('retrieval_rate') unless feedback_store
698
1120
 
699
1121
  score = coerce_int.call(score)
700
1122
  feedback_store.record_rating(query: query, score: score, comment: comment)
@@ -702,7 +1124,7 @@ module Woods
702
1124
  end
703
1125
  end
704
1126
 
705
- def define_retrieval_report_gap_tool(server, feedback_store, respond)
1127
+ def define_retrieval_report_gap_tool(server, feedback_store, respond, fb_missing)
706
1128
  server.define_tool(
707
1129
  name: 'retrieval_report_gap',
708
1130
  description: 'Report a missing unit that should have appeared in retrieval results.',
@@ -715,7 +1137,7 @@ module Woods
715
1137
  required: %w[query missing_unit unit_type]
716
1138
  }
717
1139
  ) do |query:, missing_unit:, unit_type:, server_context:|
718
- next respond.call('Feedback store is not configured.') unless feedback_store
1140
+ next fb_missing.call('retrieval_report_gap') unless feedback_store
719
1141
 
720
1142
  feedback_store.record_gap(query: query, missing_unit: missing_unit, unit_type: unit_type)
721
1143
  respond.call(JSON.pretty_generate({
@@ -726,13 +1148,13 @@ module Woods
726
1148
  end
727
1149
  end
728
1150
 
729
- def define_retrieval_explain_tool(server, feedback_store, respond)
1151
+ def define_retrieval_explain_tool(server, feedback_store, respond, fb_missing)
730
1152
  server.define_tool(
731
1153
  name: 'retrieval_explain',
732
1154
  description: 'Get feedback statistics: average score, total ratings, gap count.',
733
1155
  input_schema: { type: 'object', properties: {} }
734
1156
  ) do |server_context:|
735
- next respond.call('Feedback store is not configured.') unless feedback_store
1157
+ next fb_missing.call('retrieval_explain') unless feedback_store
736
1158
 
737
1159
  ratings = feedback_store.ratings
738
1160
  gaps = feedback_store.gaps
@@ -746,13 +1168,13 @@ module Woods
746
1168
  end
747
1169
  end
748
1170
 
749
- def define_retrieval_suggest_tool(server, feedback_store, respond)
1171
+ def define_retrieval_suggest_tool(server, feedback_store, respond, fb_missing)
750
1172
  server.define_tool(
751
1173
  name: 'retrieval_suggest',
752
1174
  description: 'Analyze feedback to suggest improvements: detect patterns in low scores and missing units.',
753
1175
  input_schema: { type: 'object', properties: {} }
754
1176
  ) do |server_context:|
755
- next respond.call('Feedback store is not configured.') unless feedback_store
1177
+ next fb_missing.call('retrieval_suggest') unless feedback_store
756
1178
 
757
1179
  require_relative '../feedback/gap_detector'
758
1180
  detector = Woods::Feedback::GapDetector.new(feedback_store: feedback_store)
@@ -764,14 +1186,14 @@ module Woods
764
1186
  end
765
1187
  end
766
1188
 
767
- def define_snapshot_tools(server, snapshot_store, respond)
768
- define_list_snapshots_tool(server, snapshot_store, respond)
769
- define_snapshot_diff_tool(server, snapshot_store, respond)
770
- define_unit_history_tool(server, snapshot_store, respond)
771
- define_snapshot_detail_tool(server, snapshot_store, respond)
1189
+ def define_snapshot_tools(server, snapshot_store, respond, respond_err, snap_missing)
1190
+ define_list_snapshots_tool(server, snapshot_store, respond, snap_missing)
1191
+ define_snapshot_diff_tool(server, snapshot_store, respond, snap_missing)
1192
+ define_unit_history_tool(server, snapshot_store, respond, snap_missing)
1193
+ define_snapshot_detail_tool(server, snapshot_store, respond, respond_err, snap_missing)
772
1194
  end
773
1195
 
774
- def define_list_snapshots_tool(server, snapshot_store, respond)
1196
+ def define_list_snapshots_tool(server, snapshot_store, respond, snap_missing)
775
1197
  coerce_int = method(:coerce_integer)
776
1198
  server.define_tool(
777
1199
  name: 'list_snapshots',
@@ -783,7 +1205,7 @@ module Woods
783
1205
  }
784
1206
  }
785
1207
  ) do |server_context:, limit: nil, branch: nil|
786
- next respond.call('Snapshot store is not configured. Set enable_snapshots: true.') unless snapshot_store
1208
+ next snap_missing.call('list_snapshots') unless snapshot_store
787
1209
 
788
1210
  limit = coerce_int.call(limit)
789
1211
  results = snapshot_store.list(limit: limit || 20, branch: branch)
@@ -791,7 +1213,7 @@ module Woods
791
1213
  end
792
1214
  end
793
1215
 
794
- def define_snapshot_diff_tool(server, snapshot_store, respond)
1216
+ def define_snapshot_diff_tool(server, snapshot_store, respond, snap_missing)
795
1217
  server.define_tool(
796
1218
  name: 'snapshot_diff',
797
1219
  description: 'Compare two extraction snapshots by git SHA. Returns lists of added, modified, and deleted units.',
@@ -803,7 +1225,7 @@ module Woods
803
1225
  required: %w[sha_a sha_b]
804
1226
  }
805
1227
  ) do |sha_a:, sha_b:, server_context:|
806
- next respond.call('Snapshot store is not configured. Set enable_snapshots: true.') unless snapshot_store
1228
+ next snap_missing.call('snapshot_diff') unless snapshot_store
807
1229
 
808
1230
  result = snapshot_store.diff(sha_a, sha_b)
809
1231
  respond.call(JSON.pretty_generate({
@@ -816,7 +1238,7 @@ module Woods
816
1238
  end
817
1239
  end
818
1240
 
819
- def define_unit_history_tool(server, snapshot_store, respond)
1241
+ def define_unit_history_tool(server, snapshot_store, respond, snap_missing)
820
1242
  coerce_int = method(:coerce_integer)
821
1243
  server.define_tool(
822
1244
  name: 'unit_history',
@@ -829,7 +1251,7 @@ module Woods
829
1251
  required: ['identifier']
830
1252
  }
831
1253
  ) do |identifier:, server_context:, limit: nil|
832
- next respond.call('Snapshot store is not configured. Set enable_snapshots: true.') unless snapshot_store
1254
+ next snap_missing.call('unit_history') unless snapshot_store
833
1255
 
834
1256
  limit = coerce_int.call(limit)
835
1257
  entries = snapshot_store.unit_history(identifier, limit: limit || 20)
@@ -841,7 +1263,7 @@ module Woods
841
1263
  end
842
1264
  end
843
1265
 
844
- def define_snapshot_detail_tool(server, snapshot_store, respond)
1266
+ def define_snapshot_detail_tool(server, snapshot_store, respond, respond_err, snap_missing)
845
1267
  server.define_tool(
846
1268
  name: 'snapshot_detail',
847
1269
  description: 'Get full metadata for a specific extraction snapshot by git SHA.',
@@ -852,18 +1274,24 @@ module Woods
852
1274
  required: ['git_sha']
853
1275
  }
854
1276
  ) do |git_sha:, server_context:|
855
- next respond.call('Snapshot store is not configured. Set enable_snapshots: true.') unless snapshot_store
1277
+ next snap_missing.call('snapshot_detail') unless snapshot_store
856
1278
 
857
1279
  snapshot = snapshot_store.find(git_sha)
858
1280
  if snapshot
859
1281
  respond.call(JSON.pretty_generate(snapshot))
860
1282
  else
861
- respond.call("Snapshot not found for git SHA: #{git_sha}")
1283
+ respond_err.call(
1284
+ "Snapshot not found for git SHA: #{git_sha}",
1285
+ code: :not_found,
1286
+ tool: 'snapshot_detail',
1287
+ git_sha: git_sha,
1288
+ hint: 'Use `list_snapshots` to see available SHAs.'
1289
+ )
862
1290
  end
863
1291
  end
864
1292
  end
865
1293
 
866
- def define_notion_sync_tool(server, reader, index_dir, respond)
1294
+ def define_notion_sync_tool(server, reader, index_dir, respond, respond_err)
867
1295
  server.define_tool(
868
1296
  name: 'notion_sync',
869
1297
  description: 'Sync extracted codebase data (Data Models + Columns) to Notion databases. ' \
@@ -875,11 +1303,23 @@ module Woods
875
1303
  ) do |server_context:|
876
1304
  config = Woods.configuration
877
1305
  unless config.notion_api_token
878
- next respond.call('Error: notion_api_token is not configured. Set it in Woods.configure.')
1306
+ next respond_err.call(
1307
+ 'notion_api_token is not configured. Set it in Woods.configure or via the NOTION_API_TOKEN env var.',
1308
+ code: :not_configured,
1309
+ config_key: 'notion_api_token',
1310
+ doc_link: 'docs/NOTION_EXPORT.md',
1311
+ tool: 'notion_sync'
1312
+ )
879
1313
  end
880
1314
 
881
1315
  if (config.notion_database_ids || {}).empty?
882
- next respond.call('Error: notion_database_ids is not configured. Set it in Woods.configure.')
1316
+ next respond_err.call(
1317
+ 'notion_database_ids is not configured. Set it in Woods.configure.',
1318
+ code: :not_configured,
1319
+ config_key: 'notion_database_ids',
1320
+ doc_link: 'docs/NOTION_EXPORT.md',
1321
+ tool: 'notion_sync'
1322
+ )
883
1323
  end
884
1324
 
885
1325
  require_relative '../notion/exporter'
@@ -893,7 +1333,11 @@ module Woods
893
1333
  errors: stats[:errors].first(10)
894
1334
  }))
895
1335
  rescue StandardError => e
896
- respond.call("Notion sync failed: #{e.message}")
1336
+ respond_err.call(
1337
+ "Notion sync failed: #{e.message}",
1338
+ code: :api_error,
1339
+ tool: 'notion_sync'
1340
+ )
897
1341
  end
898
1342
  end
899
1343
 
@@ -931,6 +1375,196 @@ module Woods
931
1375
  ]
932
1376
  end
933
1377
 
1378
# Register the `woods_status` diagnostic tool on +server+.
#
# The tool takes no input. Its handler assembles the status payload via
# the module-level Woods::MCP::Server.build_status (index metadata,
# retriever config, bootstrap state, feature flags, ready flag) and hands
# the pretty-printed JSON to +respond+.
def define_woods_status_tool(server, reader, retriever, index_dir, bootstrap_state, respond)
  description =
    'Diagnose whether the Woods index and server are healthy. Returns extraction metadata ' \
    '(last run, unit counts, git SHA, staleness in seconds), retriever/embedding configuration, ' \
    'bootstrap state (hydrated / degraded / failed + reason), feature flags, and a ready flag. ' \
    'Call this first on cold connect to learn what the server knows.'

  server.define_tool(
    name: 'woods_status',
    description: description,
    input_schema: { type: 'object', properties: {} }
  ) do |server_context:|
    _ = server_context # required keyword, unused by this tool
    payload = Woods::MCP::Server.build_status(
      reader: reader,
      retriever: retriever,
      index_dir: index_dir,
      bootstrap_state: bootstrap_state
    )
    respond.call(JSON.pretty_generate(payload))
  end
end
1395
+
1396
+ public
1397
+
1398
# Build the woods_status payload. Exposed at module level so specs (and future
# console/unified-server entry points) can assemble the same shape without
# reaching through the MCP::Server internals.
#
# +features.embedding_model+ / +features.embedding_provider+ /
# +features.vector_store+ prefer the ResolvedConfig captured at embed time
# (+bootstrap_state.resolved_config+, which is read back from +woods.json+)
# over +Woods.configuration+, whose defaults can contradict the actual
# provider in use. Without this, operators debugging "wrong provider" see
# status claiming +embedding_model: "text-embedding-3-small"+ next to
# +embedding_provider: "ollama"+ and reasonably distrust every field.
def build_status(reader:, retriever:, index_dir:, bootstrap_state: nil)
  manifest = safe_manifest(reader)
  extracted_at = manifest && manifest['extracted_at']
  staleness = staleness_seconds(extracted_at)
  # Tolerate a nil Woods.configuration — specs that reset it between
  # runs can leave a transient nil window, and build_status should
  # still produce a readable payload during that window.
  config = Woods.configuration || Woods::Configuration.new
  resolved = bootstrap_state&.resolved_config

  {
    # Coerce to a strict boolean. Without `!!`, an unreadable manifest
    # leaked `nil` (serialised as JSON null) through a field advertised
    # as a flag, breaking clients that branch on `ready == false`.
    ready: !!(manifest && !manifest['counts'].to_h.empty?),
    server: {
      name: 'woods',
      version: Woods::VERSION,
      index_dir: index_dir.to_s
    },
    index: index_section(manifest, extracted_at, staleness, index_dir),
    retriever: {
      configured: !retriever.nil?,
      class: retriever&.class&.name
    },
    bootstrap: bootstrap_state&.to_h,
    features: features_from(config, resolved)
  }
end
1435
+
1436
+ private
1437
+
1438
# Assemble the +index+ sub-hash of woods_status, including a staleness
# gate that compares +manifest.git_sha+ against the current HEAD. The
# manifest captures +git_sha+ / +gemfile_lock_sha+ / +schema_sha+ at
# extraction time; comparing against the live working tree lets agents
# notice when answers predate recent local changes.
#
# +git_sha_matches_head+ is a tri-state:
#   - true  — manifest.git_sha == current HEAD
#   - false — mismatch (stale)
#   - absent/nil — couldn't resolve (not a git repo, git unavailable,
#     or manifest has no git_sha)
#
# When resolvable, +head_git_sha+ carries the live HEAD so operators can
# diff directly. This is an observability signal, not a hard gate —
# a loudly-visible staleness flag is less disruptive than refusing to
# answer.
def index_section(manifest, extracted_at, staleness, index_dir)
  # Small reader that tolerates a nil manifest for every field.
  read = ->(key) { manifest && manifest[key] }

  section = {
    extracted_at: extracted_at,
    staleness_seconds: staleness,
    rails_version: read.call('rails_version'),
    ruby_version: read.call('ruby_version'),
    total_units: read.call('total_units'),
    counts: read.call('counts') || {},
    git_sha: read.call('git_sha'),
    git_branch: read.call('git_branch'),
    gemfile_lock_sha: read.call('gemfile_lock_sha'),
    schema_sha: read.call('schema_sha')
  }

  manifest_sha = read.call('git_sha')
  head_sha = manifest_sha ? resolve_head_sha(index_dir) : nil
  if head_sha
    section[:head_git_sha] = head_sha
    section[:git_sha_matches_head] = (manifest_sha == head_sha)
  end
  section
end
1477
+
1478
# Resolve the current HEAD SHA for the git repo containing +index_dir+.
# Returns nil when git is unavailable or +index_dir+ is not in a repo —
# callers treat nil as "can't compare" rather than "mismatch".
#
# Uses +capture2e+ so git's "fatal: not a git repository" stderr banner
# does not leak through the MCP stdio transport. MCP clients that parse
# stderr for protocol framing can't tolerate stray lines.
def resolve_head_sha(index_dir)
  return nil unless index_dir

  repo_dir = index_dir.to_s
  return nil unless File.directory?(repo_dir)

  combined_output, exit_status = Open3.capture2e('git', '-C', repo_dir, 'rev-parse', 'HEAD')
  if exit_status.success?
    combined_output.strip
  else
    nil
  end
rescue Errno::ENOENT, Errno::EACCES
  # git not installed or not executable on this host — equivalent to
  # "can't compare". Any other exception is a genuine bug and should
  # propagate.
  nil
end
1499
+
1500
# Assemble the +features+ sub-hash of woods_status, preferring the
# ResolvedConfig captured at embed time over live {Woods::Configuration}.
#
# Fields that read from resolved config (when present): embedding_model,
# embedding_provider, vector_store. Everything else is host-process
# state (snapshots_enabled, notion_configured, session_tracer_enabled)
# and comes from the running config.
#
# +console_mcp_enabled+ is intentionally omitted — the index MCP process
# has no visibility into the host Rails app's Woods initializer, so
# historic status payloads always reported +false+ regardless of the
# actual console MCP state. Advertising a misleading field is worse
# than not advertising it at all.
def features_from(config, resolved)
  provider_info = resolved&.embedding_provider || {}
  provider_sym = resolved_provider_symbol(provider_info[:class])
  resolved_model = provider_info[:model]
  resolved_vector = resolved&.stores&.dig(:vector_store)

  # Read an optional attribute off +config+, nil when undefined — configs
  # from older gem versions may lack newer accessors.
  cfg = ->(key) { config.respond_to?(key) ? config.public_send(key) : nil }

  {
    embedding_model: resolved_model || cfg.call(:embedding_model),
    embedding_provider: presence(provider_sym || cfg.call(:embedding_provider)),
    vector_store: presence(resolved_vector || cfg.call(:vector_store)),
    session_tracer_enabled: config.respond_to?(:session_tracer_enabled) ? config.session_tracer_enabled : false,
    snapshots_enabled: config.respond_to?(:enable_snapshots) ? config.enable_snapshots : false,
    notion_configured: config.respond_to?(:notion_api_token) && !presence(config.notion_api_token).nil?
  }
end
1530
+
1531
# Convert a fully-qualified provider class name (as serialised in
# woods.json — e.g. +"Woods::Embedding::Provider::Ollama"+) into the
# short symbol form used by +Woods.configuration.embedding_provider+
# (+:ollama+, +:openai+). Returns nil when +class_name+ is unknown or
# absent so callers fall back to the live config value.
def resolved_provider_symbol(class_name)
  return nil if class_name.nil? || class_name.empty?

  if class_name.end_with?('Ollama')
    :ollama
  elsif class_name.end_with?('OpenAI')
    :openai
  end
end
1544
+
1545
# Fetch the manifest from +reader+, swallowing any read failure.
# Returns the manifest Hash, or nil when it is unreadable so callers
# can degrade gracefully instead of crashing the status tool.
def safe_manifest(reader)
  begin
    reader.manifest
  rescue StandardError
    nil
  end
end
1551
+
1552
# Seconds elapsed since extraction, as an Integer.
# Returns nil when the timestamp is missing, blank, or unparsable.
def staleness_seconds(iso8601)
  return nil if iso8601.nil? || iso8601.empty?

  extracted = Time.parse(iso8601)
  (Time.now - extracted).to_i
rescue ArgumentError
  # Time.parse raises ArgumentError on malformed input — treat as unknown.
  nil
end
1560
+
1561
# Rails-style presence helper: nil for nil or empty values, otherwise
# the value's string form (symbols and numbers become strings too).
def presence(value)
  blank = value.nil? || (value.respond_to?(:empty?) && value.empty?)
  blank ? nil : value.to_s
end
1567
+
934
1568
  def register_resource_handler(server, reader)
935
1569
  server.resources_read_handler do |params|
936
1570
  uri = params[:uri]