woods 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +169 -0
  3. data/README.md +20 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +15 -50
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +6 -0
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +3 -4
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +737 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +1 -1
  102. data/lib/woods/unblocked/document_builder.rb +35 -10
  103. data/lib/woods/unblocked/exporter.rb +1 -1
  104. data/lib/woods/util/host_guard.rb +61 -0
  105. data/lib/woods/version.rb +1 -1
  106. data/lib/woods.rb +126 -6
  107. metadata +69 -4
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'active_support/core_ext/object/blank'
3
4
  require 'active_support/core_ext/string/inflections'
4
5
  require 'digest'
5
6
  require 'json'
@@ -52,6 +53,34 @@ module Woods
52
53
  @identifier_map = nil
53
54
  end
54
55
 
56
+ # Pre-populate cached state so the first MCP tool call doesn't pay
57
+ # for disk reads + JSON parsing.
58
+ #
59
+ # Touches every lazy accessor: manifest, summary, dependency_graph,
60
+ # graph_analysis, and the identifier_map (which reads all _index.json
61
+ # files). Each step is individually rescued so a missing optional
62
+ # artefact (e.g. graph_analysis.json) never blocks the rest.
63
+ #
64
+ # Safe to call multiple times — lazy accessors short-circuit on the
65
+ # memoized value.
66
+ #
67
+ # @return [Hash] Per-step outcome: `{step => true | Exception}`
68
+ def warmup!
69
+ steps = {
70
+ manifest: -> { manifest },
71
+ summary: -> { summary },
72
+ dependency_graph: -> { dependency_graph },
73
+ graph_analysis: -> { graph_analysis },
74
+ identifier_map: -> { identifier_map }
75
+ }
76
+ steps.each_with_object({}) do |(step, runner), result|
77
+ runner.call
78
+ result[step] = true
79
+ rescue StandardError => e
80
+ result[step] = e
81
+ end
82
+ end
83
+
55
84
  # Clear all cached state so the next access re-reads from disk.
56
85
  #
57
86
  # @return [void]
@@ -65,6 +94,7 @@ module Woods
65
94
  @dependency_graph = nil
66
95
  @graph_analysis = nil
67
96
  @raw_graph_data = nil
97
+ @normalized_graph_edges = nil
68
98
  end
69
99
 
70
100
  # @return [Hash] Parsed manifest.json
@@ -72,6 +102,17 @@ module Woods
72
102
  @manifest ||= parse_json('manifest.json')
73
103
  end
74
104
 
105
+ # Template engines the extraction pipeline currently understands.
106
+ # Delegates to {ViewTemplateExtractor.supported_template_engines} so
107
+ # the list stays honest as engines are added or removed. Surfaced by
108
+ # the MCP `structure` tool (#86).
109
+ #
110
+ # @return [Array<Symbol>]
111
+ def template_engines
112
+ require_relative '../extractors/view_template_extractor'
113
+ Woods::Extractors::ViewTemplateExtractor.supported_template_engines.dup
114
+ end
115
+
75
116
  # @return [String, nil] SUMMARY.md content, or nil if not present
76
117
  def summary
77
118
  @summary ||= begin
@@ -119,19 +160,57 @@ module Woods
119
160
  dirs.flat_map { |dir| read_index(dir) }
120
161
  end
121
162
 
163
+ # Default maximum number of unit files to load during phase-2 search.
164
+ # Override with WOODS_SEARCH_MAX_SCAN env var.
165
+ DEFAULT_SEARCH_MAX_SCAN = 500
166
+
122
167
  # Search units by case-insensitive pattern.
123
168
  #
124
169
  # Phase 1: match identifiers from index files (cheap).
125
170
  # Phase 2: lazy-load unit files for metadata/source_code matching.
126
171
  #
127
- # @param query [String] Search pattern (treated as case-insensitive regex)
172
+ # The query is compiled as a raw Ruby regex with IGNORECASE. If the pattern
173
+ # is invalid, it falls back to an escaped literal match.
174
+ #
175
+ # A "broad" pattern is one that matches more than 50% of the entries in a
176
+ # type directory. Broad patterns still run but the result includes a :note.
177
+ #
178
+ # Phase-2 scan is capped at WOODS_SEARCH_MAX_SCAN unit files (default 500).
179
+ # When the cap is reached the result includes :partial => true.
180
+ #
181
+ # The optional +exact_prefix+ / +exact_suffix+ filters restrict results to
182
+ # identifiers whose start/end matches the given string literally (case-
183
+ # insensitive). They are ANDed with the +query+ regex and are safer than
184
+ # hand-escaping regex anchors — metacharacters like +::+ are treated as
185
+ # literal text.
186
+ #
187
+ # @param query [String, nil] Search pattern (case-insensitive regex). Optional when
188
+ # +exact_prefix+ or +exact_suffix+ is provided; otherwise required.
128
189
  # @param types [Array<String>, nil] Filter to these singular type names
129
190
  # @param fields [Array<String>] Fields to search: "identifier", "metadata", "source_code"
130
191
  # @param limit [Integer] Maximum results to return
131
- # @return [Array<Hash>] Matches with :identifier, :type, :match_field keys
132
- def search(query, types: nil, fields: %w[identifier], limit: 20)
133
- pattern = Regexp.new(Regexp.escape(query), Regexp::IGNORECASE)
192
+ # @param exact_prefix [String, nil] Literal identifier prefix filter (case-insensitive)
193
+ # @param exact_suffix [String, nil] Literal identifier suffix filter (case-insensitive)
194
+ # @return [Hash] { results: Array<Hash>, note: String|nil, partial: Boolean }
195
+ # @raise [ArgumentError] when all of query, exact_prefix, and exact_suffix are blank
196
+ def search(query = nil, types: nil, fields: %w[identifier], limit: 20, exact_prefix: nil, exact_suffix: nil)
197
+ prefix = exact_prefix.blank? ? nil : exact_prefix.downcase
198
+ suffix = exact_suffix.blank? ? nil : exact_suffix.downcase
199
+ if query.blank? && !prefix && !suffix
200
+ raise ArgumentError, 'search requires a query or exact_prefix/exact_suffix filter'
201
+ end
202
+
203
+ # When only prefix/suffix are provided, the regex acts as a match-all
204
+ # wildcard so the existing phase-1/phase-2 pipeline still works.
205
+ pattern = compile_search_pattern(query.to_s.empty? ? '.*' : query)
206
+ max_scan_env = ENV.fetch('WOODS_SEARCH_MAX_SCAN', '').to_s.strip
207
+ max_scan = max_scan_env.empty? ? DEFAULT_SEARCH_MAX_SCAN : max_scan_env.to_i
208
+ max_scan = DEFAULT_SEARCH_MAX_SCAN if max_scan <= 0
209
+
134
210
  results = []
211
+ notes = []
212
+ phase2_scanned = 0
213
+ partial = false
135
214
 
136
215
  dirs = if types
137
216
  types.filter_map { |t| TYPE_TO_DIR[t] }
@@ -139,36 +218,85 @@ module Woods
139
218
  TYPE_DIRS
140
219
  end
141
220
 
221
+ # Phase 2 candidates are collected per-dir and then scanned in
222
+ # round-robin across dirs. Exhausting the per-run scan cap linearly
223
+ # down TYPE_DIRS order would starve later types (`concerns` at pos
224
+ # 13, `test_mappings` at pos 31) on any codebase where the earlier
225
+ # dirs together exceed max_scan entries. Interleaving guarantees
226
+ # every type contributes to the scanned set.
227
+ phase2_queues = {}
228
+
142
229
  dirs.each do |dir|
143
230
  type_name = DIR_TO_TYPE[dir]
144
231
  entries = read_index(dir)
145
232
 
146
- entries.each do |entry|
147
- break if results.size >= limit
233
+ # Broad-match detection: warn when pattern matches >50% of dir entries
234
+ if entries.size > 1
235
+ matching_count = entries.count do |e|
236
+ identifier_passes_filters?(e['identifier'], pattern, prefix, suffix)
237
+ end
238
+ if matching_count > entries.size / 2.0
239
+ notes << "broad pattern matched #{matching_count}/#{entries.size} entries in #{dir}"
240
+ end
241
+ end
148
242
 
243
+ entries.each do |entry|
149
244
  id = entry['identifier']
245
+ next unless identifier_passes_prefix_suffix?(id, prefix, suffix)
150
246
 
151
- # Phase 1: identifier matching
247
+ # Phase 1: identifier matching (still in-order per dir)
152
248
  if fields.include?('identifier') && pattern.match?(id)
249
+ next if results.size >= limit
250
+
153
251
  results << { identifier: id, type: type_name, match_field: 'identifier' }
154
252
  next
155
253
  end
156
254
 
157
- # Phase 2: metadata/source_code matching (requires loading full unit)
255
+ # Phase 2 is only reached when the caller opted into deeper fields.
158
256
  next unless fields.include?('metadata') || fields.include?('source_code')
159
257
 
160
- unit = find_unit(id)
161
- next unless unit
258
+ (phase2_queues[dir] ||= []) << [type_name, id]
259
+ end
260
+ end
261
+
262
+ if results.size < limit && phase2_queues.any?
263
+ queues = phase2_queues.values.map(&:dup)
264
+ catch(:phase2_done) do
265
+ loop do
266
+ progressed = false
267
+ queues.each do |queue|
268
+ next if queue.empty?
269
+
270
+ throw :phase2_done if results.size >= limit
271
+
272
+ if phase2_scanned >= max_scan
273
+ partial = true
274
+ throw :phase2_done
275
+ end
276
+
277
+ type_name, id = queue.shift
278
+ progressed = true
162
279
 
163
- if fields.include?('source_code') && unit['source_code'] && pattern.match?(unit['source_code'])
164
- results << { identifier: id, type: type_name, match_field: 'source_code' }
165
- elsif fields.include?('metadata') && unit['metadata'] && pattern.match?(unit['metadata'].to_json)
166
- results << { identifier: id, type: type_name, match_field: 'metadata' }
280
+ unit = find_unit(id)
281
+ next unless unit
282
+
283
+ phase2_scanned += 1
284
+
285
+ if fields.include?('source_code') && unit['source_code'] && pattern.match?(unit['source_code'])
286
+ results << { identifier: id, type: type_name, match_field: 'source_code' }
287
+ elsif fields.include?('metadata') && unit['metadata'] && pattern.match?(unit['metadata'].to_json)
288
+ results << { identifier: id, type: type_name, match_field: 'metadata' }
289
+ end
290
+ end
291
+ break unless progressed
167
292
  end
168
293
  end
169
294
  end
170
295
 
171
- results.first(limit)
296
+ response = { results: results.first(limit) }
297
+ response[:note] = notes.join('; ') unless notes.empty?
298
+ response[:partial] = true if partial
299
+ response
172
300
  end
173
301
 
174
302
  # BFS traversal of forward dependencies.
@@ -176,9 +304,10 @@ module Woods
176
304
  # @param identifier [String] Starting unit identifier
177
305
  # @param depth [Integer] Maximum traversal depth
178
306
  # @param types [Array<String>, nil] Filter to these singular type names
307
+ # @param via [Array<String>, nil] Filter to these relationship types (e.g. ["link_to", "redirect_to"])
179
308
  # @return [Hash] { root:, nodes: { id => { type:, depth:, deps: [] } } }
180
- def traverse_dependencies(identifier, depth: 2, types: nil)
181
- traverse(identifier, depth: depth, types: types, direction: :forward)
309
+ def traverse_dependencies(identifier, depth: 2, types: nil, via: nil)
310
+ traverse(identifier, depth: depth, types: types, via: via, direction: :forward)
182
311
  end
183
312
 
184
313
  # BFS traversal of reverse dependencies (dependents).
@@ -186,9 +315,10 @@ module Woods
186
315
  # @param identifier [String] Starting unit identifier
187
316
  # @param depth [Integer] Maximum traversal depth
188
317
  # @param types [Array<String>, nil] Filter to these singular type names
318
+ # @param via [Array<String>, nil] Filter to these relationship types (e.g. ["link_to", "redirect_to"])
189
319
  # @return [Hash] { root:, nodes: { id => { type:, depth:, deps: [] } } }
190
- def traverse_dependents(identifier, depth: 2, types: nil)
191
- traverse(identifier, depth: depth, types: types, direction: :reverse)
320
+ def traverse_dependents(identifier, depth: 2, types: nil, via: nil)
321
+ traverse(identifier, depth: depth, types: types, via: via, direction: :reverse)
192
322
  end
193
323
 
194
324
  # Search rails_source units by concept keyword.
@@ -200,7 +330,12 @@ module Woods
200
330
  # @param limit [Integer] Maximum results to return
201
331
  # @return [Array<Hash>] Matching rails_source unit summaries
202
332
  def framework_sources(keyword, limit: 20)
203
- pattern = Regexp.new(Regexp.escape(keyword), Regexp::IGNORECASE)
333
+ # Multi-word keywords ("ActiveRecord callbacks") are split on
334
+ # whitespace and ANDed. Single-word queries behave as before.
335
+ tokens = keyword.to_s.strip.split(/\s+/)
336
+ return [] if tokens.empty?
337
+
338
+ patterns = tokens.map { |t| Regexp.new(Regexp.escape(t), Regexp::IGNORECASE) }
204
339
  results = []
205
340
 
206
341
  entries = read_index('rails_source')
@@ -211,9 +346,12 @@ module Woods
211
346
  unit = find_unit(id)
212
347
  next unless unit
213
348
 
214
- matched = pattern.match?(id) ||
215
- (unit['source_code'] && pattern.match?(unit['source_code'])) ||
216
- (unit['metadata'] && pattern.match?(unit['metadata'].to_json))
349
+ metadata_json = unit['metadata']&.to_json
350
+ matched = patterns.all? do |pat|
351
+ pat.match?(id) ||
352
+ (unit['source_code'] && pat.match?(unit['source_code'])) ||
353
+ (metadata_json && pat.match?(metadata_json))
354
+ end
217
355
 
218
356
  next unless matched
219
357
 
@@ -259,7 +397,8 @@ module Woods
259
397
  identifier: id,
260
398
  type: DIR_TO_TYPE[dir],
261
399
  file_path: unit['file_path'],
262
- last_modified: last_modified
400
+ last_modified: last_modified,
401
+ author: unit.dig('metadata', 'git', 'last_author')
263
402
  }
264
403
  end
265
404
  end
@@ -277,6 +416,46 @@ module Woods
277
416
 
278
417
  private
279
418
 
419
+ # Compile a case-insensitive regex from a query string.
420
+ #
421
+ # Treats the query as a raw Ruby regex pattern. Falls back to an escaped
422
+ # literal match (with a :note field added by callers) when the pattern is
423
+ # invalid.
424
+ #
425
+ # @param query [String] Raw regex pattern
426
+ # @return [Regexp] Compiled case-insensitive pattern
427
+ def compile_search_pattern(query)
428
+ Regexp.new(query, Regexp::IGNORECASE)
429
+ rescue RegexpError
430
+ Regexp.new(Regexp.escape(query), Regexp::IGNORECASE)
431
+ end
432
+
433
+ # Case-insensitive literal prefix/suffix check on an identifier.
434
+ # Nil filters are treated as "no restriction".
435
+ def identifier_passes_prefix_suffix?(identifier, prefix, suffix)
436
+ return false unless identifier
437
+
438
+ downcased = identifier.downcase
439
+ return false if prefix && !downcased.start_with?(prefix)
440
+ return false if suffix && !downcased.end_with?(suffix)
441
+
442
+ true
443
+ end
444
+
445
+ # Combined regex + prefix/suffix check used only by broad-match detection,
446
+ # which reports how many identifiers would actually surface.
447
+ def identifier_passes_filters?(identifier, pattern, prefix, suffix)
448
+ return false unless identifier_passes_prefix_suffix?(identifier, prefix, suffix)
449
+
450
+ pattern.match?(identifier)
451
+ end
452
+
453
+ # Memoized normalized edges — converts bare strings (old format) to hashes once.
454
+ # Cleared by reload! alongside raw_graph_data.
455
+ def normalized_graph_edges
456
+ @normalized_graph_edges ||= normalize_all_edges(raw_graph_data['edges'] || {})
457
+ end
458
+
280
459
  # Build identifier → { type_dir, filename } map from all _index.json files.
281
460
  def identifier_map
282
461
  @identifier_map ||= build_identifier_map
@@ -340,13 +519,28 @@ module Woods
340
519
  end
341
520
 
342
521
  # BFS traversal in either direction.
343
- def traverse(identifier, depth:, types:, direction:)
522
+ #
523
+ # Edges may be stored as bare strings (old format) or as
524
+ # +{"target" => "...", "via" => "..."}+ hashes (new format).
525
+ # This method handles both transparently.
526
+ #
527
+ # @param identifier [String] Starting unit identifier
528
+ # @param depth [Integer] Maximum traversal depth
529
+ # @param types [Array<String>, nil] Filter to these unit type names
530
+ # @param via [Array<String>, nil] Filter to these relationship types
531
+ # @param direction [:forward, :reverse] Traversal direction
532
+ # @return [Hash]
533
+ def traverse(identifier, depth:, types:, via:, direction:)
344
534
  graph_data = raw_graph_data
345
535
  nodes_data = graph_data['nodes'] || {}
346
536
 
347
537
  return { root: identifier, found: false, nodes: {} } unless nodes_data.key?(identifier)
348
538
 
539
+ # Normalize edges once per graph load — memoized alongside raw_graph_data
540
+ normalized_edges = normalized_graph_edges
541
+
349
542
  type_set = types&.to_set
543
+ via_set = via&.to_set
350
544
  visited = Set.new([identifier])
351
545
  queue = [[identifier, 0]]
352
546
  result_nodes = {}
@@ -355,12 +549,12 @@ module Woods
355
549
  current, current_depth = queue.shift
356
550
 
357
551
  neighbors = if direction == :forward
358
- (graph_data['edges'] || {})[current] || []
552
+ resolve_forward_neighbors(normalized_edges, current, via_set)
359
553
  else
360
- (graph_data['reverse'] || {})[current] || []
554
+ resolve_reverse_neighbors(graph_data, normalized_edges, current, via_set)
361
555
  end
362
556
 
363
- # Filter by type if requested
557
+ # Filter by node type if requested
364
558
  filtered = if type_set
365
559
  neighbors.select do |n|
366
560
  node_meta = nodes_data[n]
@@ -370,14 +564,18 @@ module Woods
370
564
  neighbors
371
565
  end
372
566
 
567
+ # At max depth, record the node with empty deps so the renderer
568
+ # doesn't emit an extra level of unexpanded neighbors. The parent
569
+ # node's deps list already shows this node as a child.
570
+ will_expand = current_depth < depth
373
571
  node_meta = nodes_data[current]
374
572
  result_nodes[current] = {
375
573
  type: node_meta&.dig('type'),
376
574
  depth: current_depth,
377
- deps: filtered
575
+ deps: will_expand ? filtered : []
378
576
  }
379
577
 
380
- next if current_depth >= depth
578
+ next unless will_expand
381
579
 
382
580
  filtered.each do |neighbor|
383
581
  unless visited.include?(neighbor)
@@ -389,6 +587,43 @@ module Woods
389
587
 
390
588
  { root: identifier, found: true, nodes: result_nodes }
391
589
  end
590
+
591
+ # Normalize all edge arrays once, converting bare strings to hashes.
592
+ #
593
+ # NOTE: This uses string keys ('target', 'via') because IndexReader
594
+ # operates on parsed JSON. DependencyGraph.normalize_edges uses symbol
595
+ # keys (:target, :via) for in-memory Ruby objects. The two normalizers
596
+ # are intentionally separate — do not merge them.
597
+ #
598
+ # @param raw_edges [Hash] Raw edges from graph JSON
599
+ # @return [Hash] Edges with all entries as { 'target' => ..., 'via' => ... } hashes
600
+ def normalize_all_edges(raw_edges)
601
+ raw_edges.transform_values do |entries|
602
+ entries.map { |e| e.is_a?(Hash) ? e : { 'target' => e } }
603
+ end
604
+ end
605
+
606
+ # Extract forward neighbor identifiers, optionally filtered by via type.
607
+ # Expects pre-normalized edges (all entries are hashes).
608
+ def resolve_forward_neighbors(normalized_edges, identifier, via_set)
609
+ edges = normalized_edges[identifier] || []
610
+ edges = edges.select { |e| via_set.include?(e['via']) } if via_set
611
+ edges.map { |e| e['target'] }
612
+ end
613
+
614
+ # Extract reverse neighbor identifiers, optionally filtered by via type.
615
+ # Reverse edges are stored as bare identifier arrays. When via filtering
616
+ # is requested, checks each dependent's pre-normalized forward edges to
617
+ # find those pointing at +identifier+ with a matching via type.
618
+ def resolve_reverse_neighbors(graph_data, normalized_edges, identifier, via_set)
619
+ dependents = (graph_data['reverse'] || {})[identifier] || []
620
+ return dependents unless via_set
621
+
622
+ dependents.select do |dep|
623
+ forward = normalized_edges[dep] || []
624
+ forward.any? { |e| e['target'] == identifier && via_set.include?(e['via']) }
625
+ end
626
+ end
392
627
  end
393
628
  end
394
629
  end
@@ -0,0 +1,132 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ require_relative '../util/host_guard'
6
+
7
+ module Woods
8
+ module MCP
9
+ # Rack middleware that rejects browser-origin requests from unexpected sources.
10
+ #
11
+ # Defends against DNS rebinding and cross-site request forgery against a
12
+ # locally-bound MCP HTTP server. Defaults to loopback-only origins; operators
13
+ # can widen via WOODS_MCP_HTTP_ALLOWED_ORIGINS (comma-separated) or by passing
14
+ # :allowed_origins. Requests without an Origin header (curl, server-to-server,
15
+ # MCP stdio clients) are allowed through — bearer auth still gates them.
16
+ #
17
+ # Host header validation defends against the residual DNS-rebinding surface:
18
+ # an attacker who controls a hostname they can point at the server's IP can
19
+ # pass the Origin check (the browser sends their origin, which we might
20
+ # allow-list for some deployments) while Host carries their hostname. By
21
+ # also requiring Host to appear in the allow-list (or to be a loopback
22
+ # address), we close that gap even when Rails is bound to 0.0.0.0.
23
+ #
24
+ # Port-matching: an allow-list entry WITHOUT a port (`http://localhost`)
25
+ # matches that host on any port. An entry WITH a port (`http://localhost:3000`)
26
+ # requires an exact port match. Specify explicit ports when port isolation
27
+ # matters.
28
+ #
29
+ # Also answers CORS preflight (OPTIONS) with the matching allow-list.
30
+ class OriginGuard
31
+ DEFAULT_ALLOWED = %w[
32
+ http://localhost http://127.0.0.1 http://[::1]
33
+ https://localhost https://127.0.0.1 https://[::1]
34
+ ].freeze
35
+
36
+ # Hosts that always pass the Host-header check even without an explicit
37
+ # allow-list entry — they resolve to loopback by definition and cannot
38
+ # be rebound to an attacker-controlled address.
39
+ LOOPBACK_HOSTS = %w[localhost 127.0.0.1 ::1 [::1]].freeze
40
+
41
+ ALLOWED_METHODS = 'GET, POST, DELETE, OPTIONS'
42
+ ALLOWED_HEADERS = 'Authorization, Content-Type, Mcp-Session-Id'
43
+
44
+ # Response bodies are emitted as constants so the rejected Origin /
45
+ # Host value is NEVER echoed back to the caller — preventing a
46
+ # stored-XSS / log-injection surface where an attacker-supplied
47
+ # header ended up embedded in the JSON error.
48
+ FORBIDDEN_BODY = { jsonrpc: '2.0', error: { code: -32_002, message: 'Origin not allowed' }, id: nil }.to_json.freeze
49
+ FORBIDDEN_HOST_BODY = { jsonrpc: '2.0', error: { code: -32_002, message: 'Host not allowed' }, id: nil }.to_json.freeze
50
+
51
+ def initialize(app, allowed_origins: nil)
52
+ @app = app
53
+ @allowed = Array(allowed_origins).compact.reject { |o| o.to_s.strip.empty? }.map { |o| normalize(o) }
54
+ @allowed = DEFAULT_ALLOWED.dup if @allowed.empty?
55
+ @allowed_hosts = @allowed.map { |o| extract_host(o) }.compact.uniq
56
+ end
57
+
58
+ def call(env)
59
+ origin = env['HTTP_ORIGIN']
60
+ method = env['REQUEST_METHOD']
61
+ host = env['HTTP_HOST']
62
+
63
+ return forbidden if origin && !origin_allowed?(origin)
64
+ return forbidden_host if host && !host_allowed?(host)
65
+
66
+ return preflight(origin) if method == 'OPTIONS'
67
+
68
+ status, headers, body = @app.call(env)
69
+ headers = cors_headers(origin).merge(headers) if origin && origin_allowed?(origin)
70
+ [status, headers, body]
71
+ end
72
+
73
+ private
74
+
75
+ def normalize(origin)
76
+ origin.to_s.sub(%r{/\z}, '').downcase
77
+ end
78
+
79
+ def extract_host(origin)
80
+ host = origin.to_s.sub(%r{\Ahttps?://}, '').sub(%r{/.*\z}, '').downcase
81
+ host.empty? ? nil : host
82
+ end
83
+
84
+ def host_allowed?(host)
85
+ # Canonicalize (strip port, trailing dot, IPv6 brackets) via the
86
+ # shared helper so Qdrant and OriginGuard stay in sync on bypass
87
+ # notations. `normalized` keeps the port for literal allow-list
88
+ # lookups; `bare` drops it for loopback matching.
89
+ normalized = host.to_s.downcase.sub(/\.\z/, '')
90
+ bare = Util::HostGuard.canonicalize(host)
91
+
92
+ # Reject non-canonical numeric hosts. Net::HTTP / getaddrinfo
93
+ # would happily resolve `0x7f000001` or `2130706433` to 127.0.0.1,
94
+ # bypassing the loopback allow-list.
95
+ return false if Util::HostGuard.suspicious_numeric_host?(bare)
96
+
97
+ return true if LOOPBACK_HOSTS.include?(bare)
98
+
99
+ @allowed_hosts.include?(normalized) || @allowed_hosts.include?(bare)
100
+ end
101
+
102
+ def origin_allowed?(origin)
103
+ return false if origin.match?(/[[:cntrl:]]/)
104
+
105
+ @allowed.include?(normalize(origin)) || @allowed.include?(normalize(origin).sub(/:\d+\z/, ''))
106
+ end
107
+
108
+ def preflight(origin)
109
+ headers = origin && origin_allowed?(origin) ? cors_headers(origin) : {}
110
+ [204, headers, []]
111
+ end
112
+
113
+ def cors_headers(origin)
114
+ {
115
+ 'access-control-allow-origin' => origin,
116
+ 'access-control-allow-methods' => ALLOWED_METHODS,
117
+ 'access-control-allow-headers' => ALLOWED_HEADERS,
118
+ 'access-control-expose-headers' => 'Mcp-Session-Id',
119
+ 'vary' => 'Origin'
120
+ }
121
+ end
122
+
123
+ def forbidden
124
+ [403, { 'content-type' => 'application/json' }, [FORBIDDEN_BODY]]
125
+ end
126
+
127
+ def forbidden_host
128
+ [403, { 'content-type' => 'application/json' }, [FORBIDDEN_HOST_BODY]]
129
+ end
130
+ end
131
+ end
132
+ end