woods 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +169 -0
- data/README.md +20 -8
- data/exe/woods-console +51 -6
- data/exe/woods-console-mcp +24 -4
- data/exe/woods-mcp +30 -7
- data/exe/woods-mcp-http +47 -6
- data/lib/generators/woods/install_generator.rb +13 -4
- data/lib/generators/woods/templates/woods.rb.tt +155 -0
- data/lib/tasks/woods.rake +15 -50
- data/lib/woods/builder.rb +174 -9
- data/lib/woods/cache/cache_middleware.rb +360 -31
- data/lib/woods/chunking/semantic_chunker.rb +334 -7
- data/lib/woods/console/adapters/job_adapter.rb +10 -4
- data/lib/woods/console/audit_logger.rb +76 -4
- data/lib/woods/console/bridge.rb +48 -15
- data/lib/woods/console/bridge_protocol.rb +44 -0
- data/lib/woods/console/confirmation.rb +3 -4
- data/lib/woods/console/console_response_renderer.rb +56 -18
- data/lib/woods/console/credential_index.rb +201 -0
- data/lib/woods/console/credential_scanner.rb +302 -0
- data/lib/woods/console/dispatch_pipeline.rb +138 -0
- data/lib/woods/console/embedded_executor.rb +682 -35
- data/lib/woods/console/eval_guard.rb +319 -0
- data/lib/woods/console/model_validator.rb +1 -3
- data/lib/woods/console/rack_middleware.rb +185 -29
- data/lib/woods/console/redactor.rb +161 -0
- data/lib/woods/console/response_context.rb +127 -0
- data/lib/woods/console/safe_context.rb +220 -23
- data/lib/woods/console/scope_predicate_parser.rb +131 -0
- data/lib/woods/console/server.rb +417 -486
- data/lib/woods/console/sql_noise_stripper.rb +87 -0
- data/lib/woods/console/sql_table_scanner.rb +213 -0
- data/lib/woods/console/sql_validator.rb +81 -31
- data/lib/woods/console/table_gate.rb +93 -0
- data/lib/woods/console/tool_specs.rb +552 -0
- data/lib/woods/console/tools/tier1.rb +3 -3
- data/lib/woods/console/tools/tier4.rb +7 -1
- data/lib/woods/dependency_graph.rb +66 -7
- data/lib/woods/embedding/indexer.rb +190 -6
- data/lib/woods/embedding/openai.rb +40 -4
- data/lib/woods/embedding/provider.rb +104 -8
- data/lib/woods/embedding/text_preparer.rb +23 -3
- data/lib/woods/embedding/token_counter.rb +133 -0
- data/lib/woods/evaluation/baseline_runner.rb +20 -2
- data/lib/woods/evaluation/metrics.rb +4 -1
- data/lib/woods/extracted_unit.rb +1 -0
- data/lib/woods/extractor.rb +7 -1
- data/lib/woods/extractors/controller_extractor.rb +6 -0
- data/lib/woods/extractors/mailer_extractor.rb +16 -2
- data/lib/woods/extractors/model_extractor.rb +6 -1
- data/lib/woods/extractors/phlex_extractor.rb +13 -4
- data/lib/woods/extractors/rails_source_extractor.rb +2 -0
- data/lib/woods/extractors/route_helper_resolver.rb +130 -0
- data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
- data/lib/woods/extractors/view_component_extractor.rb +12 -1
- data/lib/woods/extractors/view_engines/base.rb +141 -0
- data/lib/woods/extractors/view_engines/erb.rb +145 -0
- data/lib/woods/extractors/view_template_extractor.rb +92 -133
- data/lib/woods/flow_assembler.rb +23 -15
- data/lib/woods/flow_precomputer.rb +21 -2
- data/lib/woods/graph_analyzer.rb +3 -4
- data/lib/woods/index_artifact.rb +173 -0
- data/lib/woods/mcp/bearer_auth.rb +45 -0
- data/lib/woods/mcp/bootstrap_state.rb +94 -0
- data/lib/woods/mcp/bootstrapper.rb +337 -16
- data/lib/woods/mcp/config_resolver.rb +288 -0
- data/lib/woods/mcp/errors.rb +134 -0
- data/lib/woods/mcp/index_reader.rb +265 -30
- data/lib/woods/mcp/origin_guard.rb +132 -0
- data/lib/woods/mcp/provider_probe.rb +166 -0
- data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
- data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
- data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
- data/lib/woods/mcp/server.rb +737 -137
- data/lib/woods/model_name_cache.rb +78 -2
- data/lib/woods/notion/client.rb +25 -2
- data/lib/woods/notion/mappers/model_mapper.rb +36 -2
- data/lib/woods/railtie.rb +55 -15
- data/lib/woods/resilience/circuit_breaker.rb +9 -2
- data/lib/woods/resilience/retryable_provider.rb +40 -3
- data/lib/woods/resolved_config.rb +299 -0
- data/lib/woods/retrieval/context_assembler.rb +112 -5
- data/lib/woods/retrieval/query_classifier.rb +1 -1
- data/lib/woods/retrieval/ranker.rb +55 -6
- data/lib/woods/retrieval/search_executor.rb +42 -13
- data/lib/woods/retriever.rb +330 -24
- data/lib/woods/session_tracer/middleware.rb +35 -1
- data/lib/woods/storage/graph_store.rb +39 -0
- data/lib/woods/storage/inapplicable_backend.rb +14 -0
- data/lib/woods/storage/metadata_store.rb +129 -1
- data/lib/woods/storage/pgvector.rb +70 -8
- data/lib/woods/storage/qdrant.rb +196 -5
- data/lib/woods/storage/snapshotter/metadata.rb +172 -0
- data/lib/woods/storage/snapshotter/vector.rb +238 -0
- data/lib/woods/storage/snapshotter.rb +24 -0
- data/lib/woods/storage/vector_store.rb +184 -35
- data/lib/woods/tasks.rb +85 -0
- data/lib/woods/temporal/snapshot_store.rb +49 -1
- data/lib/woods/token_utils.rb +44 -5
- data/lib/woods/unblocked/client.rb +1 -1
- data/lib/woods/unblocked/document_builder.rb +35 -10
- data/lib/woods/unblocked/exporter.rb +1 -1
- data/lib/woods/util/host_guard.rb +61 -0
- data/lib/woods/version.rb +1 -1
- data/lib/woods.rb +126 -6
- metadata +69 -4
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'active_support/core_ext/object/blank'
|
|
3
4
|
require 'active_support/core_ext/string/inflections'
|
|
4
5
|
require 'digest'
|
|
5
6
|
require 'json'
|
|
@@ -52,6 +53,34 @@ module Woods
|
|
|
52
53
|
@identifier_map = nil
|
|
53
54
|
end
|
|
54
55
|
|
|
56
|
+
# Pre-populate cached state so the first MCP tool call doesn't pay
|
|
57
|
+
# for disk reads + JSON parsing.
|
|
58
|
+
#
|
|
59
|
+
# Touches every lazy accessor: manifest, summary, dependency_graph,
|
|
60
|
+
# graph_analysis, and the identifier_map (which reads all _index.json
|
|
61
|
+
# files). Each step is individually rescued so a missing optional
|
|
62
|
+
# artefact (e.g. graph_analysis.json) never blocks the rest.
|
|
63
|
+
#
|
|
64
|
+
# Safe to call multiple times — lazy accessors short-circuit on the
|
|
65
|
+
# memoized value.
|
|
66
|
+
#
|
|
67
|
+
# @return [Hash] Per-step outcome: `{step => true | Exception}`
|
|
68
|
+
def warmup!
  # Fixed order; each name is one of this object's lazily-memoized readers.
  accessors = %i[manifest summary dependency_graph graph_analysis identifier_map]

  accessors.to_h do |accessor|
    outcome =
      begin
        __send__(accessor)
        true
      rescue StandardError => e
        # Record the failure for this step and keep going with the others.
        e
      end

    [accessor, outcome]
  end
end
|
|
83
|
+
|
|
55
84
|
# Clear all cached state so the next access re-reads from disk.
|
|
56
85
|
#
|
|
57
86
|
# @return [void]
|
|
@@ -65,6 +94,7 @@ module Woods
|
|
|
65
94
|
@dependency_graph = nil
|
|
66
95
|
@graph_analysis = nil
|
|
67
96
|
@raw_graph_data = nil
|
|
97
|
+
@normalized_graph_edges = nil
|
|
68
98
|
end
|
|
69
99
|
|
|
70
100
|
# @return [Hash] Parsed manifest.json
|
|
@@ -72,6 +102,17 @@ module Woods
|
|
|
72
102
|
@manifest ||= parse_json('manifest.json')
|
|
73
103
|
end
|
|
74
104
|
|
|
105
|
+
# Template engines the extraction pipeline currently understands.
|
|
106
|
+
# Delegates to {ViewTemplateExtractor.supported_template_engines} so
|
|
107
|
+
# the list stays honest as engines are added or removed. Surfaced by
|
|
108
|
+
# the MCP `structure` tool (#86).
|
|
109
|
+
#
|
|
110
|
+
# @return [Array<Symbol>]
|
|
111
|
+
def template_engines
  # Required at call time rather than at file load.
  require_relative '../extractors/view_template_extractor'

  engines = Woods::Extractors::ViewTemplateExtractor.supported_template_engines
  # Hand back a copy, not the extractor's own object.
  engines.dup
end
|
|
115
|
+
|
|
75
116
|
# @return [String, nil] SUMMARY.md content, or nil if not present
|
|
76
117
|
def summary
|
|
77
118
|
@summary ||= begin
|
|
@@ -119,19 +160,57 @@ module Woods
|
|
|
119
160
|
dirs.flat_map { |dir| read_index(dir) }
|
|
120
161
|
end
|
|
121
162
|
|
|
163
|
+
# Default maximum number of unit files to load during phase-2 search.
|
|
164
|
+
# Override with WOODS_SEARCH_MAX_SCAN env var.
|
|
165
|
+
DEFAULT_SEARCH_MAX_SCAN = 500
|
|
166
|
+
|
|
122
167
|
# Search units by case-insensitive pattern.
|
|
123
168
|
#
|
|
124
169
|
# Phase 1: match identifiers from index files (cheap).
|
|
125
170
|
# Phase 2: lazy-load unit files for metadata/source_code matching.
|
|
126
171
|
#
|
|
127
|
-
#
|
|
172
|
+
# The query is compiled as a raw Ruby regex with IGNORECASE. If the pattern
|
|
173
|
+
# is invalid, it falls back to an escaped literal match.
|
|
174
|
+
#
|
|
175
|
+
# A "broad" pattern is one that matches more than 50% of the entries in a
|
|
176
|
+
# type directory. Broad patterns still run but the result includes a :note.
|
|
177
|
+
#
|
|
178
|
+
# Phase-2 scan is capped at WOODS_SEARCH_MAX_SCAN unit files (default 500).
|
|
179
|
+
# When the cap is reached the result includes :partial => true.
|
|
180
|
+
#
|
|
181
|
+
# The optional +exact_prefix+ / +exact_suffix+ filters restrict results to
|
|
182
|
+
# identifiers whose start/end matches the given string literally (case-
|
|
183
|
+
# insensitive). They are ANDed with the +query+ regex and are safer than
|
|
184
|
+
# hand-escaping regex anchors — metacharacters like +::+ are treated as
|
|
185
|
+
# literal text.
|
|
186
|
+
#
|
|
187
|
+
# @param query [String, nil] Search pattern (case-insensitive regex). Optional when
|
|
188
|
+
# +exact_prefix+ or +exact_suffix+ is provided; otherwise required.
|
|
128
189
|
# @param types [Array<String>, nil] Filter to these singular type names
|
|
129
190
|
# @param fields [Array<String>] Fields to search: "identifier", "metadata", "source_code"
|
|
130
191
|
# @param limit [Integer] Maximum results to return
|
|
131
|
-
# @
|
|
132
|
-
|
|
133
|
-
|
|
192
|
+
# @param exact_prefix [String, nil] Literal identifier prefix filter (case-insensitive)
|
|
193
|
+
# @param exact_suffix [String, nil] Literal identifier suffix filter (case-insensitive)
|
|
194
|
+
# @return [Hash] { results: Array<Hash>, note: String|nil, partial: Boolean }
|
|
195
|
+
# @raise [ArgumentError] when all of query, exact_prefix, and exact_suffix are blank
|
|
196
|
+
def search(query = nil, types: nil, fields: %w[identifier], limit: 20, exact_prefix: nil, exact_suffix: nil)
|
|
197
|
+
prefix = exact_prefix.blank? ? nil : exact_prefix.downcase
|
|
198
|
+
suffix = exact_suffix.blank? ? nil : exact_suffix.downcase
|
|
199
|
+
if query.blank? && !prefix && !suffix
|
|
200
|
+
raise ArgumentError, 'search requires a query or exact_prefix/exact_suffix filter'
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
# When only prefix/suffix are provided, the regex acts as a match-all
|
|
204
|
+
# wildcard so the existing phase-1/phase-2 pipeline still works.
|
|
205
|
+
pattern = compile_search_pattern(query.to_s.empty? ? '.*' : query)
|
|
206
|
+
max_scan_env = ENV.fetch('WOODS_SEARCH_MAX_SCAN', '').to_s.strip
|
|
207
|
+
max_scan = max_scan_env.empty? ? DEFAULT_SEARCH_MAX_SCAN : max_scan_env.to_i
|
|
208
|
+
max_scan = DEFAULT_SEARCH_MAX_SCAN if max_scan <= 0
|
|
209
|
+
|
|
134
210
|
results = []
|
|
211
|
+
notes = []
|
|
212
|
+
phase2_scanned = 0
|
|
213
|
+
partial = false
|
|
135
214
|
|
|
136
215
|
dirs = if types
|
|
137
216
|
types.filter_map { |t| TYPE_TO_DIR[t] }
|
|
@@ -139,36 +218,85 @@ module Woods
|
|
|
139
218
|
TYPE_DIRS
|
|
140
219
|
end
|
|
141
220
|
|
|
221
|
+
# Phase 2 candidates are collected per-dir and then scanned in
|
|
222
|
+
# round-robin across dirs. Exhausting the per-run scan cap linearly
|
|
223
|
+
# down TYPE_DIRS order would starve later types (`concerns` at pos
|
|
224
|
+
# 13, `test_mappings` at pos 31) on any codebase where the earlier
|
|
225
|
+
# dirs together exceed max_scan entries. Interleaving guarantees
|
|
226
|
+
# every type contributes to the scanned set.
|
|
227
|
+
phase2_queues = {}
|
|
228
|
+
|
|
142
229
|
dirs.each do |dir|
|
|
143
230
|
type_name = DIR_TO_TYPE[dir]
|
|
144
231
|
entries = read_index(dir)
|
|
145
232
|
|
|
146
|
-
|
|
147
|
-
|
|
233
|
+
# Broad-match detection: warn when pattern matches >50% of dir entries
|
|
234
|
+
if entries.size > 1
|
|
235
|
+
matching_count = entries.count do |e|
|
|
236
|
+
identifier_passes_filters?(e['identifier'], pattern, prefix, suffix)
|
|
237
|
+
end
|
|
238
|
+
if matching_count > entries.size / 2.0
|
|
239
|
+
notes << "broad pattern matched #{matching_count}/#{entries.size} entries in #{dir}"
|
|
240
|
+
end
|
|
241
|
+
end
|
|
148
242
|
|
|
243
|
+
entries.each do |entry|
|
|
149
244
|
id = entry['identifier']
|
|
245
|
+
next unless identifier_passes_prefix_suffix?(id, prefix, suffix)
|
|
150
246
|
|
|
151
|
-
# Phase 1: identifier matching
|
|
247
|
+
# Phase 1: identifier matching (still in-order per dir)
|
|
152
248
|
if fields.include?('identifier') && pattern.match?(id)
|
|
249
|
+
next if results.size >= limit
|
|
250
|
+
|
|
153
251
|
results << { identifier: id, type: type_name, match_field: 'identifier' }
|
|
154
252
|
next
|
|
155
253
|
end
|
|
156
254
|
|
|
157
|
-
# Phase 2
|
|
255
|
+
# Phase 2 is only reached when the caller opted into deeper fields.
|
|
158
256
|
next unless fields.include?('metadata') || fields.include?('source_code')
|
|
159
257
|
|
|
160
|
-
|
|
161
|
-
|
|
258
|
+
(phase2_queues[dir] ||= []) << [type_name, id]
|
|
259
|
+
end
|
|
260
|
+
end
|
|
261
|
+
|
|
262
|
+
if results.size < limit && phase2_queues.any?
|
|
263
|
+
queues = phase2_queues.values.map(&:dup)
|
|
264
|
+
catch(:phase2_done) do
|
|
265
|
+
loop do
|
|
266
|
+
progressed = false
|
|
267
|
+
queues.each do |queue|
|
|
268
|
+
next if queue.empty?
|
|
269
|
+
|
|
270
|
+
throw :phase2_done if results.size >= limit
|
|
271
|
+
|
|
272
|
+
if phase2_scanned >= max_scan
|
|
273
|
+
partial = true
|
|
274
|
+
throw :phase2_done
|
|
275
|
+
end
|
|
276
|
+
|
|
277
|
+
type_name, id = queue.shift
|
|
278
|
+
progressed = true
|
|
162
279
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
280
|
+
unit = find_unit(id)
|
|
281
|
+
next unless unit
|
|
282
|
+
|
|
283
|
+
phase2_scanned += 1
|
|
284
|
+
|
|
285
|
+
if fields.include?('source_code') && unit['source_code'] && pattern.match?(unit['source_code'])
|
|
286
|
+
results << { identifier: id, type: type_name, match_field: 'source_code' }
|
|
287
|
+
elsif fields.include?('metadata') && unit['metadata'] && pattern.match?(unit['metadata'].to_json)
|
|
288
|
+
results << { identifier: id, type: type_name, match_field: 'metadata' }
|
|
289
|
+
end
|
|
290
|
+
end
|
|
291
|
+
break unless progressed
|
|
167
292
|
end
|
|
168
293
|
end
|
|
169
294
|
end
|
|
170
295
|
|
|
171
|
-
results.first(limit)
|
|
296
|
+
response = { results: results.first(limit) }
|
|
297
|
+
response[:note] = notes.join('; ') unless notes.empty?
|
|
298
|
+
response[:partial] = true if partial
|
|
299
|
+
response
|
|
172
300
|
end
|
|
173
301
|
|
|
174
302
|
# BFS traversal of forward dependencies.
|
|
@@ -176,9 +304,10 @@ module Woods
|
|
|
176
304
|
# @param identifier [String] Starting unit identifier
|
|
177
305
|
# @param depth [Integer] Maximum traversal depth
|
|
178
306
|
# @param types [Array<String>, nil] Filter to these singular type names
|
|
307
|
+
# @param via [Array<String>, nil] Filter to these relationship types (e.g. ["link_to", "redirect_to"])
|
|
179
308
|
# @return [Hash] { root:, nodes: { id => { type:, depth:, deps: [] } } }
|
|
180
|
-
def traverse_dependencies(identifier, depth: 2, types: nil)
|
|
181
|
-
traverse(identifier, depth: depth, types: types, direction: :forward)
|
|
309
|
+
def traverse_dependencies(identifier, depth: 2, types: nil, via: nil)
  # Forward direction: walk the edges leaving +identifier+.
  options = { depth: depth, types: types, via: via, direction: :forward }
  traverse(identifier, **options)
end
|
|
183
312
|
|
|
184
313
|
# BFS traversal of reverse dependencies (dependents).
|
|
@@ -186,9 +315,10 @@ module Woods
|
|
|
186
315
|
# @param identifier [String] Starting unit identifier
|
|
187
316
|
# @param depth [Integer] Maximum traversal depth
|
|
188
317
|
# @param types [Array<String>, nil] Filter to these singular type names
|
|
318
|
+
# @param via [Array<String>, nil] Filter to these relationship types (e.g. ["link_to", "redirect_to"])
|
|
189
319
|
# @return [Hash] { root:, nodes: { id => { type:, depth:, deps: [] } } }
|
|
190
|
-
def traverse_dependents(identifier, depth: 2, types: nil)
|
|
191
|
-
traverse(identifier, depth: depth, types: types, direction: :reverse)
|
|
320
|
+
def traverse_dependents(identifier, depth: 2, types: nil, via: nil)
  # Reverse direction: walk towards the units that depend on +identifier+.
  options = { depth: depth, types: types, via: via, direction: :reverse }
  traverse(identifier, **options)
end
|
|
193
323
|
|
|
194
324
|
# Search rails_source units by concept keyword.
|
|
@@ -200,7 +330,12 @@ module Woods
|
|
|
200
330
|
# @param limit [Integer] Maximum results to return
|
|
201
331
|
# @return [Array<Hash>] Matching rails_source unit summaries
|
|
202
332
|
def framework_sources(keyword, limit: 20)
|
|
203
|
-
|
|
333
|
+
# Multi-word keywords ("ActiveRecord callbacks") are split on
|
|
334
|
+
# whitespace and ANDed. Single-word queries behave as before.
|
|
335
|
+
tokens = keyword.to_s.strip.split(/\s+/)
|
|
336
|
+
return [] if tokens.empty?
|
|
337
|
+
|
|
338
|
+
patterns = tokens.map { |t| Regexp.new(Regexp.escape(t), Regexp::IGNORECASE) }
|
|
204
339
|
results = []
|
|
205
340
|
|
|
206
341
|
entries = read_index('rails_source')
|
|
@@ -211,9 +346,12 @@ module Woods
|
|
|
211
346
|
unit = find_unit(id)
|
|
212
347
|
next unless unit
|
|
213
348
|
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
349
|
+
metadata_json = unit['metadata']&.to_json
|
|
350
|
+
matched = patterns.all? do |pat|
|
|
351
|
+
pat.match?(id) ||
|
|
352
|
+
(unit['source_code'] && pat.match?(unit['source_code'])) ||
|
|
353
|
+
(metadata_json && pat.match?(metadata_json))
|
|
354
|
+
end
|
|
217
355
|
|
|
218
356
|
next unless matched
|
|
219
357
|
|
|
@@ -259,7 +397,8 @@ module Woods
|
|
|
259
397
|
identifier: id,
|
|
260
398
|
type: DIR_TO_TYPE[dir],
|
|
261
399
|
file_path: unit['file_path'],
|
|
262
|
-
last_modified: last_modified
|
|
400
|
+
last_modified: last_modified,
|
|
401
|
+
author: unit.dig('metadata', 'git', 'last_author')
|
|
263
402
|
}
|
|
264
403
|
end
|
|
265
404
|
end
|
|
@@ -277,6 +416,46 @@ module Woods
|
|
|
277
416
|
|
|
278
417
|
private
|
|
279
418
|
|
|
419
|
+
# Compile a case-insensitive regex from a query string.
|
|
420
|
+
#
|
|
421
|
+
# Treats the query as a raw Ruby regex pattern. Falls back to an escaped
|
|
422
|
+
# literal match (with a :note field added by callers) when the pattern is
|
|
423
|
+
# invalid.
|
|
424
|
+
#
|
|
425
|
+
# @param query [String] Raw regex pattern
|
|
426
|
+
# @return [Regexp] Compiled case-insensitive pattern
|
|
427
|
+
def compile_search_pattern(query)
  flags = Regexp::IGNORECASE

  begin
    Regexp.new(query, flags)
  rescue RegexpError
    # Not valid regex syntax: match the text literally instead.
    Regexp.new(Regexp.escape(query), flags)
  end
end
|
|
432
|
+
|
|
433
|
+
# Case-insensitive literal prefix/suffix check on an identifier.
|
|
434
|
+
# Nil filters are treated as "no restriction".
|
|
435
|
+
def identifier_passes_prefix_suffix?(identifier, prefix, suffix)
  return false unless identifier

  lowered = identifier.downcase
  # A missing filter imposes no restriction; the suffix is only examined
  # once the prefix has passed.
  (!prefix || lowered.start_with?(prefix)) && (!suffix || lowered.end_with?(suffix))
end
|
|
444
|
+
|
|
445
|
+
# Combined regex + prefix/suffix check used only by broad-match detection,
|
|
446
|
+
# which reports how many identifiers would actually surface.
|
|
447
|
+
def identifier_passes_filters?(identifier, pattern, prefix, suffix)
  # The regex is only consulted once the literal prefix/suffix check passes.
  identifier_passes_prefix_suffix?(identifier, prefix, suffix) && pattern.match?(identifier)
end
|
|
452
|
+
|
|
453
|
+
# Memoized normalized edges — converts bare strings (old format) to hashes once.
|
|
454
|
+
# Cleared by reload! alongside raw_graph_data.
|
|
455
|
+
def normalized_graph_edges
  return @normalized_graph_edges if @normalized_graph_edges

  # First access (or first since the ivar was reset): normalize and remember.
  raw_edges = raw_graph_data['edges'] || {}
  @normalized_graph_edges = normalize_all_edges(raw_edges)
end
|
|
458
|
+
|
|
280
459
|
# Build identifier → { type_dir, filename } map from all _index.json files.
|
|
281
460
|
def identifier_map
|
|
282
461
|
@identifier_map ||= build_identifier_map
|
|
@@ -340,13 +519,28 @@ module Woods
|
|
|
340
519
|
end
|
|
341
520
|
|
|
342
521
|
# BFS traversal in either direction.
|
|
343
|
-
|
|
522
|
+
#
|
|
523
|
+
# Edges may be stored as bare strings (old format) or as
|
|
524
|
+
# +{"target" => "...", "via" => "..."}+ hashes (new format).
|
|
525
|
+
# This method handles both transparently.
|
|
526
|
+
#
|
|
527
|
+
# @param identifier [String] Starting unit identifier
|
|
528
|
+
# @param depth [Integer] Maximum traversal depth
|
|
529
|
+
# @param types [Array<String>, nil] Filter to these unit type names
|
|
530
|
+
# @param via [Array<String>, nil] Filter to these relationship types
|
|
531
|
+
# @param direction [:forward, :reverse] Traversal direction
|
|
532
|
+
# @return [Hash]
|
|
533
|
+
def traverse(identifier, depth:, types:, via:, direction:)
|
|
344
534
|
graph_data = raw_graph_data
|
|
345
535
|
nodes_data = graph_data['nodes'] || {}
|
|
346
536
|
|
|
347
537
|
return { root: identifier, found: false, nodes: {} } unless nodes_data.key?(identifier)
|
|
348
538
|
|
|
539
|
+
# Normalize edges once per graph load — memoized alongside raw_graph_data
|
|
540
|
+
normalized_edges = normalized_graph_edges
|
|
541
|
+
|
|
349
542
|
type_set = types&.to_set
|
|
543
|
+
via_set = via&.to_set
|
|
350
544
|
visited = Set.new([identifier])
|
|
351
545
|
queue = [[identifier, 0]]
|
|
352
546
|
result_nodes = {}
|
|
@@ -355,12 +549,12 @@ module Woods
|
|
|
355
549
|
current, current_depth = queue.shift
|
|
356
550
|
|
|
357
551
|
neighbors = if direction == :forward
|
|
358
|
-
(
|
|
552
|
+
resolve_forward_neighbors(normalized_edges, current, via_set)
|
|
359
553
|
else
|
|
360
|
-
(graph_data
|
|
554
|
+
resolve_reverse_neighbors(graph_data, normalized_edges, current, via_set)
|
|
361
555
|
end
|
|
362
556
|
|
|
363
|
-
# Filter by type if requested
|
|
557
|
+
# Filter by node type if requested
|
|
364
558
|
filtered = if type_set
|
|
365
559
|
neighbors.select do |n|
|
|
366
560
|
node_meta = nodes_data[n]
|
|
@@ -370,14 +564,18 @@ module Woods
|
|
|
370
564
|
neighbors
|
|
371
565
|
end
|
|
372
566
|
|
|
567
|
+
# At max depth, record the node with empty deps so the renderer
|
|
568
|
+
# doesn't emit an extra level of unexpanded neighbors. The parent
|
|
569
|
+
# node's deps list already shows this node as a child.
|
|
570
|
+
will_expand = current_depth < depth
|
|
373
571
|
node_meta = nodes_data[current]
|
|
374
572
|
result_nodes[current] = {
|
|
375
573
|
type: node_meta&.dig('type'),
|
|
376
574
|
depth: current_depth,
|
|
377
|
-
deps: filtered
|
|
575
|
+
deps: will_expand ? filtered : []
|
|
378
576
|
}
|
|
379
577
|
|
|
380
|
-
next
|
|
578
|
+
next unless will_expand
|
|
381
579
|
|
|
382
580
|
filtered.each do |neighbor|
|
|
383
581
|
unless visited.include?(neighbor)
|
|
@@ -389,6 +587,43 @@ module Woods
|
|
|
389
587
|
|
|
390
588
|
{ root: identifier, found: true, nodes: result_nodes }
|
|
391
589
|
end
|
|
590
|
+
|
|
591
|
+
# Normalize all edge arrays once, converting bare strings to hashes.
|
|
592
|
+
#
|
|
593
|
+
# NOTE: This uses string keys ('target', 'via') because IndexReader
|
|
594
|
+
# operates on parsed JSON. DependencyGraph.normalize_edges uses symbol
|
|
595
|
+
# keys (:target, :via) for in-memory Ruby objects. The two normalizers
|
|
596
|
+
# are intentionally separate — do not merge them.
|
|
597
|
+
#
|
|
598
|
+
# @param raw_edges [Hash] Raw edges from graph JSON
|
|
599
|
+
# @return [Hash] Edges with all entries as { 'target' => ..., 'via' => ... } hashes
|
|
600
|
+
def normalize_all_edges(raw_edges)
  raw_edges.each_with_object({}) do |(source, entries), normalized|
    normalized[source] = entries.map do |entry|
      # Hash entries pass through untouched; anything else is wrapped as a
      # target-only hash (no 'via' key).
      entry.is_a?(Hash) ? entry : { 'target' => entry }
    end
  end
end
|
|
605
|
+
|
|
606
|
+
# Extract forward neighbor identifiers, optionally filtered by via type.
|
|
607
|
+
# Expects pre-normalized edges (all entries are hashes).
|
|
608
|
+
def resolve_forward_neighbors(normalized_edges, identifier, via_set)
  # normalized_edges maps identifier => Array of { 'target' => ..., 'via' => ... }
  # hashes; an identifier with no entry has no forward neighbors.
  edges = normalized_edges[identifier] || []
  edges = edges.select { |edge| via_set.include?(edge['via']) } if via_set

  # One unit can reach the same target through several relationship types
  # (one edge hash per 'via'). Callers want neighbor identifiers, so collapse
  # repeats, keeping first-seen order.
  edges.map { |edge| edge['target'] }.uniq
end
|
|
613
|
+
|
|
614
|
+
# Extract reverse neighbor identifiers, optionally filtered by via type.
|
|
615
|
+
# Reverse edges are stored as bare identifier arrays. When via filtering
|
|
616
|
+
# is requested, checks each dependent's pre-normalized forward edges to
|
|
617
|
+
# find those pointing at +identifier+ with a matching via type.
|
|
618
|
+
def resolve_reverse_neighbors(graph_data, normalized_edges, identifier, via_set)
  reverse_index = graph_data['reverse'] || {}
  candidates = reverse_index[identifier] || []
  return candidates unless via_set

  # The reverse index carries no relationship type, so consult each
  # candidate's forward edges for one that points back here with a wanted via.
  candidates.select do |dependent|
    (normalized_edges[dependent] || []).any? do |edge|
      edge['target'] == identifier && via_set.include?(edge['via'])
    end
  end
end
|
|
392
627
|
end
|
|
393
628
|
end
|
|
394
629
|
end
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
require_relative '../util/host_guard'
|
|
6
|
+
|
|
7
|
+
module Woods
|
|
8
|
+
module MCP
|
|
9
|
+
# Rack middleware that rejects browser-origin requests from unexpected sources.
|
|
10
|
+
#
|
|
11
|
+
# Defends against DNS rebinding and cross-site request forgery against a
|
|
12
|
+
# locally-bound MCP HTTP server. Defaults to loopback-only origins; operators
|
|
13
|
+
# can widen via WOODS_MCP_HTTP_ALLOWED_ORIGINS (comma-separated) or by passing
|
|
14
|
+
# :allowed_origins. Requests without an Origin header (curl, server-to-server,
|
|
15
|
+
# MCP stdio clients) are allowed through — bearer auth still gates them.
|
|
16
|
+
#
|
|
17
|
+
# Host header validation defends against the residual DNS-rebinding surface:
|
|
18
|
+
# an attacker who controls a hostname they can point at the server's IP can
|
|
19
|
+
# pass the Origin check (the browser sends their origin, which we might
|
|
20
|
+
# allow-list for some deployments) while Host carries their hostname. By
|
|
21
|
+
# also requiring Host to appear in the allow-list (or to be a loopback
|
|
22
|
+
# address), we close that gap even when Rails is bound to 0.0.0.0.
|
|
23
|
+
#
|
|
24
|
+
# Port-matching: an allow-list entry WITHOUT a port (`http://localhost`)
|
|
25
|
+
# matches that host on any port. An entry WITH a port (`http://localhost:3000`)
|
|
26
|
+
# requires an exact port match. Specify explicit ports when port isolation
|
|
27
|
+
# matters.
|
|
28
|
+
#
|
|
29
|
+
# Also answers CORS preflight (OPTIONS) with the matching allow-list.
|
|
30
|
+
class OriginGuard
  # Origins accepted when no explicit allow-list is supplied.
  DEFAULT_ALLOWED = %w[
    http://localhost http://127.0.0.1 http://[::1]
    https://localhost https://127.0.0.1 https://[::1]
  ].freeze

  # Hosts that always pass the Host-header check, even without an explicit
  # allow-list entry.
  LOOPBACK_HOSTS = %w[localhost 127.0.0.1 ::1 [::1]].freeze

  ALLOWED_METHODS = 'GET, POST, DELETE, OPTIONS'
  ALLOWED_HEADERS = 'Authorization, Content-Type, Mcp-Session-Id'

  # Rejection bodies are constants so the offending Origin / Host value is
  # never echoed back to the caller.
  FORBIDDEN_BODY = { jsonrpc: '2.0', error: { code: -32_002, message: 'Origin not allowed' }, id: nil }.to_json.freeze
  FORBIDDEN_HOST_BODY = { jsonrpc: '2.0', error: { code: -32_002, message: 'Host not allowed' }, id: nil }.to_json.freeze

  # @param app [#call] Downstream Rack application
  # @param allowed_origins [Array<String>, String, nil] Origins to accept.
  #   Blank entries are ignored; when nothing usable remains the loopback
  #   defaults in DEFAULT_ALLOWED apply.
  def initialize(app, allowed_origins: nil)
    @app = app
    # Entries are trimmed before normalizing: a list produced by splitting
    # "a, b" on commas carries a leading space that would otherwise make the
    # entry unmatchable for both the Origin and the Host check.
    @allowed = Array(allowed_origins)
               .compact
               .reject { |entry| entry.to_s.strip.empty? }
               .map { |entry| normalize(entry.to_s.strip) }
    @allowed = DEFAULT_ALLOWED.dup if @allowed.empty?
    @allowed_hosts = @allowed.map { |entry| extract_host(entry) }.compact.uniq
  end

  # Rack entry point.
  #
  # Order of checks: a present-but-unlisted Origin is rejected first, then a
  # present-but-unlisted Host. OPTIONS requests are answered here and never
  # reach the app. Requests carrying neither header are passed through.
  #
  # @param env [Hash] Rack environment
  # @return [Array(Integer, Hash, #each)] Rack response triple
  def call(env)
    origin = env['HTTP_ORIGIN']
    host = env['HTTP_HOST']

    return forbidden if origin && !origin_allowed?(origin)
    return forbidden_host if host && !host_allowed?(host)

    # From here on a present Origin is known to be allow-listed.
    return preflight(origin) if env['REQUEST_METHOD'] == 'OPTIONS'

    status, headers, body = @app.call(env)
    headers = merge_cors_headers(origin, headers) if origin
    [status, headers, body]
  end

  private

  # Lower-case and drop a single trailing slash.
  def normalize(origin)
    origin.to_s.sub(%r{/\z}, '').downcase
  end

  # "scheme://host[:port][/path]" -> "host[:port]", or nil when nothing is left.
  def extract_host(origin)
    host = origin.to_s.sub(%r{\Ahttps?://}, '').sub(%r{/.*\z}, '').downcase
    host.empty? ? nil : host
  end

  def host_allowed?(host)
    # `normalized` keeps the port for lookups against port-specific entries.
    # The root-label dot is removed whether or not a port follows it, so
    # "example.com.:3000" compares equal to "example.com:3000".
    normalized = host.to_s.downcase.sub(/\.(?=(?::\d+)?\z)/, '')
    # `bare` is the port-less form produced by the shared helper.
    # NOTE(review): Util::HostGuard is defined outside this file; its exact
    # canonicalization rules are assumed here, not verified.
    bare = Util::HostGuard.canonicalize(host)

    # Refuse hosts the shared helper flags as non-canonical numerics, before
    # any allow-list comparison.
    return false if Util::HostGuard.suspicious_numeric_host?(bare)
    return true if LOOPBACK_HOSTS.include?(bare)

    @allowed_hosts.include?(normalized) || @allowed_hosts.include?(bare)
  end

  def origin_allowed?(origin)
    return false if origin.match?(/[[:cntrl:]]/)

    candidate = normalize(origin)
    # An allow-list entry without a port matches that origin on any port.
    @allowed.include?(candidate) || @allowed.include?(candidate.sub(/:\d+\z/, ''))
  end

  # Answer a CORS preflight. +origin+ is nil or already validated by #call.
  def preflight(origin)
    [204, origin ? cors_headers(origin) : {}, []]
  end

  def cors_headers(origin)
    {
      'access-control-allow-origin' => origin,
      'access-control-allow-methods' => ALLOWED_METHODS,
      'access-control-allow-headers' => ALLOWED_HEADERS,
      'access-control-expose-headers' => 'Mcp-Session-Id',
      'vary' => 'Origin'
    }
  end

  # Combine CORS headers with the app's response headers. App-supplied values
  # still win on key conflicts, except that an app-supplied `vary` is extended
  # rather than allowed to drop `Origin`: the response echoes a per-origin
  # access-control-allow-origin, so caches must key on Origin.
  def merge_cors_headers(origin, headers)
    merged = cors_headers(origin).merge(headers)
    app_vary = headers['vary']
    merged['vary'] = vary_with_origin(app_vary) if app_vary
    merged
  end

  # @param value [String, Array<String>] the app's Vary header value
  # @return [String] the same field list, with Origin appended when absent
  def vary_with_origin(value)
    fields = Array(value).flat_map { |part| part.to_s.split(',') }.map(&:strip).reject(&:empty?)
    return 'Origin' if fields.empty?
    return fields.join(', ') if fields.any? { |field| field == '*' || field.casecmp?('origin') }

    (fields + ['Origin']).join(', ')
  end

  def forbidden
    [403, { 'content-type' => 'application/json' }, [FORBIDDEN_BODY]]
  end

  def forbidden_host
    [403, { 'content-type' => 'application/json' }, [FORBIDDEN_HOST_BODY]]
  end
end
|
|
131
|
+
end
|
|
132
|
+
end
|