woods 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +186 -0
  3. data/README.md +20 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +69 -50
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +6 -0
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +210 -0
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +100 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +771 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +163 -0
  102. data/lib/woods/unblocked/document_builder.rb +326 -0
  103. data/lib/woods/unblocked/exporter.rb +201 -0
  104. data/lib/woods/unblocked/rate_limiter.rb +94 -0
  105. data/lib/woods/util/host_guard.rb +61 -0
  106. data/lib/woods/version.rb +1 -1
  107. data/lib/woods.rb +130 -6
  108. metadata +73 -4
@@ -0,0 +1,166 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'net/http'
4
+ require 'uri'
5
+
6
+ module Woods
7
+ module MCP
8
+ # Probes an embedding provider's HTTP endpoint to confirm it is reachable
9
+ # before the MCP server commits to a fully-hydrated start.
10
+ #
11
+ # A probe is pure: input → result-or-raise. No logging, no stderr writes,
12
+ # no side effects. The caller decides what to do with a failure.
13
+ #
14
+ # Raises {Woods::MCP::ProviderUnreachable} on any network failure with
15
+ # structured +url:+ and +reason:+ fields so callers can pattern-match on
16
+ # the reason string. Raises +ArgumentError+ for unknown provider classes —
17
+ # that is a programming error, not a runtime condition.
18
+ #
19
+ # @example
20
+ # Woods::MCP::ProviderProbe.reachable!(provider) # → provider or raises
21
+ module ProviderProbe
22
+ # Connect timeout for Ollama probes (LAN/localhost — fail fast).
23
+ OLLAMA_OPEN_TIMEOUT = 0.5
24
+ # Read timeout for Ollama probes.
25
+ OLLAMA_READ_TIMEOUT = 0.5
26
+
27
+ # Connect timeout for OpenAI probes (WAN — allow for latency).
28
+ OPENAI_OPEN_TIMEOUT = 2.0
29
+ # Read timeout for OpenAI probes.
30
+ OPENAI_READ_TIMEOUT = 2.0
31
+
32
+ # Probe +provider+ and return it if reachable.
33
+ #
34
+ # Dispatches on the provider's concrete class:
35
+ # - {Woods::Embedding::Provider::Ollama} → +GET /api/tags+ on the
36
+ # configured host. Any non-5xx response is treated as reachable.
37
+ # - {Woods::Embedding::Provider::OpenAI} → +GET /v1/models+ on
38
+ # +api.openai.com:443+. A 401 response raises +ProviderUnreachable+
39
+ # with +reason: "unauthorized"+ because an invalid key means the
40
+ # provider cannot be used; network failures raise with the appropriate
41
+ # reason string.
42
+ # - Any other class → raises +ArgumentError+.
43
+ #
44
+ # @param provider [Woods::Embedding::Provider::Ollama,
45
+ # Woods::Embedding::Provider::OpenAI] a concrete embedding provider
46
+ # @return [Object] the same +provider+ if reachable
47
+ # @raise [Woods::MCP::ProviderUnreachable] if the endpoint is unreachable,
48
+ # times out, returns 5xx, or (for OpenAI) returns 401
49
+ # @raise [ArgumentError] if +provider+ is not a recognised provider class
50
+ def self.reachable!(provider)
51
+ case provider
52
+ when Woods::Embedding::Provider::Ollama
53
+ probe_ollama!(provider)
54
+ when Woods::Embedding::Provider::OpenAI
55
+ probe_openai!(provider)
56
+ else
57
+ raise ArgumentError,
58
+ "#{self}.reachable! does not know how to probe #{provider.class} — " \
59
+ 'add a provider-specific probe method or implement #probe_url'
60
+ end
61
+ provider
62
+ end
63
+
64
+ # Probe the Ollama instance backing +provider+.
65
+ #
66
+ # @param provider [Woods::Embedding::Provider::Ollama]
67
+ # @raise [Woods::MCP::ProviderUnreachable]
68
+ # @api private
69
+ def self.probe_ollama!(provider)
70
+ base_url = provider.instance_variable_get(:@host)
71
+ http_get!(base_url, '/api/tags',
72
+ open_timeout: OLLAMA_OPEN_TIMEOUT,
73
+ read_timeout: OLLAMA_READ_TIMEOUT,
74
+ use_ssl: URI.parse(base_url).scheme == 'https') do |response|
75
+ if response.is_a?(Net::HTTPServerError)
76
+ raise Woods::MCP::ProviderUnreachable.new(
77
+ url: base_url,
78
+ reason: 'http_500'
79
+ )
80
+ end
81
+ end
82
+ end
83
+ private_class_method :probe_ollama!
84
+
85
+ # Probe the OpenAI API endpoint.
86
+ #
87
+ # The probe sends an unauthenticated +GET /v1/models+ so it deliberately
88
+ # expects a 401 from a healthy OpenAI. Anything that is not a plain 401
89
+ # or 2xx/3xx means the provider cannot be used from this host:
90
+ #
91
+ # - +401 Unauthorized+ → +reason: "unauthorized"+. The expected response
92
+ # for an unauthed probe; starts :degraded so the first real query
93
+ # carries the API key and surfaces credential errors precisely.
94
+ # - +403 Forbidden+ → +reason: "forbidden"+. Seen when the edge
95
+ # intercepts the request before OpenAI's auth layer (geoblock,
96
+ # corporate proxy). Subsequent embed calls will 403 too, so treating
97
+ # this as reachable would give operators a false-green status.
98
+ # - +5xx+ → +reason: "http_500"+.
99
+ #
100
+ # @param provider [Woods::Embedding::Provider::OpenAI]
101
+ # @raise [Woods::MCP::ProviderUnreachable]
102
+ # @api private
103
+ def self.probe_openai!(_provider)
104
+ base_url = 'https://api.openai.com'
105
+ http_get!(base_url, '/v1/models',
106
+ open_timeout: OPENAI_OPEN_TIMEOUT,
107
+ read_timeout: OPENAI_READ_TIMEOUT,
108
+ use_ssl: true) do |response|
109
+ reason = openai_unreachable_reason(response)
110
+ next unless reason
111
+
112
+ raise Woods::MCP::ProviderUnreachable.new(url: base_url, reason: reason)
113
+ end
114
+ end
115
+ private_class_method :probe_openai!
116
+
117
+ # Map an HTTP response from +GET /v1/models+ to a ProviderUnreachable
118
+ # reason string, or nil when the response signals a healthy provider.
119
+ #
120
+ # Uses +is_a?+ (not +case/when+) so RSpec stubs via
121
+ # +allow(response).to receive(:is_a?).with(...)+ compose cleanly —
122
+ # +case/when+ goes through +Module#===+ which some mocks don't round-trip.
123
+ def self.openai_unreachable_reason(response)
124
+ return 'unauthorized' if response.is_a?(Net::HTTPUnauthorized)
125
+ return 'forbidden' if response.is_a?(Net::HTTPForbidden)
126
+ return 'http_500' if response.is_a?(Net::HTTPServerError)
127
+
128
+ nil
129
+ end
130
+ private_class_method :openai_unreachable_reason
131
+
132
+ # Execute +GET path+ against +base_url+ and yield the response to the
133
+ # caller's block for provider-specific checks.
134
+ #
135
+ # All network-level exceptions are translated into
136
+ # {Woods::MCP::ProviderUnreachable} with a machine-readable reason
137
+ # string before propagating.
138
+ #
139
+ # @param base_url [String] scheme + host + optional port
140
+ # @param path [String] request path
141
+ # @param open_timeout [Numeric]
142
+ # @param read_timeout [Numeric]
143
+ # @param use_ssl [Boolean]
144
+ # @yieldparam response [Net::HTTPResponse]
145
+ # @raise [Woods::MCP::ProviderUnreachable]
146
+ # @api private
147
+ def self.http_get!(base_url, path, open_timeout:, read_timeout:, use_ssl:)
148
+ uri = URI.parse(base_url)
149
+ http = Net::HTTP.new(uri.host, uri.port)
150
+ http.open_timeout = open_timeout
151
+ http.read_timeout = read_timeout
152
+ http.use_ssl = use_ssl
153
+
154
+ response = http.start { |h| h.get(path) }
155
+ yield response
156
+ rescue Errno::ECONNREFUSED, Errno::ECONNRESET
157
+ raise Woods::MCP::ProviderUnreachable.new(url: base_url, reason: 'connection_refused')
158
+ rescue Net::OpenTimeout, Errno::ETIMEDOUT, Net::ReadTimeout
159
+ raise Woods::MCP::ProviderUnreachable.new(url: base_url, reason: 'timeout')
160
+ rescue SocketError
161
+ raise Woods::MCP::ProviderUnreachable.new(url: base_url, reason: 'dns_failure')
162
+ end
163
+ private_class_method :http_get!
164
+ end
165
+ end
166
+ end
@@ -59,6 +59,12 @@ module Woods
59
59
  wrap_xml('recent_changes', super)
60
60
  end
61
61
 
62
+ def render_trace_flow(data, **)
63
+ content = super
64
+ entry_point = data[:entry_point] || data['entry_point']
65
+ wrap_xml('trace_flow', content, entry_point: entry_point)
66
+ end
67
+
62
68
  def render_default(data)
63
69
  wrap_xml('result', super)
64
70
  end
@@ -102,7 +102,11 @@ module Woods
102
102
  %w[rails_version ruby_version git_branch git_sha extracted_at].each do |key|
103
103
  lines << "- **#{key.tr('_', ' ').capitalize}:** #{manifest[key]}" if manifest[key]
104
104
  end
105
- lines << "- **Total units:** #{manifest['total_units']}" if manifest['total_units']
105
+ lines << "- **Total units indexed:** #{manifest['total_units']}" if manifest['total_units']
106
+ template_engines = fetch_key(data, :template_engines)
107
+ if template_engines.is_a?(Array) && template_engines.any?
108
+ lines << "- **Supported template engines:** #{template_engines.join(', ')}"
109
+ end
106
110
  lines << ''
107
111
 
108
112
  counts = manifest['counts']
@@ -120,11 +124,32 @@ module Woods
120
124
  lines << '### Summary'
121
125
  lines << ''
122
126
  lines << summary
127
+ lines << ''
123
128
  end
124
129
 
130
+ lines << structure_denominators_glossary
125
131
  lines.join("\n").rstrip
126
132
  end
127
133
 
134
+ # Canonical glossary of the three index denominators that differ
135
+ # across Woods' tools. Surfaced once in the structure tool so
136
+ # readers don't have to cross-reference other tools' outputs to
137
+ # understand why the numbers disagree. Resolves #105.
138
+ def structure_denominators_glossary
139
+ <<~GLOSSARY
140
+ ### Denominators
141
+
142
+ - **units_indexed** (manifest.json, `structure` tool) — total
143
+ ExtractedUnits written by the extractor. Canonical count.
144
+ - **graph_nodes** (`pagerank`, `dependencies`, `dependents`) —
145
+ units present in the dependency graph. Excludes orphans
146
+ that have no incoming or outgoing edges.
147
+ - **searchable_entries** (`codebase_retrieve`) — retriever-store
148
+ entries, including per-chunk rows for units long enough to
149
+ be chunked. Always ≥ units_indexed.
150
+ GLOSSARY
151
+ end
152
+
128
153
  # ── graph_analysis ──────────────────────────────────────────
129
154
 
130
155
  # @param data [Hash] Graph analysis with section arrays and stats
@@ -147,7 +172,9 @@ module Woods
147
172
  lines << "### #{section.tr('_', ' ').capitalize}"
148
173
  lines << ''
149
174
  items.each do |item|
150
- lines << if item.is_a?(Hash)
175
+ lines << if item.is_a?(Hash) && item.key?('score')
176
+ "- **#{item['identifier']}** (#{item['type']}) — score: #{item['score']}"
177
+ elsif item.is_a?(Hash)
151
178
  "- **#{item['identifier']}** (#{item['type']}) — #{item['dependent_count']} dependents"
152
179
  else
153
180
  "- #{item}"
@@ -165,6 +192,67 @@ module Woods
165
192
  lines.join("\n").rstrip
166
193
  end
167
194
 
195
+ # ── domain_clusters ────────────────────────────────────────
196
+
197
+ # @param data [Hash] Domain cluster data with :clusters and :total
198
+ # @return [String] Markdown domain cluster overview
199
+ def render_domain_clusters(data, **)
200
+ clusters = fetch_key(data, :clusters) || []
201
+ total = fetch_key(data, :total) || clusters.size
202
+ lines = []
203
+ lines << '## Domain Clusters'
204
+ lines << ''
205
+ lines << "#{total} domains detected."
206
+ lines << ''
207
+
208
+ clusters.each do |cluster|
209
+ name = cluster[:name] || cluster['name']
210
+ member_count = cluster[:member_count] || cluster['member_count'] || 0
211
+ hub = cluster[:hub] || cluster['hub']
212
+ lines << "### #{name} (#{member_count} units)"
213
+ lines << ''
214
+ lines << "**Hub:** #{hub}" if hub
215
+ lines << ''
216
+
217
+ # Type breakdown
218
+ types = cluster[:types] || cluster['types']
219
+ if types.is_a?(Hash) && types.any?
220
+ type_parts = types.sort_by { |_, count| -count }.map { |type, count| "#{count} #{type}s" }
221
+ lines << "**Types:** #{type_parts.join(', ')}"
222
+ end
223
+
224
+ # Entry points
225
+ entry_points = cluster[:entry_points] || cluster['entry_points'] || []
226
+ lines << "**Entry points:** #{entry_points.first(10).join(', ')}" if entry_points.any?
227
+
228
+ # Members (show first 15)
229
+ members = cluster[:members] || cluster['members'] || []
230
+ if members.any?
231
+ lines << ''
232
+ lines << '**Members:**'
233
+ members.first(15).each { |m| lines << "- #{m}" }
234
+ lines << "- _... and #{members.size - 15} more_" if members.size > 15
235
+ end
236
+
237
+ # Boundary edges (show first 10)
238
+ boundaries = cluster[:boundary_edges] || cluster['boundary_edges'] || []
239
+ if boundaries.any?
240
+ lines << ''
241
+ lines << '**Boundary connections:**'
242
+ boundaries.first(10).each do |edge|
243
+ from = edge[:from] || edge['from']
244
+ to = edge[:to] || edge['to']
245
+ via = edge[:via] || edge['via']
246
+ lines << "- #{from} → #{to} (#{via})"
247
+ end
248
+ end
249
+
250
+ lines << ''
251
+ end
252
+
253
+ lines.join("\n").rstrip
254
+ end
255
+
168
256
  # ── pagerank ────────────────────────────────────────────────
169
257
 
170
258
  # @param data [Hash] PageRank data with :total_nodes and :results
@@ -173,7 +261,7 @@ module Woods
173
261
  lines = []
174
262
  lines << '## PageRank Scores'
175
263
  lines << ''
176
- lines << "#{fetch_key(data, :total_nodes)} nodes in graph."
264
+ lines << "Ranking #{fetch_key(data, :total_nodes)} nodes in the dependency graph."
177
265
  lines << ''
178
266
  lines << '| Rank | Identifier | Type | Score |'
179
267
  lines << '|------|-----------|------|-------|'
@@ -240,6 +328,15 @@ module Woods
240
328
  lines.join("\n").rstrip
241
329
  end
242
330
 
331
+ # ── trace_flow ──────────────────────────────────────────────
332
+
333
+ # @param data [Hash] Serialized FlowDocument
334
+ # @return [String] Markdown flow document with a step-by-step operations table
335
+ def render_trace_flow(data, **)
336
+ require_relative '../../flow_document'
337
+ Woods::FlowDocument.from_h(data).to_markdown
338
+ end
339
+
243
340
  # ── Default fallback ────────────────────────────────────────
244
341
 
245
342
  # @param data [Object] Any data
@@ -89,9 +89,14 @@ module Woods
89
89
  lines << 'Codebase Structure'
90
90
  lines << DIVIDER
91
91
 
92
- %w[rails_version ruby_version git_branch git_sha extracted_at total_units].each do |key|
92
+ %w[rails_version ruby_version git_branch git_sha extracted_at].each do |key|
93
93
  lines << " #{key}: #{manifest[key]}" if manifest[key]
94
94
  end
95
+ lines << " units_indexed: #{manifest['total_units']}" if manifest['total_units']
96
+ template_engines = fetch_key(data, :template_engines)
97
+ if template_engines.is_a?(Array) && template_engines.any?
98
+ lines << " template_engines: #{template_engines.join(', ')}"
99
+ end
95
100
 
96
101
  counts = manifest['counts']
97
102
  if counts.is_a?(Hash) && counts.any?
@@ -107,6 +112,15 @@ module Woods
107
112
  lines << summary
108
113
  end
109
114
 
115
+ lines << ''
116
+ lines << DIVIDER
117
+ lines << 'Denominators:'
118
+ lines << ' units_indexed (manifest, structure): total ExtractedUnits written.'
119
+ lines << ' graph_nodes (pagerank, dependencies, dependents): units in the graph'
120
+ lines << ' (excludes orphans with no incoming/outgoing edges).'
121
+ lines << ' searchable_entries (codebase_retrieve): retriever-store entries including'
122
+ lines << ' per-chunk rows. Always >= units_indexed.'
123
+
110
124
  lines.join("\n").rstrip
111
125
  end
112
126
 
@@ -144,7 +158,7 @@ module Woods
144
158
 
145
159
  def render_pagerank(data, **)
146
160
  lines = []
147
- lines << "PageRank Scores (#{fetch_key(data, :total_nodes)} nodes)"
161
+ lines << "PageRank Scores (ranking #{fetch_key(data, :total_nodes)} graph nodes)"
148
162
  lines << DIVIDER
149
163
 
150
164
  results = fetch_key(data, :results, [])