woods 1.1.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +186 -0
- data/README.md +20 -8
- data/exe/woods-console +51 -6
- data/exe/woods-console-mcp +24 -4
- data/exe/woods-mcp +30 -7
- data/exe/woods-mcp-http +47 -6
- data/lib/generators/woods/install_generator.rb +13 -4
- data/lib/generators/woods/templates/woods.rb.tt +155 -0
- data/lib/tasks/woods.rake +69 -50
- data/lib/woods/builder.rb +174 -9
- data/lib/woods/cache/cache_middleware.rb +360 -31
- data/lib/woods/chunking/semantic_chunker.rb +334 -7
- data/lib/woods/console/adapters/job_adapter.rb +10 -4
- data/lib/woods/console/audit_logger.rb +76 -4
- data/lib/woods/console/bridge.rb +48 -15
- data/lib/woods/console/bridge_protocol.rb +44 -0
- data/lib/woods/console/confirmation.rb +3 -4
- data/lib/woods/console/console_response_renderer.rb +56 -18
- data/lib/woods/console/credential_index.rb +201 -0
- data/lib/woods/console/credential_scanner.rb +302 -0
- data/lib/woods/console/dispatch_pipeline.rb +138 -0
- data/lib/woods/console/embedded_executor.rb +682 -35
- data/lib/woods/console/eval_guard.rb +319 -0
- data/lib/woods/console/model_validator.rb +1 -3
- data/lib/woods/console/rack_middleware.rb +185 -29
- data/lib/woods/console/redactor.rb +161 -0
- data/lib/woods/console/response_context.rb +127 -0
- data/lib/woods/console/safe_context.rb +220 -23
- data/lib/woods/console/scope_predicate_parser.rb +131 -0
- data/lib/woods/console/server.rb +417 -486
- data/lib/woods/console/sql_noise_stripper.rb +87 -0
- data/lib/woods/console/sql_table_scanner.rb +213 -0
- data/lib/woods/console/sql_validator.rb +81 -31
- data/lib/woods/console/table_gate.rb +93 -0
- data/lib/woods/console/tool_specs.rb +552 -0
- data/lib/woods/console/tools/tier1.rb +3 -3
- data/lib/woods/console/tools/tier4.rb +7 -1
- data/lib/woods/dependency_graph.rb +66 -7
- data/lib/woods/embedding/indexer.rb +190 -6
- data/lib/woods/embedding/openai.rb +40 -4
- data/lib/woods/embedding/provider.rb +104 -8
- data/lib/woods/embedding/text_preparer.rb +23 -3
- data/lib/woods/embedding/token_counter.rb +133 -0
- data/lib/woods/evaluation/baseline_runner.rb +20 -2
- data/lib/woods/evaluation/metrics.rb +4 -1
- data/lib/woods/extracted_unit.rb +1 -0
- data/lib/woods/extractor.rb +7 -1
- data/lib/woods/extractors/controller_extractor.rb +6 -0
- data/lib/woods/extractors/mailer_extractor.rb +16 -2
- data/lib/woods/extractors/model_extractor.rb +6 -1
- data/lib/woods/extractors/phlex_extractor.rb +13 -4
- data/lib/woods/extractors/rails_source_extractor.rb +2 -0
- data/lib/woods/extractors/route_helper_resolver.rb +130 -0
- data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
- data/lib/woods/extractors/view_component_extractor.rb +12 -1
- data/lib/woods/extractors/view_engines/base.rb +141 -0
- data/lib/woods/extractors/view_engines/erb.rb +145 -0
- data/lib/woods/extractors/view_template_extractor.rb +92 -133
- data/lib/woods/flow_assembler.rb +23 -15
- data/lib/woods/flow_precomputer.rb +21 -2
- data/lib/woods/graph_analyzer.rb +210 -0
- data/lib/woods/index_artifact.rb +173 -0
- data/lib/woods/mcp/bearer_auth.rb +45 -0
- data/lib/woods/mcp/bootstrap_state.rb +94 -0
- data/lib/woods/mcp/bootstrapper.rb +337 -16
- data/lib/woods/mcp/config_resolver.rb +288 -0
- data/lib/woods/mcp/errors.rb +134 -0
- data/lib/woods/mcp/index_reader.rb +265 -30
- data/lib/woods/mcp/origin_guard.rb +132 -0
- data/lib/woods/mcp/provider_probe.rb +166 -0
- data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
- data/lib/woods/mcp/renderers/markdown_renderer.rb +100 -3
- data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
- data/lib/woods/mcp/server.rb +771 -137
- data/lib/woods/model_name_cache.rb +78 -2
- data/lib/woods/notion/client.rb +25 -2
- data/lib/woods/notion/mappers/model_mapper.rb +36 -2
- data/lib/woods/railtie.rb +55 -15
- data/lib/woods/resilience/circuit_breaker.rb +9 -2
- data/lib/woods/resilience/retryable_provider.rb +40 -3
- data/lib/woods/resolved_config.rb +299 -0
- data/lib/woods/retrieval/context_assembler.rb +112 -5
- data/lib/woods/retrieval/query_classifier.rb +1 -1
- data/lib/woods/retrieval/ranker.rb +55 -6
- data/lib/woods/retrieval/search_executor.rb +42 -13
- data/lib/woods/retriever.rb +330 -24
- data/lib/woods/session_tracer/middleware.rb +35 -1
- data/lib/woods/storage/graph_store.rb +39 -0
- data/lib/woods/storage/inapplicable_backend.rb +14 -0
- data/lib/woods/storage/metadata_store.rb +129 -1
- data/lib/woods/storage/pgvector.rb +70 -8
- data/lib/woods/storage/qdrant.rb +196 -5
- data/lib/woods/storage/snapshotter/metadata.rb +172 -0
- data/lib/woods/storage/snapshotter/vector.rb +238 -0
- data/lib/woods/storage/snapshotter.rb +24 -0
- data/lib/woods/storage/vector_store.rb +184 -35
- data/lib/woods/tasks.rb +85 -0
- data/lib/woods/temporal/snapshot_store.rb +49 -1
- data/lib/woods/token_utils.rb +44 -5
- data/lib/woods/unblocked/client.rb +163 -0
- data/lib/woods/unblocked/document_builder.rb +326 -0
- data/lib/woods/unblocked/exporter.rb +201 -0
- data/lib/woods/unblocked/rate_limiter.rb +94 -0
- data/lib/woods/util/host_guard.rb +61 -0
- data/lib/woods/version.rb +1 -1
- data/lib/woods.rb +130 -6
- metadata +73 -4
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Woods
  module Unblocked
    # Converts extracted unit JSON into condensed Markdown documents
    # optimized for Unblocked's code review and Q&A context.
    #
    # Each unit type has a specialized formatting strategy that emphasizes
    # what matters for code review: associations, blast radius, entry points,
    # side effects, and structural complexity.
    #
    # @example
    #   builder = DocumentBuilder.new(repo_url: "https://github.com/acme/myapp")
    #   doc = builder.build(unit_data)
    #   # => { title: "Order (model)", body: "# Order (model)\n...", uri: "https://..." }
    #
    class DocumentBuilder
      # @param repo_url [String] GitHub repo base URL for citation URIs.
      #   A single trailing slash is stripped so URI joining stays clean.
      def initialize(repo_url:)
        @repo_url = repo_url.chomp('/')
      end

      # Build a document hash from a unit's extracted data.
      #
      # @param unit_data [Hash] Parsed unit JSON (from IndexReader) with at
      #   least 'type', 'identifier', and 'file_path' keys
      # @return [Hash] { title:, body:, uri: }
      def build(unit_data)
        type = unit_data['type']
        identifier = unit_data['identifier']
        file_path = unit_data['file_path']

        {
          title: "#{identifier} (#{type})",
          body: build_body(unit_data),
          uri: build_uri(file_path)
        }
      end

      private

      # Citation URI pointing at the unit's file on the default branch.
      # Falls back to the bare repo URL when the unit has no file path.
      #
      # @param file_path [String, nil]
      # @return [String]
      def build_uri(file_path)
        return @repo_url unless file_path

        "#{@repo_url}/blob/main/#{file_path}"
      end

      # Dispatch to a type-specific formatter, then scrub the result.
      #
      # @param unit_data [Hash]
      # @return [String] Markdown body (possibly '' if the scrub fails closed)
      def build_body(unit_data)
        type = unit_data['type']
        body = case type
               when 'model' then build_model_body(unit_data)
               when 'controller' then build_controller_body(unit_data)
               when 'service', 'job', 'mailer', 'manager', 'decorator', 'concern'
                 build_generic_body(unit_data)
               when 'graphql', 'graphql_type', 'graphql_mutation', 'graphql_resolver', 'graphql_query'
                 build_graphql_body(unit_data)
               else build_generic_body(unit_data)
               end
        # Defensive credential scrub — current builders only emit structured
        # metadata, but if a future formatter adds source_code or comments
        # (mirroring Notion's `ModelMapper#extract_description`) the scrub
        # keeps credential material from reaching Unblocked.
        redact_credentials(body)
      end

      # Run the assembled body through CredentialScanner. Fails closed (empty
      # body) if the scanner raises, so a shipping failure never leaks
      # unredacted content.
      #
      # NOTE(review): the inline `require` raises LoadError (a ScriptError,
      # not StandardError) if the scanner file is missing, which would
      # propagate instead of failing closed — confirm that is the intended
      # behavior for a broken install.
      #
      # @param body [String]
      # @return [String]
      def redact_credentials(body)
        return body if body.nil? || body.empty?

        require 'woods/console/credential_scanner'
        redacted, _counts = credential_scanner.scan(body)
        redacted
      rescue StandardError
        ''
      end

      # Memoized scanner instance — one per builder, reused across documents.
      def credential_scanner
        @credential_scanner ||= Woods::Console::CredentialScanner.new
      end

      # ── Model formatting ─────────────────────────────────────────────

      # Assemble the model document from independent sections; nil sections
      # (no data) are dropped so empty headings never appear.
      def build_model_body(unit)
        meta = unit['metadata'] || {}
        sections = []

        sections << model_header(unit, meta)
        sections << model_associations(meta)
        sections << model_dependents(unit)
        sections << model_entry_points(unit)
        sections << model_schema_highlights(meta)
        sections << model_side_effects(unit)

        sections.compact.join("\n\n")
      end

      # Title line plus a one-line file/LOC/table summary.
      def model_header(unit, meta)
        parts = ["# #{unit['identifier']} (model)"]
        file_info = ["**File:** `#{unit['file_path']}`"]
        file_info << "**LOC:** #{meta['loc']}" if meta['loc']
        file_info << "**Table:** #{meta['table_name']}" if meta['table_name']
        column_count = meta['column_count'] || (meta['columns'] || []).size
        file_info << "(#{column_count} columns)" if column_count&.positive?
        parts << file_info.join(' | ')
        parts.join("\n")
      end

      # Associations grouped by kind, with `dependent:` options surfaced
      # because they signal cascade deletes during review.
      def model_associations(meta)
        assocs = meta['associations'] || []
        return nil if assocs.empty?

        grouped = assocs.group_by { |a| a['type'] }
        lines = ["## Associations (#{assocs.size})"]

        %w[belongs_to has_many has_one has_and_belongs_to_many].each do |type|
          items = grouped[type]
          next unless items&.any?

          targets = items.map do |a|
            name = a['target'] || a['name']
            dep = a.dig('options', 'dependent')
            dep ? "#{name} (#{dep})" : name
          end
          lines << "**#{type}:** #{targets.join(', ')}"
        end

        lines.join("\n")
      end

      # Dependent-unit summary with a qualitative blast-radius callout.
      def model_dependents(unit)
        deps = unit['dependents'] || []
        return nil if deps.empty?

        grouped = deps.group_by { |d| d['type'] }
        summary_parts = grouped.map { |type, items| "#{items.size} #{type}s" }

        lines = ["## Dependents (#{deps.size} units)"]
        lines << summary_parts.join(', ')

        # Blast radius assessment
        if deps.size > 50
          lines << '**High blast radius** — changes here affect many parts of the codebase'
        elsif deps.size > 20
          lines << '**Moderate blast radius** — changes may ripple to dependent code'
        end

        lines.join("\n")
      end

      # Inbound entry points: controllers, GraphQL units, and background jobs
      # that depend on this model.
      def model_entry_points(unit)
        deps = unit['dependents'] || []
        controllers = deps.select { |d| d['type'] == 'controller' }
        graphql = deps.select { |d| d['type']&.start_with?('graphql') }
        jobs = deps.select { |d| d['type'] == 'job' }

        # Jobs are rendered below, so they must count toward the emptiness
        # guard — previously a model reached only via background jobs
        # produced no Entry Points section at all.
        return nil if controllers.empty? && graphql.empty? && jobs.empty?

        lines = ['## Entry Points']
        lines << "**Controllers:** #{controllers.map { |c| c['identifier'] }.join(', ')}" if controllers.any?
        lines << "**GraphQL:** #{graphql.map { |g| g['identifier'] }.join(', ')}" if graphql.any?
        lines << "**Jobs:** #{jobs.map { |j| j['identifier'] }.join(', ')}" if jobs.any?

        lines.join("\n")
      end

      # Enums, scopes, inlined concerns and callbacks — the schema-adjacent
      # facts a reviewer checks before touching a model.
      def model_schema_highlights(meta)
        parts = []

        enums = meta['enums']
        if enums.is_a?(Hash) && enums.any?
          enum_strs = enums.map { |name, values| "#{name} (#{format_enum_values(values)})" }
          parts << "**Enums:** #{enum_strs.join('; ')}"
        end

        scopes = meta['scopes']
        parts << "**Scopes:** #{scopes.map { |s| s['name'] }.join(', ')}" if scopes.is_a?(Array) && scopes.any?

        concerns = meta['inlined_concerns']
        parts << "**Concerns:** #{concerns.join(', ')}" if concerns.is_a?(Array) && concerns.any?

        callbacks = meta['callbacks']
        if callbacks.is_a?(Array) && callbacks.any?
          parts << "**Callbacks (#{callbacks.size}):** #{format_callbacks(callbacks)}"
        end

        return nil if parts.empty?

        (['## Schema Highlights'] + parts).join("\n")
      end

      # Asynchronous side effects (jobs, mailers) triggered around this model.
      def model_side_effects(unit)
        deps = unit['dependents'] || []
        jobs = deps.select { |d| d['type'] == 'job' }
        mailers = deps.select { |d| d['type'] == 'mailer' }

        return nil if jobs.empty? && mailers.empty?

        lines = ['## Side Effects']
        lines << "**Jobs:** #{jobs.map { |j| j['identifier'] }.join(', ')}" if jobs.any?
        lines << "**Mailers:** #{mailers.map { |m| m['identifier'] }.join(', ')}" if mailers.any?

        lines.join("\n")
      end

      # ── Controller formatting ────────────────────────────────────────

      def build_controller_body(unit)
        meta = unit['metadata'] || {}
        sections = []

        sections << "# #{unit['identifier']} (controller)"
        sections << "**File:** `#{unit['file_path']}`"

        # ancestors[0] is the controller itself; show the next few ancestors.
        ancestors = meta['ancestors']
        sections << "**Inherits:** #{ancestors[1..3]&.join(' → ')}" if ancestors.is_a?(Array) && ancestors.size > 1

        sections << controller_routes(meta)
        sections << controller_dependencies(unit)
        sections << controller_dependents(unit)

        sections.compact.join("\n\n")
      end

      # Flatten the action → routes mapping into bullet lines, capped at 20
      # routes to keep documents condensed.
      def controller_routes(meta)
        routes = meta['routes']
        return nil unless routes.is_a?(Hash) && routes.any?

        lines = ['## Routes']
        routes.each do |action, route_list|
          next unless route_list.is_a?(Array)

          route_list.each do |route|
            next unless route.is_a?(Hash)

            lines << "- `#{route['verb']} #{route['path']}` (#{action})"
          end
        end

        lines.size > 1 ? lines.first(20).join("\n") : nil
      end

      # Only model dependencies are surfaced — they are what reviewers trace.
      def controller_dependencies(unit)
        deps = unit['dependencies'] || []
        return nil if deps.empty?

        models = deps.select { |d| d['type'] == 'model' }.map { |d| d['target'] }
        return nil if models.empty?

        "## Dependencies\n**Models:** #{models.join(', ')}"
      end

      # View templates rendered by this controller, capped at 10.
      def controller_dependents(unit)
        deps = unit['dependents'] || []
        views = deps.select { |d| d['type'] == 'view_template' }
        return nil if views.empty?

        "## Views\n#{views.map { |v| "- `#{v['identifier']}`" }.first(10).join("\n")}"
      end

      # ── GraphQL formatting ───────────────────────────────────────────

      def build_graphql_body(unit)
        sections = []

        sections << "# #{unit['identifier']} (#{unit['type']})"
        sections << "**File:** `#{unit['file_path']}`"

        deps = unit['dependencies'] || []
        models = deps.select { |d| d['type'] == 'model' }.map { |d| d['target'] }
        sections << "**Models:** #{models.join(', ')}" if models.any?

        dependents = unit['dependents'] || []
        sections << "**Referenced by:** #{dependents.size} units" if dependents.any?

        sections.compact.join("\n\n")
      end

      # ── Generic formatting (services, jobs, mailers, etc.) ──────────

      def build_generic_body(unit)
        meta = unit['metadata'] || {}
        sections = []

        sections << "# #{unit['identifier']} (#{unit['type']})"
        sections << "**File:** `#{unit['file_path']}`"
        sections << "**LOC:** #{meta['loc']}" if meta['loc']

        deps = unit['dependencies'] || []
        if deps.any?
          by_type = deps.group_by { |d| d['type'] }
          dep_parts = by_type.map { |type, items| "#{type}: #{items.map { |d| d['target'] }.join(', ')}" }
          sections << "## Dependencies\n#{dep_parts.join("\n")}"
        end

        dependents = unit['dependents'] || []
        if dependents.any?
          grouped = dependents.group_by { |d| d['type'] }
          summary = grouped.map { |type, items| "#{items.size} #{type}s" }
          sections << "## Dependents (#{dependents.size})\n#{summary.join(', ')}"
        end

        sections.compact.join("\n\n")
      end

      # ── Helpers ──────────────────────────────────────────────────────

      # First five enum values, tolerant of hash, array, or scalar shapes.
      def format_enum_values(values)
        case values
        when Hash then values.keys.first(5).join(', ')
        when Array then values.first(5).join(', ')
        else values.to_s
        end
      end

      # First five callbacks as "type: filter" pairs.
      def format_callbacks(callbacks)
        callbacks.first(5).map do |cb|
          "#{cb['type']}: #{cb['filter']}"
        end.join(', ')
      end
    end
  end
end
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'woods'
|
|
4
|
+
require_relative 'client'
|
|
5
|
+
require_relative 'rate_limiter'
|
|
6
|
+
require_relative 'document_builder'
|
|
7
|
+
|
|
8
|
+
module Woods
  module Unblocked
    # Orchestrates syncing Woods extraction data to an Unblocked collection.
    #
    # Reads extraction output from disk via IndexReader, converts units to
    # condensed Markdown documents, and pushes via the Unblocked Documents API.
    # All syncs are idempotent — documents are upserted by URI.
    #
    # @example
    #   exporter = Exporter.new(index_dir: "tmp/woods")
    #   stats = exporter.sync_all
    #   # => { synced: 940, skipped: 5060, errors: [] }
    #
    class Exporter
      # Error messages beyond this count are collapsed into a summary line.
      MAX_ERRORS = 100

      # Unit types to sync, in priority order.
      # All units are synced for these types.
      FULL_SYNC_TYPES = %w[
        model controller service job mailer manager decorator concern serializer
        graphql graphql_type graphql_mutation graphql_resolver graphql_query
      ].freeze

      # Unit types where only the most-connected units are synced.
      # Each entry: [type, max_count]
      PARTIAL_SYNC_TYPES = [
        ['poro', 100],
        ['lib', 50]
      ].freeze

      # @param index_dir [String] Path to extraction output directory
      # @param config [Configuration] Woods configuration (default: global config)
      # @param client [Client, nil] Unblocked API client (auto-created from config if nil)
      # @param reader [Object, nil] IndexReader instance (auto-created if nil)
      # @param output [IO, nil] Progress output stream (default: $stdout; nil silences)
      # @raise [ConfigurationError] if required config is missing
      def initialize(index_dir:, config: Woods.configuration, client: nil, reader: nil, output: $stdout)
        @collection_id = config.unblocked_collection_id
        raise ConfigurationError, 'unblocked_collection_id is required' unless @collection_id

        repo_url = config.unblocked_repo_url
        raise ConfigurationError, 'unblocked_repo_url is required' unless repo_url

        api_token = config.unblocked_api_token
        raise ConfigurationError, 'unblocked_api_token is required' unless api_token

        # Build the default client (and its rate limiter) only when no client
        # is injected — avoids reading UNBLOCKED_DAILY_BUDGET and constructing
        # a limiter that an injected client would never use.
        @client = client || build_default_client(api_token)
        @reader = reader || build_reader(index_dir)
        @builder = DocumentBuilder.new(repo_url: repo_url)
        @output = output
      end

      # Sync all configured unit types to the Unblocked collection.
      #
      # @return [Hash] { synced: Integer, skipped: Integer, errors: Array<String> }
      def sync_all
        synced = 0
        skipped = 0
        errors = []

        FULL_SYNC_TYPES.each do |type|
          result = sync_type(type)
          synced += result[:synced]
          skipped += result[:skipped]
          errors.concat(result[:errors])
        end

        PARTIAL_SYNC_TYPES.each do |type, max_count|
          result = sync_type_partial(type, max_count)
          synced += result[:synced]
          skipped += result[:skipped]
          errors.concat(result[:errors])
        end

        { synced: synced, skipped: skipped, errors: cap_errors(errors) }
      end

      # Sync all units of a given type.
      #
      # @param type [String] Unit type (e.g. "model", "controller")
      # @return [Hash] { synced: Integer, skipped: Integer, errors: Array<String> }
      def sync_type(type)
        units = @reader.list_units(type: type)
        log "  #{type}: #{units.size} units"

        sync_units(units)
      end

      # Sync the top N most-connected units of a type (by dependent count).
      #
      # @param type [String] Unit type
      # @param max_count [Integer] Maximum units to sync
      # @return [Hash] { synced: Integer, skipped: Integer, errors: Array<String> }
      def sync_type_partial(type, max_count)
        units = @reader.list_units(type: type)
        return empty_stats if units.empty?

        # Load full data to sort by dependent count
        units_with_data = units.filter_map do |entry|
          data = @reader.find_unit(entry['identifier'])
          next unless data

          dep_count = (data['dependents'] || []).size
          { entry: entry, data: data, dep_count: dep_count }
        end

        top_units = units_with_data.sort_by { |u| -u[:dep_count] }.first(max_count)
        # Count every listed unit we are not syncing — including units whose
        # data failed to load — so synced + skipped always equals units.size.
        # (Previously only the over-max_count excess was counted, silently
        # under-reporting when unit data was missing.)
        skipped_count = units.size - top_units.size

        log "  #{type}: #{top_units.size}/#{units.size} units (top by dependents)"

        result = sync_unit_data(top_units.map { |u| [u[:entry], u[:data]] })
        result[:skipped] += skipped_count
        result
      end

      private

      # Load each unit's data and push it; entries with missing data count as
      # skipped. Per-entry failures are recorded without aborting the run,
      # except budget exhaustion, which stops further API calls.
      def sync_units(units)
        stats = empty_stats

        units.each do |entry|
          unit_data = @reader.find_unit(entry['identifier'])
          unless unit_data
            stats[:skipped] += 1
            next
          end

          break unless push_and_record(entry, unit_data, stats)
        rescue StandardError => e
          # Guards the find_unit call itself (push errors are handled in
          # push_and_record).
          stats[:errors] << "#{entry['identifier']}: #{e.message}"
        end

        stats
      end

      # Push pre-loaded (entry, data) pairs, sharing the same error-handling
      # policy as sync_units via push_and_record.
      def sync_unit_data(entries_with_data)
        stats = empty_stats

        entries_with_data.each do |entry, unit_data|
          break unless push_and_record(entry, unit_data, stats)
        end

        stats
      end

      # Push one document and record the outcome into +stats+.
      #
      # @return [Boolean] false when the daily API budget is exhausted, which
      #   tells callers to stop syncing; true otherwise (including errors).
      def push_and_record(entry, unit_data, stats)
        push_document(unit_data)
        stats[:synced] += 1
        true
      rescue Woods::Error => e
        stats[:errors] << "#{entry['identifier']}: #{e.message}"
        # NOTE(review): budget exhaustion is detected by message substring —
        # a dedicated error class would be sturdier; verify against RateLimiter.
        !e.message.include?('daily budget exhausted')
      rescue StandardError => e
        stats[:errors] << "#{entry['identifier']}: #{e.message}"
        true
      end

      # Upsert a single unit's document into the collection (keyed by URI).
      def push_document(unit_data)
        doc = @builder.build(unit_data)
        @client.put_document(
          collection_id: @collection_id,
          title: doc[:title],
          body: doc[:body],
          uri: doc[:uri]
        )
      end

      # Default API client wired to a budget-tracked rate limiter.
      def build_default_client(api_token)
        budget = ENV.fetch('UNBLOCKED_DAILY_BUDGET', RateLimiter::DEFAULT_BUDGET.to_s).to_i
        limiter = RateLimiter.new(daily_budget: budget)
        Client.new(api_token: api_token, rate_limiter: limiter)
      end

      # Lazily loaded so requiring this file doesn't pull in the MCP stack.
      def build_reader(index_dir)
        require_relative '../mcp/index_reader'
        Woods::MCP::IndexReader.new(index_dir)
      end

      # Fresh zeroed stats hash (mutable — one per call).
      def empty_stats
        { synced: 0, skipped: 0, errors: [] }
      end

      # Truncate the error list to MAX_ERRORS with a summary trailer.
      def cap_errors(errors)
        return errors if errors.size <= MAX_ERRORS

        errors.first(MAX_ERRORS) + ["... and #{errors.size - MAX_ERRORS} more errors"]
      end

      # Progress output; silently dropped when @output is nil.
      def log(message)
        @output&.puts(message)
      end
    end
  end
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Woods
  module Unblocked
    # Daily budget-based rate limiter for the Unblocked API (1000 calls/day).
    #
    # Unlike Notion's per-second throttling, Unblocked limits by daily call
    # count. Usage is tallied against a configurable budget: a single warning
    # is emitted once usage crosses the threshold, and further calls raise
    # once the budget is spent. Counter updates are mutex-guarded.
    #
    # @example
    #   limiter = RateLimiter.new(daily_budget: 1000)
    #   limiter.track { client.put_document(...) } # => result
    #   limiter.remaining # => 999
    #
    class RateLimiter
      DEFAULT_BUDGET = 1000
      WARN_THRESHOLD = 0.8 # Emit the one-time warning at 80% usage

      # @param daily_budget [Integer] Maximum API calls per day
      # @param warn_io [IO, nil] Where to write warnings (default: $stderr)
      # @raise [ArgumentError] if daily_budget is not a positive Integer
      def initialize(daily_budget: DEFAULT_BUDGET, warn_io: $stderr)
        valid = daily_budget.is_a?(Integer) && daily_budget.positive?
        raise ArgumentError, 'daily_budget must be positive' unless valid

        @daily_budget = daily_budget
        @calls_today = 0
        @warn_io = warn_io
        @warned = false
        @mutex = Mutex.new
      end

      # Execute a block, tracking the API call against the daily budget.
      # The slot is consumed before the block runs.
      #
      # @yield The API call to execute
      # @return [Object] The block's return value
      # @raise [Woods::Error] if daily budget is exhausted
      def track(&api_call)
        raise ArgumentError, 'block required' unless api_call

        @mutex.synchronize { consume_budget_slot }
        api_call.call
      end

      # Number of API calls remaining in the daily budget.
      #
      # @return [Integer]
      def remaining
        @daily_budget - @calls_today
      end

      # Number of API calls used today.
      #
      # @return [Integer]
      def used
        @calls_today
      end

      # Reset the daily counter (for testing or manual reset).
      #
      # @return [void]
      def reset!
        @mutex.synchronize do
          @calls_today = 0
          @warned = false
        end
      end

      private

      # Claim one call against the budget, or raise if none remain.
      # Caller must hold @mutex.
      def consume_budget_slot
        if @calls_today >= @daily_budget
          raise Woods::Error,
                "Unblocked API daily budget exhausted (#{@daily_budget} calls). " \
                'Budget resets at midnight PST. Use UNBLOCKED_DAILY_BUDGET to adjust.'
        end

        @calls_today += 1
        emit_usage_warning if warn_threshold_crossed?
      end

      # True exactly once: the first time usage reaches the warn threshold.
      def warn_threshold_crossed?
        !@warned && @calls_today >= (@daily_budget * WARN_THRESHOLD).to_i
      end

      # Write the one-time usage warning and latch the @warned flag.
      def emit_usage_warning
        @warned = true
        @warn_io&.puts(
          "WARNING: Unblocked API usage at #{@calls_today}/#{@daily_budget} " \
          "(#{remaining} calls remaining)"
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Woods
  module Util
    # Shared host-header / URL-host canonicalization used by {MCP::OriginGuard}
    # and the {Storage::VectorStore::Qdrant} URL validator.
    #
    # Both components need to reject numeric IPv4 notations that `URI` and
    # `getaddrinfo` accept but `IPAddr` does not — hex (`0x7f000001`),
    # bare integer (`2130706433`), octal (`017700000001` or
    # `0177.0.0.1`), short-form (`127.1`), mixed-radix (`0x7f.0.0.1`).
    # Keeping the logic in one place prevents drift between the two
    # defenses (which previously had slightly different regex lists).
    module HostGuard
      # Non-canonical numeric IPv4 forms that legitimate clients never
      # emit but `getaddrinfo` will happily resolve — rejecting the form
      # is safer than trying to intuit the intended IPv4.
      # (Inputs are lowercased by {.canonicalize}, so `0x` matching is safe.)
      NUMERIC_HOST_BYPASS = Regexp.union(
        /\A0x[0-9a-f]+\z/,  # hex: `0x7f000001`
        /\A\d+\z/,          # bare integer: `2130706433`
        /\A0[0-7]+\z/,      # bare octal: `017700000001`
        /\A\d+\.\d+\z/,     # short-form two-part: `127.1`
        /\A\d+\.\d+\.\d+\z/ # short-form three-part: `127.0.1`
      ).freeze

      # Octets inside a four-part dotted form that tag the form as
      # non-canonical: leading zero (octal interpretation), or `0x`
      # prefix (hex interpretation).
      SUSPICIOUS_OCTET = Regexp.union(
        /\A0\d+\z/,        # leading-zero octal: `0177`
        /\A0x[0-9a-f]+\z/  # hex octet: `0x7f`
      ).freeze

      module_function

      # Canonicalize a host string: downcase, strip port, strip the
      # FQDN trailing dot, drop IPv6 brackets. Returns a plain host.
      #
      # @param host [String, nil]
      # @return [String] canonical host, lowercase, without port/brackets.
      def canonicalize(host)
        lowered = host.to_s.downcase
        without_port = lowered.sub(/:\d+\z/, '')
        # Port removal must precede bracket removal so `[::1]:8080`
        # reduces to `::1` rather than losing its last group.
        without_port.sub(/\.\z/, '').delete('[]')
      end

      # Does this canonicalized host smuggle a private IP via a notation
      # that `IPAddr.new` won't parse? Callers should reject any match
      # rather than try to resolve it.
      #
      # @param canonical [String] Output of {.canonicalize}.
      # @return [Boolean]
      def suspicious_numeric_host?(canonical)
        return true if NUMERIC_HOST_BYPASS.match?(canonical)

        dotted_quad = canonical.match(/\A(\w+)\.(\w+)\.(\w+)\.(\w+)\z/)
        return false unless dotted_quad

        dotted_quad.captures.any? { |part| SUSPICIOUS_OCTET.match?(part) }
      end
    end
  end
end
|
data/lib/woods/version.rb
CHANGED