woods 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +229 -0
- data/README.md +24 -8
- data/exe/woods-console +51 -6
- data/exe/woods-console-mcp +24 -4
- data/exe/woods-mcp +30 -7
- data/exe/woods-mcp-http +47 -6
- data/lib/generators/woods/install_generator.rb +13 -4
- data/lib/generators/woods/templates/woods.rb.tt +155 -0
- data/lib/tasks/woods.rake +37 -51
- data/lib/woods/builder.rb +174 -9
- data/lib/woods/cache/cache_middleware.rb +360 -31
- data/lib/woods/chunking/semantic_chunker.rb +334 -7
- data/lib/woods/console/adapters/job_adapter.rb +10 -4
- data/lib/woods/console/audit_logger.rb +76 -4
- data/lib/woods/console/bridge.rb +48 -15
- data/lib/woods/console/bridge_protocol.rb +44 -0
- data/lib/woods/console/confirmation.rb +3 -4
- data/lib/woods/console/console_response_renderer.rb +56 -18
- data/lib/woods/console/credential_index.rb +201 -0
- data/lib/woods/console/credential_scanner.rb +302 -0
- data/lib/woods/console/dispatch_pipeline.rb +138 -0
- data/lib/woods/console/embedded_executor.rb +682 -35
- data/lib/woods/console/eval_guard.rb +319 -0
- data/lib/woods/console/model_validator.rb +1 -3
- data/lib/woods/console/rack_middleware.rb +185 -29
- data/lib/woods/console/redactor.rb +161 -0
- data/lib/woods/console/response_context.rb +127 -0
- data/lib/woods/console/safe_context.rb +220 -23
- data/lib/woods/console/scope_predicate_parser.rb +131 -0
- data/lib/woods/console/server.rb +417 -486
- data/lib/woods/console/sql_noise_stripper.rb +87 -0
- data/lib/woods/console/sql_table_scanner.rb +213 -0
- data/lib/woods/console/sql_validator.rb +81 -31
- data/lib/woods/console/table_gate.rb +93 -0
- data/lib/woods/console/tool_specs.rb +552 -0
- data/lib/woods/console/tools/tier1.rb +3 -3
- data/lib/woods/console/tools/tier4.rb +7 -1
- data/lib/woods/dependency_graph.rb +66 -7
- data/lib/woods/embedding/indexer.rb +190 -6
- data/lib/woods/embedding/openai.rb +40 -4
- data/lib/woods/embedding/provider.rb +104 -8
- data/lib/woods/embedding/text_preparer.rb +23 -3
- data/lib/woods/embedding/token_counter.rb +133 -0
- data/lib/woods/evaluation/baseline_runner.rb +20 -2
- data/lib/woods/evaluation/metrics.rb +4 -1
- data/lib/woods/extracted_unit.rb +1 -0
- data/lib/woods/extractor.rb +7 -1
- data/lib/woods/extractors/controller_extractor.rb +10 -4
- data/lib/woods/extractors/mailer_extractor.rb +16 -2
- data/lib/woods/extractors/model_extractor.rb +6 -1
- data/lib/woods/extractors/phlex_extractor.rb +13 -4
- data/lib/woods/extractors/rails_source_extractor.rb +2 -0
- data/lib/woods/extractors/route_helper_resolver.rb +130 -0
- data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
- data/lib/woods/extractors/view_component_extractor.rb +12 -1
- data/lib/woods/extractors/view_engines/base.rb +141 -0
- data/lib/woods/extractors/view_engines/erb.rb +145 -0
- data/lib/woods/extractors/view_template_extractor.rb +92 -133
- data/lib/woods/flow_assembler.rb +23 -15
- data/lib/woods/flow_precomputer.rb +21 -2
- data/lib/woods/graph_analyzer.rb +3 -4
- data/lib/woods/index_artifact.rb +173 -0
- data/lib/woods/mcp/bearer_auth.rb +45 -0
- data/lib/woods/mcp/bootstrap_state.rb +94 -0
- data/lib/woods/mcp/bootstrapper.rb +337 -16
- data/lib/woods/mcp/config_resolver.rb +288 -0
- data/lib/woods/mcp/errors.rb +134 -0
- data/lib/woods/mcp/index_reader.rb +265 -30
- data/lib/woods/mcp/origin_guard.rb +132 -0
- data/lib/woods/mcp/provider_probe.rb +166 -0
- data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
- data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
- data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
- data/lib/woods/mcp/server.rb +737 -137
- data/lib/woods/model_name_cache.rb +78 -2
- data/lib/woods/notion/client.rb +25 -2
- data/lib/woods/notion/mappers/model_mapper.rb +36 -2
- data/lib/woods/railtie.rb +55 -15
- data/lib/woods/resilience/circuit_breaker.rb +9 -2
- data/lib/woods/resilience/retryable_provider.rb +40 -3
- data/lib/woods/resolved_config.rb +299 -0
- data/lib/woods/retrieval/context_assembler.rb +112 -5
- data/lib/woods/retrieval/query_classifier.rb +1 -1
- data/lib/woods/retrieval/ranker.rb +55 -6
- data/lib/woods/retrieval/search_executor.rb +42 -13
- data/lib/woods/retriever.rb +330 -24
- data/lib/woods/session_tracer/middleware.rb +35 -1
- data/lib/woods/storage/graph_store.rb +39 -0
- data/lib/woods/storage/inapplicable_backend.rb +14 -0
- data/lib/woods/storage/metadata_store.rb +129 -1
- data/lib/woods/storage/pgvector.rb +70 -8
- data/lib/woods/storage/qdrant.rb +196 -5
- data/lib/woods/storage/snapshotter/metadata.rb +172 -0
- data/lib/woods/storage/snapshotter/vector.rb +238 -0
- data/lib/woods/storage/snapshotter.rb +24 -0
- data/lib/woods/storage/vector_store.rb +184 -35
- data/lib/woods/tasks.rb +85 -0
- data/lib/woods/temporal/snapshot_store.rb +49 -1
- data/lib/woods/token_utils.rb +44 -5
- data/lib/woods/unblocked/client.rb +88 -7
- data/lib/woods/unblocked/document_builder.rb +75 -36
- data/lib/woods/unblocked/exporter.rb +234 -18
- data/lib/woods/unblocked/rate_limiter.rb +10 -2
- data/lib/woods/unblocked/sync_manifest.rb +135 -0
- data/lib/woods/util/host_guard.rb +61 -0
- data/lib/woods/version.rb +1 -1
- data/lib/woods.rb +126 -6
- metadata +70 -4
|
@@ -10,7 +10,7 @@ module Woods
|
|
|
10
10
|
# side effects, and structural complexity.
|
|
11
11
|
#
|
|
12
12
|
# @example
|
|
13
|
-
# builder = DocumentBuilder.new(repo_url: "https://github.com/
|
|
13
|
+
# builder = DocumentBuilder.new(repo_url: "https://github.com/acme/myapp")
|
|
14
14
|
# doc = builder.build(unit_data)
|
|
15
15
|
# # => { title: "Order (model)", body: "# Order (model)\n...", uri: "https://..." }
|
|
16
16
|
#
|
|
@@ -27,34 +27,66 @@ module Woods
|
|
|
27
27
|
def build(unit_data)
|
|
28
28
|
type = unit_data['type']
|
|
29
29
|
identifier = unit_data['identifier']
|
|
30
|
-
file_path = unit_data['file_path']
|
|
31
30
|
|
|
32
31
|
{
|
|
33
32
|
title: "#{identifier} (#{type})",
|
|
34
33
|
body: build_body(unit_data),
|
|
35
|
-
uri:
|
|
34
|
+
uri: uri_for(unit_data)
|
|
36
35
|
}
|
|
37
36
|
end
|
|
38
37
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
38
|
+
# The citation URI for a unit (GitHub blob URL, or the repo root when the
|
|
39
|
+
# unit has no file_path). Public so callers can compute a unit's URI
|
|
40
|
+
# cheaply — e.g. to build the set of currently-existing URIs — without
|
|
41
|
+
# building the full document body.
|
|
42
|
+
#
|
|
43
|
+
# @param unit_data [Hash] Parsed unit JSON (needs 'file_path')
|
|
44
|
+
# @return [String] Citation URI
|
|
45
|
+
def uri_for(unit_data)
|
|
46
|
+
file_path = unit_data['file_path']
|
|
42
47
|
return @repo_url unless file_path
|
|
43
48
|
|
|
44
49
|
"#{@repo_url}/blob/main/#{file_path}"
|
|
45
50
|
end
|
|
46
51
|
|
|
52
|
+
private
|
|
53
|
+
|
|
47
54
|
def build_body(unit_data)
|
|
48
55
|
type = unit_data['type']
|
|
49
|
-
case type
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
56
|
+
body = case type
|
|
57
|
+
when 'model' then build_model_body(unit_data)
|
|
58
|
+
when 'controller' then build_controller_body(unit_data)
|
|
59
|
+
when 'service', 'job', 'mailer', 'manager', 'decorator', 'concern'
|
|
60
|
+
build_generic_body(unit_data)
|
|
61
|
+
when 'graphql', 'graphql_type', 'graphql_mutation', 'graphql_resolver', 'graphql_query'
|
|
62
|
+
build_graphql_body(unit_data)
|
|
63
|
+
else build_generic_body(unit_data)
|
|
64
|
+
end
|
|
65
|
+
# Defensive credential scrub — current builders only emit structured
|
|
66
|
+
# metadata, but if a future formatter adds source_code or comments
|
|
67
|
+
# (mirroring Notion's `ModelMapper#extract_description`) the scrub
|
|
68
|
+
# keeps credential material from reaching Unblocked.
|
|
69
|
+
redact_credentials(body)
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
# Run the assembled body through CredentialScanner. Fails closed (empty
|
|
73
|
+
# body) if the scanner raises, so a shipping failure never leaks
|
|
74
|
+
# unredacted content.
|
|
75
|
+
#
|
|
76
|
+
# @param body [String]
|
|
77
|
+
# @return [String]
|
|
78
|
+
def redact_credentials(body)
|
|
79
|
+
return body if body.nil? || body.empty?
|
|
80
|
+
|
|
81
|
+
require 'woods/console/credential_scanner'
|
|
82
|
+
redacted, _counts = credential_scanner.scan(body)
|
|
83
|
+
redacted
|
|
84
|
+
rescue StandardError
|
|
85
|
+
''
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
def credential_scanner
|
|
89
|
+
@credential_scanner ||= Woods::Console::CredentialScanner.new
|
|
58
90
|
end
|
|
59
91
|
|
|
60
92
|
# ── Model formatting ─────────────────────────────────────────────
|
|
@@ -100,7 +132,9 @@ module Woods
|
|
|
100
132
|
dep = a.dig('options', 'dependent')
|
|
101
133
|
dep ? "#{name} (#{dep})" : name
|
|
102
134
|
end
|
|
103
|
-
|
|
135
|
+
# Sorted so the body is a function of association content, not order
|
|
136
|
+
# (the exporter hashes this body to detect changes).
|
|
137
|
+
lines << "**#{type}:** #{targets.sort.join(', ')}"
|
|
104
138
|
end
|
|
105
139
|
|
|
106
140
|
lines.join("\n")
|
|
@@ -111,7 +145,7 @@ module Woods
|
|
|
111
145
|
return nil if deps.empty?
|
|
112
146
|
|
|
113
147
|
grouped = deps.group_by { |d| d['type'] }
|
|
114
|
-
summary_parts = grouped.map { |type, items| "#{items.size} #{type}s" }
|
|
148
|
+
summary_parts = grouped.sort_by { |type, _| type.to_s }.map { |type, items| "#{items.size} #{type}s" }
|
|
115
149
|
|
|
116
150
|
lines = ["## Dependents (#{deps.size} units)"]
|
|
117
151
|
lines << summary_parts.join(', ')
|
|
@@ -135,9 +169,9 @@ module Woods
|
|
|
135
169
|
return nil if controllers.empty? && graphql.empty?
|
|
136
170
|
|
|
137
171
|
lines = ['## Entry Points']
|
|
138
|
-
lines << "**Controllers:** #{controllers.map { |c| c['identifier'] }.join(', ')}" if controllers.any?
|
|
139
|
-
lines << "**GraphQL:** #{graphql.map { |g| g['identifier'] }.join(', ')}" if graphql.any?
|
|
140
|
-
lines << "**Jobs:** #{jobs.map { |j| j['identifier'] }.join(', ')}" if jobs.any?
|
|
172
|
+
lines << "**Controllers:** #{controllers.map { |c| c['identifier'] }.sort.join(', ')}" if controllers.any?
|
|
173
|
+
lines << "**GraphQL:** #{graphql.map { |g| g['identifier'] }.sort.join(', ')}" if graphql.any?
|
|
174
|
+
lines << "**Jobs:** #{jobs.map { |j| j['identifier'] }.sort.join(', ')}" if jobs.any?
|
|
141
175
|
|
|
142
176
|
lines.join("\n")
|
|
143
177
|
end
|
|
@@ -147,15 +181,16 @@ module Woods
|
|
|
147
181
|
|
|
148
182
|
enums = meta['enums']
|
|
149
183
|
if enums.is_a?(Hash) && enums.any?
|
|
150
|
-
enum_strs = enums.
|
|
184
|
+
enum_strs = enums.sort_by { |name, _| name.to_s }
|
|
185
|
+
.map { |name, values| "#{name} (#{format_enum_values(values)})" }
|
|
151
186
|
parts << "**Enums:** #{enum_strs.join('; ')}"
|
|
152
187
|
end
|
|
153
188
|
|
|
154
189
|
scopes = meta['scopes']
|
|
155
|
-
parts << "**Scopes:** #{scopes.map { |s| s['name'] }.join(', ')}" if scopes.is_a?(Array) && scopes.any?
|
|
190
|
+
parts << "**Scopes:** #{scopes.map { |s| s['name'] }.sort.join(', ')}" if scopes.is_a?(Array) && scopes.any?
|
|
156
191
|
|
|
157
192
|
concerns = meta['inlined_concerns']
|
|
158
|
-
parts << "**Concerns:** #{concerns.join(', ')}" if concerns.is_a?(Array) && concerns.any?
|
|
193
|
+
parts << "**Concerns:** #{concerns.sort.join(', ')}" if concerns.is_a?(Array) && concerns.any?
|
|
159
194
|
|
|
160
195
|
callbacks = meta['callbacks']
|
|
161
196
|
if callbacks.is_a?(Array) && callbacks.any?
|
|
@@ -175,8 +210,8 @@ module Woods
|
|
|
175
210
|
return nil if jobs.empty? && mailers.empty?
|
|
176
211
|
|
|
177
212
|
lines = ['## Side Effects']
|
|
178
|
-
lines << "**Jobs:** #{jobs.map { |j| j['identifier'] }.join(', ')}" if jobs.any?
|
|
179
|
-
lines << "**Mailers:** #{mailers.map { |m| m['identifier'] }.join(', ')}" if mailers.any?
|
|
213
|
+
lines << "**Jobs:** #{jobs.map { |j| j['identifier'] }.sort.join(', ')}" if jobs.any?
|
|
214
|
+
lines << "**Mailers:** #{mailers.map { |m| m['identifier'] }.sort.join(', ')}" if mailers.any?
|
|
180
215
|
|
|
181
216
|
lines.join("\n")
|
|
182
217
|
end
|
|
@@ -204,18 +239,21 @@ module Woods
|
|
|
204
239
|
routes = meta['routes']
|
|
205
240
|
return nil unless routes.is_a?(Hash) && routes.any?
|
|
206
241
|
|
|
207
|
-
|
|
242
|
+
route_lines = []
|
|
208
243
|
routes.each do |action, route_list|
|
|
209
244
|
next unless route_list.is_a?(Array)
|
|
210
245
|
|
|
211
246
|
route_list.each do |route|
|
|
212
247
|
next unless route.is_a?(Hash)
|
|
213
248
|
|
|
214
|
-
|
|
249
|
+
route_lines << "- `#{route['verb']} #{route['path']}` (#{action})"
|
|
215
250
|
end
|
|
216
251
|
end
|
|
217
252
|
|
|
218
|
-
|
|
253
|
+
# Sort before truncating so the kept subset is stable across runs.
|
|
254
|
+
return nil if route_lines.empty?
|
|
255
|
+
|
|
256
|
+
(['## Routes'] + route_lines.sort.first(20)).join("\n")
|
|
219
257
|
end
|
|
220
258
|
|
|
221
259
|
def controller_dependencies(unit)
|
|
@@ -225,7 +263,7 @@ module Woods
|
|
|
225
263
|
models = deps.select { |d| d['type'] == 'model' }.map { |d| d['target'] }
|
|
226
264
|
return nil if models.empty?
|
|
227
265
|
|
|
228
|
-
"## Dependencies\n**Models:** #{models.join(', ')}"
|
|
266
|
+
"## Dependencies\n**Models:** #{models.sort.join(', ')}"
|
|
229
267
|
end
|
|
230
268
|
|
|
231
269
|
def controller_dependents(unit)
|
|
@@ -233,7 +271,7 @@ module Woods
|
|
|
233
271
|
views = deps.select { |d| d['type'] == 'view_template' }
|
|
234
272
|
return nil if views.empty?
|
|
235
273
|
|
|
236
|
-
"## Views\n#{views.map { |v| "- `#{v['identifier']}`" }.first(10).join("\n")}"
|
|
274
|
+
"## Views\n#{views.map { |v| "- `#{v['identifier']}`" }.sort.first(10).join("\n")}"
|
|
237
275
|
end
|
|
238
276
|
|
|
239
277
|
# ── GraphQL formatting ───────────────────────────────────────────
|
|
@@ -246,7 +284,7 @@ module Woods
|
|
|
246
284
|
|
|
247
285
|
deps = unit['dependencies'] || []
|
|
248
286
|
models = deps.select { |d| d['type'] == 'model' }.map { |d| d['target'] }
|
|
249
|
-
sections << "**Models:** #{models.join(', ')}" if models.any?
|
|
287
|
+
sections << "**Models:** #{models.sort.join(', ')}" if models.any?
|
|
250
288
|
|
|
251
289
|
dependents = unit['dependents'] || []
|
|
252
290
|
sections << "**Referenced by:** #{dependents.size} units" if dependents.any?
|
|
@@ -267,14 +305,15 @@ module Woods
|
|
|
267
305
|
deps = unit['dependencies'] || []
|
|
268
306
|
if deps.any?
|
|
269
307
|
by_type = deps.group_by { |d| d['type'] }
|
|
270
|
-
dep_parts = by_type.
|
|
308
|
+
dep_parts = by_type.sort_by { |type, _| type.to_s }
|
|
309
|
+
.map { |type, items| "#{type}: #{items.map { |d| d['target'] }.sort.join(', ')}" }
|
|
271
310
|
sections << "## Dependencies\n#{dep_parts.join("\n")}"
|
|
272
311
|
end
|
|
273
312
|
|
|
274
313
|
dependents = unit['dependents'] || []
|
|
275
314
|
if dependents.any?
|
|
276
315
|
grouped = dependents.group_by { |d| d['type'] }
|
|
277
|
-
summary = grouped.map { |type, items| "#{items.size} #{type}s" }
|
|
316
|
+
summary = grouped.sort_by { |type, _| type.to_s }.map { |type, items| "#{items.size} #{type}s" }
|
|
278
317
|
sections << "## Dependents (#{dependents.size})\n#{summary.join(', ')}"
|
|
279
318
|
end
|
|
280
319
|
|
|
@@ -292,9 +331,9 @@ module Woods
|
|
|
292
331
|
end
|
|
293
332
|
|
|
294
333
|
def format_callbacks(callbacks)
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
334
|
+
# Sort before truncating so both the selection and order are stable
|
|
335
|
+
# regardless of input order (the body is hashed for change detection).
|
|
336
|
+
callbacks.map { |cb| "#{cb['type']}: #{cb['filter']}" }.sort.first(5).join(', ')
|
|
298
337
|
end
|
|
299
338
|
end
|
|
300
339
|
end
|
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'set'
|
|
4
|
+
require 'digest'
|
|
3
5
|
require 'woods'
|
|
4
6
|
require_relative 'client'
|
|
5
7
|
require_relative 'rate_limiter'
|
|
6
8
|
require_relative 'document_builder'
|
|
9
|
+
require_relative 'sync_manifest'
|
|
7
10
|
|
|
8
11
|
module Woods
|
|
9
12
|
module Unblocked
|
|
@@ -11,16 +14,27 @@ module Woods
|
|
|
11
14
|
#
|
|
12
15
|
# Reads extraction output from disk via IndexReader, converts units to
|
|
13
16
|
# condensed Markdown documents, and pushes via the Unblocked Documents API.
|
|
14
|
-
#
|
|
17
|
+
# Syncs are incremental: a {SyncManifest} records the content hash and
|
|
18
|
+
# remote document_id of everything last pushed, so each run only PUTs
|
|
19
|
+
# new/changed documents, skips unchanged ones, and deletes documents whose
|
|
20
|
+
# source unit has disappeared. Documents are upserted by URI, so a missing
|
|
21
|
+
# manifest (first run / CI cache miss) degrades to a correct full sync.
|
|
15
22
|
#
|
|
16
23
|
# @example
|
|
17
24
|
# exporter = Exporter.new(index_dir: "tmp/woods")
|
|
18
25
|
# stats = exporter.sync_all
|
|
19
|
-
# # => { synced:
|
|
26
|
+
# # => { synced: 12, skipped: 928, deleted: 1, errors: [] }
|
|
20
27
|
#
|
|
21
28
|
class Exporter
|
|
22
29
|
MAX_ERRORS = 100
|
|
23
30
|
|
|
31
|
+
# Mass-deletion guard: refuse to purge when more than this fraction of a
|
|
32
|
+
# manifest of at least PURGE_GUARD_MIN_DOCS entries would be deleted —
|
|
33
|
+
# the signature of a sync run against a partial index. Override with
|
|
34
|
+
# force_purge.
|
|
35
|
+
PURGE_GUARD_FRACTION = 0.30
|
|
36
|
+
PURGE_GUARD_MIN_DOCS = 10
|
|
37
|
+
|
|
24
38
|
# Unit types to sync, in priority order.
|
|
25
39
|
# All units are synced for these types.
|
|
26
40
|
FULL_SYNC_TYPES = %w[
|
|
@@ -39,9 +53,13 @@ module Woods
|
|
|
39
53
|
# @param config [Configuration] Woods configuration (default: global config)
|
|
40
54
|
# @param client [Client, nil] Unblocked API client (auto-created from config if nil)
|
|
41
55
|
# @param reader [Object, nil] IndexReader instance (auto-created if nil)
|
|
56
|
+
# @param manifest [SyncManifest, nil] Sync manifest (auto-created under index_dir if nil)
|
|
57
|
+
# @param force_full [Boolean] Re-push every unit, ignoring the unchanged check
|
|
58
|
+
# @param force_purge [Boolean] Bypass the mass-deletion guard
|
|
42
59
|
# @param output [IO] Progress output stream (default: $stdout)
|
|
43
60
|
# @raise [ConfigurationError] if required config is missing
|
|
44
|
-
def initialize(index_dir:, config: Woods.configuration, client: nil, reader: nil,
|
|
61
|
+
def initialize(index_dir:, config: Woods.configuration, client: nil, reader: nil,
|
|
62
|
+
manifest: nil, force_full: false, force_purge: false, output: $stdout)
|
|
45
63
|
@collection_id = config.unblocked_collection_id
|
|
46
64
|
raise ConfigurationError, 'unblocked_collection_id is required' unless @collection_id
|
|
47
65
|
|
|
@@ -51,24 +69,37 @@ module Woods
|
|
|
51
69
|
api_token = config.unblocked_api_token
|
|
52
70
|
raise ConfigurationError, 'unblocked_api_token is required' unless api_token
|
|
53
71
|
|
|
54
|
-
budget = ENV.fetch('UNBLOCKED_DAILY_BUDGET', RateLimiter::DEFAULT_BUDGET).to_i
|
|
72
|
+
budget = ENV.fetch('UNBLOCKED_DAILY_BUDGET', RateLimiter::DEFAULT_BUDGET.to_s).to_i
|
|
55
73
|
limiter = RateLimiter.new(daily_budget: budget)
|
|
56
74
|
|
|
57
75
|
@client = client || Client.new(api_token: api_token, rate_limiter: limiter)
|
|
58
76
|
@reader = reader || build_reader(index_dir)
|
|
59
77
|
@builder = DocumentBuilder.new(repo_url: repo_url)
|
|
78
|
+
@manifest = manifest || build_manifest(index_dir)
|
|
79
|
+
@force_full = force_full
|
|
80
|
+
@force_purge = force_purge
|
|
60
81
|
@output = output
|
|
82
|
+
# Initialized here as well as in sync_all so the public sync_type /
|
|
83
|
+
# sync_type_partial methods work standalone (track_uri needs them).
|
|
84
|
+
@current_uris = Set.new
|
|
85
|
+
@budget_exhausted = false
|
|
61
86
|
end
|
|
62
87
|
|
|
63
88
|
# Sync all configured unit types to the Unblocked collection.
|
|
64
89
|
#
|
|
65
|
-
# @return [Hash] { synced
|
|
90
|
+
# @return [Hash] { synced:, skipped:, deleted:, errors: }
|
|
66
91
|
def sync_all
|
|
92
|
+
@current_uris = Set.new
|
|
93
|
+
@budget_exhausted = false
|
|
94
|
+
reconcile_from_remote if @manifest.empty?
|
|
95
|
+
|
|
67
96
|
synced = 0
|
|
68
97
|
skipped = 0
|
|
69
98
|
errors = []
|
|
70
99
|
|
|
71
100
|
FULL_SYNC_TYPES.each do |type|
|
|
101
|
+
break if @budget_exhausted
|
|
102
|
+
|
|
72
103
|
result = sync_type(type)
|
|
73
104
|
synced += result[:synced]
|
|
74
105
|
skipped += result[:skipped]
|
|
@@ -76,19 +107,24 @@ module Woods
|
|
|
76
107
|
end
|
|
77
108
|
|
|
78
109
|
PARTIAL_SYNC_TYPES.each do |type, max_count|
|
|
110
|
+
break if @budget_exhausted
|
|
111
|
+
|
|
79
112
|
result = sync_type_partial(type, max_count)
|
|
80
113
|
synced += result[:synced]
|
|
81
114
|
skipped += result[:skipped]
|
|
82
115
|
errors.concat(result[:errors])
|
|
83
116
|
end
|
|
84
117
|
|
|
85
|
-
|
|
118
|
+
deleted = @budget_exhausted ? 0 : purge_stale(errors)
|
|
119
|
+
{ synced: synced, skipped: skipped, deleted: deleted, errors: cap_errors(errors) }
|
|
120
|
+
ensure
|
|
121
|
+
save_manifest
|
|
86
122
|
end
|
|
87
123
|
|
|
88
124
|
# Sync all units of a given type.
|
|
89
125
|
#
|
|
90
126
|
# @param type [String] Unit type (e.g. "model", "controller")
|
|
91
|
-
# @return [Hash] { synced
|
|
127
|
+
# @return [Hash] { synced:, skipped:, errors: }
|
|
92
128
|
def sync_type(type)
|
|
93
129
|
units = @reader.list_units(type: type)
|
|
94
130
|
log " #{type}: #{units.size} units"
|
|
@@ -100,7 +136,7 @@ module Woods
|
|
|
100
136
|
#
|
|
101
137
|
# @param type [String] Unit type
|
|
102
138
|
# @param max_count [Integer] Maximum units to sync
|
|
103
|
-
# @return [Hash] { synced
|
|
139
|
+
# @return [Hash] { synced:, skipped:, errors: }
|
|
104
140
|
def sync_type_partial(type, max_count)
|
|
105
141
|
units = @reader.list_units(type: type)
|
|
106
142
|
return empty_stats if units.empty?
|
|
@@ -114,8 +150,14 @@ module Woods
|
|
|
114
150
|
{ entry: entry, data: data, dep_count: dep_count }
|
|
115
151
|
end
|
|
116
152
|
|
|
153
|
+
# Every unit of this type still exists — track its URI so partial units
|
|
154
|
+
# that fall *out* of the top-N are never mistaken for deletions.
|
|
155
|
+
units_with_data.each { |u| track_uri(u[:data]) }
|
|
156
|
+
|
|
117
157
|
top_units = units_with_data.sort_by { |u| -u[:dep_count] }.first(max_count)
|
|
118
|
-
|
|
158
|
+
# Count against what was actually synced — units.size includes entries
|
|
159
|
+
# whose unit data was missing (dropped by the filter_map above).
|
|
160
|
+
skipped_count = units.size - top_units.size
|
|
119
161
|
|
|
120
162
|
log " #{type}: #{top_units.size}/#{units.size} units (top by dependents)"
|
|
121
163
|
|
|
@@ -138,13 +180,19 @@ module Woods
|
|
|
138
180
|
next
|
|
139
181
|
end
|
|
140
182
|
|
|
141
|
-
|
|
142
|
-
|
|
183
|
+
track_uri(unit_data)
|
|
184
|
+
if push_document(unit_data) == :skipped
|
|
185
|
+
skipped += 1
|
|
186
|
+
else
|
|
187
|
+
synced += 1
|
|
188
|
+
end
|
|
143
189
|
rescue Woods::Error => e
|
|
144
190
|
errors << "#{entry['identifier']}: #{e.message}"
|
|
145
|
-
break if e
|
|
191
|
+
break if note_budget_exhaustion(e)
|
|
146
192
|
rescue StandardError => e
|
|
147
|
-
|
|
193
|
+
# Include the class — "undefined method for nil" without it is
|
|
194
|
+
# unactionable in CI logs.
|
|
195
|
+
errors << "#{entry['identifier']}: #{e.class}: #{e.message}"
|
|
148
196
|
end
|
|
149
197
|
|
|
150
198
|
{ synced: synced, skipped: skipped, errors: errors }
|
|
@@ -156,26 +204,177 @@ module Woods
|
|
|
156
204
|
errors = []
|
|
157
205
|
|
|
158
206
|
entries_with_data.each do |entry, unit_data|
|
|
159
|
-
|
|
160
|
-
|
|
207
|
+
track_uri(unit_data)
|
|
208
|
+
if push_document(unit_data) == :skipped
|
|
209
|
+
skipped += 1
|
|
210
|
+
else
|
|
211
|
+
synced += 1
|
|
212
|
+
end
|
|
161
213
|
rescue Woods::Error => e
|
|
162
214
|
errors << "#{entry['identifier']}: #{e.message}"
|
|
163
|
-
break if e
|
|
215
|
+
break if note_budget_exhaustion(e)
|
|
164
216
|
rescue StandardError => e
|
|
165
|
-
|
|
217
|
+
# Include the class — "undefined method for nil" without it is
|
|
218
|
+
# unactionable in CI logs.
|
|
219
|
+
errors << "#{entry['identifier']}: #{e.class}: #{e.message}"
|
|
166
220
|
end
|
|
167
221
|
|
|
168
222
|
{ synced: synced, skipped: skipped, errors: errors }
|
|
169
223
|
end
|
|
170
224
|
|
|
225
|
+
# Build the document, skip it if the manifest says it is unchanged,
|
|
226
|
+
# otherwise upsert it and record the new hash + remote document_id.
|
|
227
|
+
#
|
|
228
|
+
# @return [Symbol] :synced or :skipped
|
|
171
229
|
def push_document(unit_data)
|
|
230
|
+
# No file_path → the URI falls back to the bare repo URL, which every
|
|
231
|
+
# such unit would share: they'd overwrite each other remotely and
|
|
232
|
+
# ping-pong the manifest hash forever. Skip them.
|
|
233
|
+
return :skipped unless unit_data['file_path']
|
|
234
|
+
|
|
172
235
|
doc = @builder.build(unit_data)
|
|
173
|
-
|
|
236
|
+
# An empty body means the credential scrub failed closed (the builders
|
|
237
|
+
# always emit at least a header). Upserting it would overwrite a good
|
|
238
|
+
# remote document with nothing — error out and leave the remote as-is.
|
|
239
|
+
if doc[:body].nil? || doc[:body].empty?
|
|
240
|
+
raise Woods::ExtractionError, 'document body empty (credential scrub failure?) — push skipped'
|
|
241
|
+
end
|
|
242
|
+
|
|
243
|
+
hash = fingerprint(doc)
|
|
244
|
+
return :skipped if !@force_full && @manifest.unchanged?(doc[:uri], hash)
|
|
245
|
+
|
|
246
|
+
response = @client.put_document(
|
|
174
247
|
collection_id: @collection_id,
|
|
175
248
|
title: doc[:title],
|
|
176
249
|
body: doc[:body],
|
|
177
250
|
uri: doc[:uri]
|
|
178
251
|
)
|
|
252
|
+
document_id = (response['id'] if response.is_a?(Hash)) || @manifest.document_id_for(doc[:uri])
|
|
253
|
+
@manifest.record(uri: doc[:uri], hash: hash, document_id: document_id)
|
|
254
|
+
:synced
|
|
255
|
+
end
|
|
256
|
+
|
|
257
|
+
# Delete remote documents whose source unit no longer exists. Failures
|
|
258
|
+
# are appended to +errors+ — a delete that fails silently every run is
|
|
259
|
+
# how a collection rots while "deleted: 0" looks normal.
|
|
260
|
+
#
|
|
261
|
+
# @param errors [Array<String>] sink for delete failures
|
|
262
|
+
# @return [Integer] number of documents deleted
|
|
263
|
+
def purge_stale(errors)
|
|
264
|
+
stale = @manifest.stale_uris(@current_uris)
|
|
265
|
+
return 0 if stale.empty?
|
|
266
|
+
return 0 if guard_blocks_purge?(stale)
|
|
267
|
+
|
|
268
|
+
resolve_missing_document_ids(stale)
|
|
269
|
+
|
|
270
|
+
deleted = 0
|
|
271
|
+
stale.each do |uri|
|
|
272
|
+
document_id = @manifest.document_id_for(uri)
|
|
273
|
+
next unless document_id
|
|
274
|
+
|
|
275
|
+
@client.delete_document(document_id: document_id)
|
|
276
|
+
@manifest.forget(uri)
|
|
277
|
+
deleted += 1
|
|
278
|
+
rescue ApiError => e
|
|
279
|
+
if e.status == 404
|
|
280
|
+
# Already gone remotely — goal state reached, drop the entry
|
|
281
|
+
# rather than retrying every run.
|
|
282
|
+
@manifest.forget(uri)
|
|
283
|
+
else
|
|
284
|
+
errors << "delete #{uri}: #{e.message}"
|
|
285
|
+
end
|
|
286
|
+
rescue Woods::Error => e
|
|
287
|
+
break if note_budget_exhaustion(e)
|
|
288
|
+
|
|
289
|
+
errors << "delete #{uri}: #{e.message}"
|
|
290
|
+
rescue StandardError => e
|
|
291
|
+
# Entry stays in the manifest so a later run retries the delete —
|
|
292
|
+
# but surface the failure so systematic breakage is visible.
|
|
293
|
+
errors << "delete #{uri}: #{e.class}: #{e.message}"
|
|
294
|
+
end
|
|
295
|
+
deleted
|
|
296
|
+
end
|
|
297
|
+
|
|
298
|
+
# A manifest entry can carry a nil document_id (e.g. the PUT response
|
|
299
|
+
# body was empty). Those entries would be permanently undeletable, so
|
|
300
|
+
# before purging, make one bounded all_documents sweep to resolve ids.
|
|
301
|
+
# Best-effort: unresolved entries are simply skipped by the purge loop.
|
|
302
|
+
def resolve_missing_document_ids(stale)
|
|
303
|
+
missing = stale.select { |uri| @manifest.document_id_for(uri).nil? }
|
|
304
|
+
return if missing.empty?
|
|
305
|
+
|
|
306
|
+
ids_by_uri = @client.all_documents(collection_id: @collection_id)
|
|
307
|
+
.to_h { |doc| [doc['uri'], doc['id']] }
|
|
308
|
+
missing.each do |uri|
|
|
309
|
+
id = ids_by_uri[uri]
|
|
310
|
+
@manifest.record(uri: uri, hash: nil, document_id: id) if id
|
|
311
|
+
end
|
|
312
|
+
rescue StandardError => e
|
|
313
|
+
log " id resolution skipped (#{e.message})"
|
|
314
|
+
end
|
|
315
|
+
|
|
316
|
+
# True when purging +stale+ would delete too large a fraction of the
|
|
317
|
+
# manifest — the signature of running against a partial index. The floor
|
|
318
|
+
# (PURGE_GUARD_MIN_DOCS) keeps small collections deletable.
|
|
319
|
+
def guard_blocks_purge?(stale)
|
|
320
|
+
return false if @force_purge
|
|
321
|
+
|
|
322
|
+
size = @manifest.size
|
|
323
|
+
return false if size < PURGE_GUARD_MIN_DOCS
|
|
324
|
+
|
|
325
|
+
fraction = stale.size.to_f / size
|
|
326
|
+
return false unless fraction > PURGE_GUARD_FRACTION
|
|
327
|
+
|
|
328
|
+
log " WARNING: refusing to delete #{stale.size} of #{size} documents " \
|
|
329
|
+
"(#{(fraction * 100).round}% > #{(PURGE_GUARD_FRACTION * 100).to_i}% — likely a partial index). " \
|
|
330
|
+
'Set UNBLOCKED_FORCE_PURGE=1 to override.'
|
|
331
|
+
true
|
|
332
|
+
end
|
|
333
|
+
|
|
334
|
+
# Seed the manifest from the remote collection when we have no local
|
|
335
|
+
# state (first run / CI cache miss). The list endpoint returns no body,
|
|
336
|
+
# so hashes are nil (everything re-pushes), but recovering document_ids
|
|
337
|
+
# lets this run still purge orphaned documents.
|
|
338
|
+
#
|
|
339
|
+
# Auth failures re-raise: a 401/403 here dooms every subsequent call,
|
|
340
|
+
# and "proceeding with full sync" would burn the whole daily budget on
|
|
341
|
+
# guaranteed failures.
|
|
342
|
+
def reconcile_from_remote
|
|
343
|
+
@client.all_documents(collection_id: @collection_id).each do |doc|
|
|
344
|
+
uri = doc['uri']
|
|
345
|
+
next unless uri
|
|
346
|
+
|
|
347
|
+
@manifest.record(uri: uri, hash: nil, document_id: doc['id'])
|
|
348
|
+
end
|
|
349
|
+
rescue ApiError => e
|
|
350
|
+
raise if [401, 403].include?(e.status)
|
|
351
|
+
|
|
352
|
+
log " reconcile skipped (#{e.message}) — proceeding with full sync"
|
|
353
|
+
rescue StandardError => e
|
|
354
|
+
log " reconcile skipped (#{e.message}) — proceeding with full sync"
|
|
355
|
+
end
|
|
356
|
+
|
|
357
|
+
def track_uri(unit_data)
|
|
358
|
+
# Units without a file_path are never pushed (see push_document), so
|
|
359
|
+
# their fallback repo-root URI must not be marked current either — a
|
|
360
|
+
# stale repo-root document from before this guard should purge.
|
|
361
|
+
return unless unit_data['file_path']
|
|
362
|
+
|
|
363
|
+
@current_uris << @builder.uri_for(unit_data)
|
|
364
|
+
end
|
|
365
|
+
|
|
366
|
+
def fingerprint(doc)
|
|
367
|
+
Digest::SHA256.hexdigest("#{doc[:title]}\n#{doc[:body]}")
|
|
368
|
+
end
|
|
369
|
+
|
|
370
|
+
# Records whether an error was a budget-exhaustion stop. Returns true when
|
|
371
|
+
# it was, so callers can break out of their loop. Class check first; the
|
|
372
|
+
# message match remains as a fallback for injected clients that raise
|
|
373
|
+
# plain Woods::Error.
|
|
374
|
+
def note_budget_exhaustion(error)
|
|
375
|
+
return false unless error.is_a?(BudgetExhaustedError) || error.message.include?('daily budget exhausted')
|
|
376
|
+
|
|
377
|
+
@budget_exhausted = true
|
|
179
378
|
end
|
|
180
379
|
|
|
181
380
|
def build_reader(index_dir)
|
|
@@ -183,6 +382,23 @@ module Woods
|
|
|
183
382
|
Woods::MCP::IndexReader.new(index_dir)
|
|
184
383
|
end
|
|
185
384
|
|
|
385
|
+
# Persist the manifest, downgrading failures to a warning: losing the
|
|
386
|
+
# manifest only costs a full re-check next run, which must not turn an
|
|
387
|
+
# otherwise-successful sync into a crash (this runs from an ensure, where
|
|
388
|
+
# a raise would also mask any in-flight exception).
|
|
389
|
+
def save_manifest
|
|
390
|
+
@manifest.save
|
|
391
|
+
rescue StandardError => e
|
|
392
|
+
log " WARNING: sync manifest not persisted (#{e.message}) — next run will re-push all documents"
|
|
393
|
+
end
|
|
394
|
+
|
|
395
|
+
def build_manifest(index_dir)
|
|
396
|
+
SyncManifest.new(
|
|
397
|
+
path: File.join(index_dir, 'unblocked_sync_manifest.json'),
|
|
398
|
+
collection_id: @collection_id
|
|
399
|
+
)
|
|
400
|
+
end
|
|
401
|
+
|
|
186
402
|
def empty_stats
|
|
187
403
|
{ synced: 0, skipped: 0, errors: [] }
|
|
188
404
|
end
|
|
@@ -1,7 +1,15 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'woods'
|
|
4
|
+
|
|
3
5
|
module Woods
|
|
4
6
|
module Unblocked
|
|
7
|
+
# Raised when the daily API call budget is exhausted. Subclasses
|
|
8
|
+
# Woods::Error so existing +rescue Woods::Error+ sites keep working;
|
|
9
|
+
# callers that need to branch on exhaustion rescue this class instead of
|
|
10
|
+
# matching the message string.
|
|
11
|
+
class BudgetExhaustedError < Woods::Error; end
|
|
12
|
+
|
|
5
13
|
# Daily budget-based rate limiter for the Unblocked API (1000 calls/day).
|
|
6
14
|
#
|
|
7
15
|
# Unlike Notion's per-second throttling, Unblocked limits by daily call count.
|
|
@@ -35,13 +43,13 @@ module Woods
|
|
|
35
43
|
#
|
|
36
44
|
# @yield The API call to execute
|
|
37
45
|
# @return [Object] The block's return value
|
|
38
|
-
# @raise [
|
|
46
|
+
# @raise [BudgetExhaustedError] if daily budget is exhausted
|
|
39
47
|
def track
|
|
40
48
|
raise ArgumentError, 'block required' unless block_given?
|
|
41
49
|
|
|
42
50
|
@mutex.synchronize do
|
|
43
51
|
if @calls_today >= @daily_budget
|
|
44
|
-
raise
|
|
52
|
+
raise BudgetExhaustedError,
|
|
45
53
|
"Unblocked API daily budget exhausted (#{@daily_budget} calls). " \
|
|
46
54
|
'Budget resets at midnight PST. Use UNBLOCKED_DAILY_BUDGET to adjust.'
|
|
47
55
|
end
|