woods 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +229 -0
  3. data/README.md +24 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +37 -51
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +10 -4
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +3 -4
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +737 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +88 -7
  102. data/lib/woods/unblocked/document_builder.rb +75 -36
  103. data/lib/woods/unblocked/exporter.rb +234 -18
  104. data/lib/woods/unblocked/rate_limiter.rb +10 -2
  105. data/lib/woods/unblocked/sync_manifest.rb +135 -0
  106. data/lib/woods/util/host_guard.rb +61 -0
  107. data/lib/woods/version.rb +1 -1
  108. data/lib/woods.rb +126 -6
  109. metadata +70 -4
@@ -10,7 +10,7 @@ module Woods
10
10
  # side effects, and structural complexity.
11
11
  #
12
12
  # @example
13
- # builder = DocumentBuilder.new(repo_url: "https://github.com/bigcartel/admin")
13
+ # builder = DocumentBuilder.new(repo_url: "https://github.com/acme/myapp")
14
14
  # doc = builder.build(unit_data)
15
15
  # # => { title: "Order (model)", body: "# Order (model)\n...", uri: "https://..." }
16
16
  #
@@ -27,34 +27,66 @@ module Woods
27
27
  def build(unit_data)
28
28
  type = unit_data['type']
29
29
  identifier = unit_data['identifier']
30
- file_path = unit_data['file_path']
31
30
 
32
31
  {
33
32
  title: "#{identifier} (#{type})",
34
33
  body: build_body(unit_data),
35
- uri: build_uri(file_path)
34
+ uri: uri_for(unit_data)
36
35
  }
37
36
  end
38
37
 
39
- private
40
-
41
- def build_uri(file_path)
38
+ # The citation URI for a unit (GitHub blob URL, or the repo root when the
39
+ # unit has no file_path). Public so callers can compute a unit's URI
40
+ # cheaply — e.g. to build the set of currently-existing URIs — without
41
+ # building the full document body.
42
+ #
43
+ # @param unit_data [Hash] Parsed unit JSON (needs 'file_path')
44
+ # @return [String] Citation URI
45
+ def uri_for(unit_data)
46
+ file_path = unit_data['file_path']
42
47
  return @repo_url unless file_path
43
48
 
44
49
  "#{@repo_url}/blob/main/#{file_path}"
45
50
  end
46
51
 
52
+ private
53
+
47
54
  def build_body(unit_data)
48
55
  type = unit_data['type']
49
- case type
50
- when 'model' then build_model_body(unit_data)
51
- when 'controller' then build_controller_body(unit_data)
52
- when 'service', 'job', 'mailer', 'manager', 'decorator', 'concern'
53
- build_generic_body(unit_data)
54
- when 'graphql', 'graphql_type', 'graphql_mutation', 'graphql_resolver', 'graphql_query'
55
- build_graphql_body(unit_data)
56
- else build_generic_body(unit_data)
57
- end
56
+ body = case type
57
+ when 'model' then build_model_body(unit_data)
58
+ when 'controller' then build_controller_body(unit_data)
59
+ when 'service', 'job', 'mailer', 'manager', 'decorator', 'concern'
60
+ build_generic_body(unit_data)
61
+ when 'graphql', 'graphql_type', 'graphql_mutation', 'graphql_resolver', 'graphql_query'
62
+ build_graphql_body(unit_data)
63
+ else build_generic_body(unit_data)
64
+ end
65
+ # Defensive credential scrub — current builders only emit structured
66
+ # metadata, but if a future formatter adds source_code or comments
67
+ # (mirroring Notion's `ModelMapper#extract_description`) the scrub
68
+ # keeps credential material from reaching Unblocked.
69
+ redact_credentials(body)
70
+ end
71
+
72
+ # Run the assembled body through CredentialScanner. Fails closed (empty
73
+ # body) if the scanner raises, so a shipping failure never leaks
74
+ # unredacted content.
75
+ #
76
+ # @param body [String]
77
+ # @return [String]
78
+ def redact_credentials(body)
79
+ return body if body.nil? || body.empty?
80
+
81
+ require 'woods/console/credential_scanner'
82
+ redacted, _counts = credential_scanner.scan(body)
83
+ redacted
84
+ rescue StandardError
85
+ ''
86
+ end
87
+
88
+ def credential_scanner
89
+ @credential_scanner ||= Woods::Console::CredentialScanner.new
58
90
  end
59
91
 
60
92
  # ── Model formatting ─────────────────────────────────────────────
@@ -100,7 +132,9 @@ module Woods
100
132
  dep = a.dig('options', 'dependent')
101
133
  dep ? "#{name} (#{dep})" : name
102
134
  end
103
- lines << "**#{type}:** #{targets.join(', ')}"
135
+ # Sorted so the body is a function of association content, not order
136
+ # (the exporter hashes this body to detect changes).
137
+ lines << "**#{type}:** #{targets.sort.join(', ')}"
104
138
  end
105
139
 
106
140
  lines.join("\n")
@@ -111,7 +145,7 @@ module Woods
111
145
  return nil if deps.empty?
112
146
 
113
147
  grouped = deps.group_by { |d| d['type'] }
114
- summary_parts = grouped.map { |type, items| "#{items.size} #{type}s" }
148
+ summary_parts = grouped.sort_by { |type, _| type.to_s }.map { |type, items| "#{items.size} #{type}s" }
115
149
 
116
150
  lines = ["## Dependents (#{deps.size} units)"]
117
151
  lines << summary_parts.join(', ')
@@ -135,9 +169,9 @@ module Woods
135
169
  return nil if controllers.empty? && graphql.empty?
136
170
 
137
171
  lines = ['## Entry Points']
138
- lines << "**Controllers:** #{controllers.map { |c| c['identifier'] }.join(', ')}" if controllers.any?
139
- lines << "**GraphQL:** #{graphql.map { |g| g['identifier'] }.join(', ')}" if graphql.any?
140
- lines << "**Jobs:** #{jobs.map { |j| j['identifier'] }.join(', ')}" if jobs.any?
172
+ lines << "**Controllers:** #{controllers.map { |c| c['identifier'] }.sort.join(', ')}" if controllers.any?
173
+ lines << "**GraphQL:** #{graphql.map { |g| g['identifier'] }.sort.join(', ')}" if graphql.any?
174
+ lines << "**Jobs:** #{jobs.map { |j| j['identifier'] }.sort.join(', ')}" if jobs.any?
141
175
 
142
176
  lines.join("\n")
143
177
  end
@@ -147,15 +181,16 @@ module Woods
147
181
 
148
182
  enums = meta['enums']
149
183
  if enums.is_a?(Hash) && enums.any?
150
- enum_strs = enums.map { |name, values| "#{name} (#{format_enum_values(values)})" }
184
+ enum_strs = enums.sort_by { |name, _| name.to_s }
185
+ .map { |name, values| "#{name} (#{format_enum_values(values)})" }
151
186
  parts << "**Enums:** #{enum_strs.join('; ')}"
152
187
  end
153
188
 
154
189
  scopes = meta['scopes']
155
- parts << "**Scopes:** #{scopes.map { |s| s['name'] }.join(', ')}" if scopes.is_a?(Array) && scopes.any?
190
+ parts << "**Scopes:** #{scopes.map { |s| s['name'] }.sort.join(', ')}" if scopes.is_a?(Array) && scopes.any?
156
191
 
157
192
  concerns = meta['inlined_concerns']
158
- parts << "**Concerns:** #{concerns.join(', ')}" if concerns.is_a?(Array) && concerns.any?
193
+ parts << "**Concerns:** #{concerns.sort.join(', ')}" if concerns.is_a?(Array) && concerns.any?
159
194
 
160
195
  callbacks = meta['callbacks']
161
196
  if callbacks.is_a?(Array) && callbacks.any?
@@ -175,8 +210,8 @@ module Woods
175
210
  return nil if jobs.empty? && mailers.empty?
176
211
 
177
212
  lines = ['## Side Effects']
178
- lines << "**Jobs:** #{jobs.map { |j| j['identifier'] }.join(', ')}" if jobs.any?
179
- lines << "**Mailers:** #{mailers.map { |m| m['identifier'] }.join(', ')}" if mailers.any?
213
+ lines << "**Jobs:** #{jobs.map { |j| j['identifier'] }.sort.join(', ')}" if jobs.any?
214
+ lines << "**Mailers:** #{mailers.map { |m| m['identifier'] }.sort.join(', ')}" if mailers.any?
180
215
 
181
216
  lines.join("\n")
182
217
  end
@@ -204,18 +239,21 @@ module Woods
204
239
  routes = meta['routes']
205
240
  return nil unless routes.is_a?(Hash) && routes.any?
206
241
 
207
- lines = ['## Routes']
242
+ route_lines = []
208
243
  routes.each do |action, route_list|
209
244
  next unless route_list.is_a?(Array)
210
245
 
211
246
  route_list.each do |route|
212
247
  next unless route.is_a?(Hash)
213
248
 
214
- lines << "- `#{route['verb']} #{route['path']}` (#{action})"
249
+ route_lines << "- `#{route['verb']} #{route['path']}` (#{action})"
215
250
  end
216
251
  end
217
252
 
218
- lines.size > 1 ? lines.first(20).join("\n") : nil
253
+ # Sort before truncating so the kept subset is stable across runs.
254
+ return nil if route_lines.empty?
255
+
256
+ (['## Routes'] + route_lines.sort.first(20)).join("\n")
219
257
  end
220
258
 
221
259
  def controller_dependencies(unit)
@@ -225,7 +263,7 @@ module Woods
225
263
  models = deps.select { |d| d['type'] == 'model' }.map { |d| d['target'] }
226
264
  return nil if models.empty?
227
265
 
228
- "## Dependencies\n**Models:** #{models.join(', ')}"
266
+ "## Dependencies\n**Models:** #{models.sort.join(', ')}"
229
267
  end
230
268
 
231
269
  def controller_dependents(unit)
@@ -233,7 +271,7 @@ module Woods
233
271
  views = deps.select { |d| d['type'] == 'view_template' }
234
272
  return nil if views.empty?
235
273
 
236
- "## Views\n#{views.map { |v| "- `#{v['identifier']}`" }.first(10).join("\n")}"
274
+ "## Views\n#{views.map { |v| "- `#{v['identifier']}`" }.sort.first(10).join("\n")}"
237
275
  end
238
276
 
239
277
  # ── GraphQL formatting ───────────────────────────────────────────
@@ -246,7 +284,7 @@ module Woods
246
284
 
247
285
  deps = unit['dependencies'] || []
248
286
  models = deps.select { |d| d['type'] == 'model' }.map { |d| d['target'] }
249
- sections << "**Models:** #{models.join(', ')}" if models.any?
287
+ sections << "**Models:** #{models.sort.join(', ')}" if models.any?
250
288
 
251
289
  dependents = unit['dependents'] || []
252
290
  sections << "**Referenced by:** #{dependents.size} units" if dependents.any?
@@ -267,14 +305,15 @@ module Woods
267
305
  deps = unit['dependencies'] || []
268
306
  if deps.any?
269
307
  by_type = deps.group_by { |d| d['type'] }
270
- dep_parts = by_type.map { |type, items| "#{type}: #{items.map { |d| d['target'] }.join(', ')}" }
308
+ dep_parts = by_type.sort_by { |type, _| type.to_s }
309
+ .map { |type, items| "#{type}: #{items.map { |d| d['target'] }.sort.join(', ')}" }
271
310
  sections << "## Dependencies\n#{dep_parts.join("\n")}"
272
311
  end
273
312
 
274
313
  dependents = unit['dependents'] || []
275
314
  if dependents.any?
276
315
  grouped = dependents.group_by { |d| d['type'] }
277
- summary = grouped.map { |type, items| "#{items.size} #{type}s" }
316
+ summary = grouped.sort_by { |type, _| type.to_s }.map { |type, items| "#{items.size} #{type}s" }
278
317
  sections << "## Dependents (#{dependents.size})\n#{summary.join(', ')}"
279
318
  end
280
319
 
@@ -292,9 +331,9 @@ module Woods
292
331
  end
293
332
 
294
333
  def format_callbacks(callbacks)
295
- callbacks.first(5).map do |cb|
296
- "#{cb['type']}: #{cb['filter']}"
297
- end.join(', ')
334
+ # Sort before truncating so both the selection and order are stable
335
+ # regardless of input order (the body is hashed for change detection).
336
+ callbacks.map { |cb| "#{cb['type']}: #{cb['filter']}" }.sort.first(5).join(', ')
298
337
  end
299
338
  end
300
339
  end
@@ -1,9 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'set'
4
+ require 'digest'
3
5
  require 'woods'
4
6
  require_relative 'client'
5
7
  require_relative 'rate_limiter'
6
8
  require_relative 'document_builder'
9
+ require_relative 'sync_manifest'
7
10
 
8
11
  module Woods
9
12
  module Unblocked
@@ -11,16 +14,27 @@ module Woods
11
14
  #
12
15
  # Reads extraction output from disk via IndexReader, converts units to
13
16
  # condensed Markdown documents, and pushes via the Unblocked Documents API.
14
- # All syncs are idempotent documents are upserted by URI.
17
+ # Syncs are incremental: a {SyncManifest} records the content hash and
18
+ # remote document_id of everything last pushed, so each run only PUTs
19
+ # new/changed documents, skips unchanged ones, and deletes documents whose
20
+ # source unit has disappeared. Documents are upserted by URI, so a missing
21
+ # manifest (first run / CI cache miss) degrades to a correct full sync.
15
22
  #
16
23
  # @example
17
24
  # exporter = Exporter.new(index_dir: "tmp/woods")
18
25
  # stats = exporter.sync_all
19
- # # => { synced: 940, skipped: 5060, errors: [] }
26
+ # # => { synced: 12, skipped: 928, deleted: 1, errors: [] }
20
27
  #
21
28
  class Exporter
22
29
  MAX_ERRORS = 100
23
30
 
31
+ # Mass-deletion guard: refuse to purge when more than this fraction of a
32
+ # manifest of at least PURGE_GUARD_MIN_DOCS entries would be deleted —
33
+ # the signature of a sync run against a partial index. Override with
34
+ # force_purge.
35
+ PURGE_GUARD_FRACTION = 0.30
36
+ PURGE_GUARD_MIN_DOCS = 10
37
+
24
38
  # Unit types to sync, in priority order.
25
39
  # All units are synced for these types.
26
40
  FULL_SYNC_TYPES = %w[
@@ -39,9 +53,13 @@ module Woods
39
53
  # @param config [Configuration] Woods configuration (default: global config)
40
54
  # @param client [Client, nil] Unblocked API client (auto-created from config if nil)
41
55
  # @param reader [Object, nil] IndexReader instance (auto-created if nil)
56
+ # @param manifest [SyncManifest, nil] Sync manifest (auto-created under index_dir if nil)
57
+ # @param force_full [Boolean] Re-push every unit, ignoring the unchanged check
58
+ # @param force_purge [Boolean] Bypass the mass-deletion guard
42
59
  # @param output [IO] Progress output stream (default: $stdout)
43
60
  # @raise [ConfigurationError] if required config is missing
44
- def initialize(index_dir:, config: Woods.configuration, client: nil, reader: nil, output: $stdout)
61
+ def initialize(index_dir:, config: Woods.configuration, client: nil, reader: nil,
62
+ manifest: nil, force_full: false, force_purge: false, output: $stdout)
45
63
  @collection_id = config.unblocked_collection_id
46
64
  raise ConfigurationError, 'unblocked_collection_id is required' unless @collection_id
47
65
 
@@ -51,24 +69,37 @@ module Woods
51
69
  api_token = config.unblocked_api_token
52
70
  raise ConfigurationError, 'unblocked_api_token is required' unless api_token
53
71
 
54
- budget = ENV.fetch('UNBLOCKED_DAILY_BUDGET', RateLimiter::DEFAULT_BUDGET).to_i
72
+ budget = ENV.fetch('UNBLOCKED_DAILY_BUDGET', RateLimiter::DEFAULT_BUDGET.to_s).to_i
55
73
  limiter = RateLimiter.new(daily_budget: budget)
56
74
 
57
75
  @client = client || Client.new(api_token: api_token, rate_limiter: limiter)
58
76
  @reader = reader || build_reader(index_dir)
59
77
  @builder = DocumentBuilder.new(repo_url: repo_url)
78
+ @manifest = manifest || build_manifest(index_dir)
79
+ @force_full = force_full
80
+ @force_purge = force_purge
60
81
  @output = output
82
+ # Initialized here as well as in sync_all so the public sync_type /
83
+ # sync_type_partial methods work standalone (track_uri needs them).
84
+ @current_uris = Set.new
85
+ @budget_exhausted = false
61
86
  end
62
87
 
63
88
  # Sync all configured unit types to the Unblocked collection.
64
89
  #
65
- # @return [Hash] { synced: Integer, skipped: Integer, errors: Array<String> }
90
+ # @return [Hash] { synced:, skipped:, deleted:, errors: }
66
91
  def sync_all
92
+ @current_uris = Set.new
93
+ @budget_exhausted = false
94
+ reconcile_from_remote if @manifest.empty?
95
+
67
96
  synced = 0
68
97
  skipped = 0
69
98
  errors = []
70
99
 
71
100
  FULL_SYNC_TYPES.each do |type|
101
+ break if @budget_exhausted
102
+
72
103
  result = sync_type(type)
73
104
  synced += result[:synced]
74
105
  skipped += result[:skipped]
@@ -76,19 +107,24 @@ module Woods
76
107
  end
77
108
 
78
109
  PARTIAL_SYNC_TYPES.each do |type, max_count|
110
+ break if @budget_exhausted
111
+
79
112
  result = sync_type_partial(type, max_count)
80
113
  synced += result[:synced]
81
114
  skipped += result[:skipped]
82
115
  errors.concat(result[:errors])
83
116
  end
84
117
 
85
- { synced: synced, skipped: skipped, errors: cap_errors(errors) }
118
+ deleted = @budget_exhausted ? 0 : purge_stale(errors)
119
+ { synced: synced, skipped: skipped, deleted: deleted, errors: cap_errors(errors) }
120
+ ensure
121
+ save_manifest
86
122
  end
87
123
 
88
124
  # Sync all units of a given type.
89
125
  #
90
126
  # @param type [String] Unit type (e.g. "model", "controller")
91
- # @return [Hash] { synced: Integer, skipped: Integer, errors: Array<String> }
127
+ # @return [Hash] { synced:, skipped:, errors: }
92
128
  def sync_type(type)
93
129
  units = @reader.list_units(type: type)
94
130
  log " #{type}: #{units.size} units"
@@ -100,7 +136,7 @@ module Woods
100
136
  #
101
137
  # @param type [String] Unit type
102
138
  # @param max_count [Integer] Maximum units to sync
103
- # @return [Hash] { synced: Integer, skipped: Integer, errors: Array<String> }
139
+ # @return [Hash] { synced:, skipped:, errors: }
104
140
  def sync_type_partial(type, max_count)
105
141
  units = @reader.list_units(type: type)
106
142
  return empty_stats if units.empty?
@@ -114,8 +150,14 @@ module Woods
114
150
  { entry: entry, data: data, dep_count: dep_count }
115
151
  end
116
152
 
153
+ # Every unit of this type still exists — track its URI so partial units
154
+ # that fall *out* of the top-N are never mistaken for deletions.
155
+ units_with_data.each { |u| track_uri(u[:data]) }
156
+
117
157
  top_units = units_with_data.sort_by { |u| -u[:dep_count] }.first(max_count)
118
- skipped_count = [units.size - max_count, 0].max
158
+ # Count against what was actually synced — units.size includes entries
159
+ # whose unit data was missing (dropped by the filter_map above).
160
+ skipped_count = units.size - top_units.size
119
161
 
120
162
  log " #{type}: #{top_units.size}/#{units.size} units (top by dependents)"
121
163
 
@@ -138,13 +180,19 @@ module Woods
138
180
  next
139
181
  end
140
182
 
141
- push_document(unit_data)
142
- synced += 1
183
+ track_uri(unit_data)
184
+ if push_document(unit_data) == :skipped
185
+ skipped += 1
186
+ else
187
+ synced += 1
188
+ end
143
189
  rescue Woods::Error => e
144
190
  errors << "#{entry['identifier']}: #{e.message}"
145
- break if e.message.include?('daily budget exhausted')
191
+ break if note_budget_exhaustion(e)
146
192
  rescue StandardError => e
147
- errors << "#{entry['identifier']}: #{e.message}"
193
+ # Include the class — "undefined method for nil" without it is
194
+ # unactionable in CI logs.
195
+ errors << "#{entry['identifier']}: #{e.class}: #{e.message}"
148
196
  end
149
197
 
150
198
  { synced: synced, skipped: skipped, errors: errors }
@@ -156,26 +204,177 @@ module Woods
156
204
  errors = []
157
205
 
158
206
  entries_with_data.each do |entry, unit_data|
159
- push_document(unit_data)
160
- synced += 1
207
+ track_uri(unit_data)
208
+ if push_document(unit_data) == :skipped
209
+ skipped += 1
210
+ else
211
+ synced += 1
212
+ end
161
213
  rescue Woods::Error => e
162
214
  errors << "#{entry['identifier']}: #{e.message}"
163
- break if e.message.include?('daily budget exhausted')
215
+ break if note_budget_exhaustion(e)
164
216
  rescue StandardError => e
165
- errors << "#{entry['identifier']}: #{e.message}"
217
+ # Include the class — "undefined method for nil" without it is
218
+ # unactionable in CI logs.
219
+ errors << "#{entry['identifier']}: #{e.class}: #{e.message}"
166
220
  end
167
221
 
168
222
  { synced: synced, skipped: skipped, errors: errors }
169
223
  end
170
224
 
225
+ # Build the document, skip it if the manifest says it is unchanged,
226
+ # otherwise upsert it and record the new hash + remote document_id.
227
+ #
228
+ # @return [Symbol] :synced or :skipped
171
229
  def push_document(unit_data)
230
+ # No file_path → the URI falls back to the bare repo URL, which every
231
+ # such unit would share: they'd overwrite each other remotely and
232
+ # ping-pong the manifest hash forever. Skip them.
233
+ return :skipped unless unit_data['file_path']
234
+
172
235
  doc = @builder.build(unit_data)
173
- @client.put_document(
236
+ # An empty body means the credential scrub failed closed (the builders
237
+ # always emit at least a header). Upserting it would overwrite a good
238
+ # remote document with nothing — error out and leave the remote as-is.
239
+ if doc[:body].nil? || doc[:body].empty?
240
+ raise Woods::ExtractionError, 'document body empty (credential scrub failure?) — push skipped'
241
+ end
242
+
243
+ hash = fingerprint(doc)
244
+ return :skipped if !@force_full && @manifest.unchanged?(doc[:uri], hash)
245
+
246
+ response = @client.put_document(
174
247
  collection_id: @collection_id,
175
248
  title: doc[:title],
176
249
  body: doc[:body],
177
250
  uri: doc[:uri]
178
251
  )
252
+ document_id = (response['id'] if response.is_a?(Hash)) || @manifest.document_id_for(doc[:uri])
253
+ @manifest.record(uri: doc[:uri], hash: hash, document_id: document_id)
254
+ :synced
255
+ end
256
+
257
+ # Delete remote documents whose source unit no longer exists. Failures
258
+ # are appended to +errors+ — a delete that fails silently every run is
259
+ # how a collection rots while "deleted: 0" looks normal.
260
+ #
261
+ # @param errors [Array<String>] sink for delete failures
262
+ # @return [Integer] number of documents deleted
263
+ def purge_stale(errors)
264
+ stale = @manifest.stale_uris(@current_uris)
265
+ return 0 if stale.empty?
266
+ return 0 if guard_blocks_purge?(stale)
267
+
268
+ resolve_missing_document_ids(stale)
269
+
270
+ deleted = 0
271
+ stale.each do |uri|
272
+ document_id = @manifest.document_id_for(uri)
273
+ next unless document_id
274
+
275
+ @client.delete_document(document_id: document_id)
276
+ @manifest.forget(uri)
277
+ deleted += 1
278
+ rescue ApiError => e
279
+ if e.status == 404
280
+ # Already gone remotely — goal state reached, drop the entry
281
+ # rather than retrying every run.
282
+ @manifest.forget(uri)
283
+ else
284
+ errors << "delete #{uri}: #{e.message}"
285
+ end
286
+ rescue Woods::Error => e
287
+ break if note_budget_exhaustion(e)
288
+
289
+ errors << "delete #{uri}: #{e.message}"
290
+ rescue StandardError => e
291
+ # Entry stays in the manifest so a later run retries the delete —
292
+ # but surface the failure so systematic breakage is visible.
293
+ errors << "delete #{uri}: #{e.class}: #{e.message}"
294
+ end
295
+ deleted
296
+ end
297
+
298
+ # A manifest entry can carry a nil document_id (e.g. the PUT response
299
+ # body was empty). Those entries would be permanently undeletable, so
300
+ # before purging, make one bounded all_documents sweep to resolve ids.
301
+ # Best-effort: unresolved entries are simply skipped by the purge loop.
302
+ def resolve_missing_document_ids(stale)
303
+ missing = stale.select { |uri| @manifest.document_id_for(uri).nil? }
304
+ return if missing.empty?
305
+
306
+ ids_by_uri = @client.all_documents(collection_id: @collection_id)
307
+ .to_h { |doc| [doc['uri'], doc['id']] }
308
+ missing.each do |uri|
309
+ id = ids_by_uri[uri]
310
+ @manifest.record(uri: uri, hash: nil, document_id: id) if id
311
+ end
312
+ rescue StandardError => e
313
+ log " id resolution skipped (#{e.message})"
314
+ end
315
+
316
+ # True when purging +stale+ would delete too large a fraction of the
317
+ # manifest — the signature of running against a partial index. The floor
318
+ # (PURGE_GUARD_MIN_DOCS) keeps small collections deletable.
319
+ def guard_blocks_purge?(stale)
320
+ return false if @force_purge
321
+
322
+ size = @manifest.size
323
+ return false if size < PURGE_GUARD_MIN_DOCS
324
+
325
+ fraction = stale.size.to_f / size
326
+ return false unless fraction > PURGE_GUARD_FRACTION
327
+
328
+ log " WARNING: refusing to delete #{stale.size} of #{size} documents " \
329
+ "(#{(fraction * 100).round}% > #{(PURGE_GUARD_FRACTION * 100).to_i}% — likely a partial index). " \
330
+ 'Set UNBLOCKED_FORCE_PURGE=1 to override.'
331
+ true
332
+ end
333
+
334
+ # Seed the manifest from the remote collection when we have no local
335
+ # state (first run / CI cache miss). The list endpoint returns no body,
336
+ # so hashes are nil (everything re-pushes), but recovering document_ids
337
+ # lets this run still purge orphaned documents.
338
+ #
339
+ # Auth failures re-raise: a 401/403 here dooms every subsequent call,
340
+ # and "proceeding with full sync" would burn the whole daily budget on
341
+ # guaranteed failures.
342
+ def reconcile_from_remote
343
+ @client.all_documents(collection_id: @collection_id).each do |doc|
344
+ uri = doc['uri']
345
+ next unless uri
346
+
347
+ @manifest.record(uri: uri, hash: nil, document_id: doc['id'])
348
+ end
349
+ rescue ApiError => e
350
+ raise if [401, 403].include?(e.status)
351
+
352
+ log " reconcile skipped (#{e.message}) — proceeding with full sync"
353
+ rescue StandardError => e
354
+ log " reconcile skipped (#{e.message}) — proceeding with full sync"
355
+ end
356
+
357
+ def track_uri(unit_data)
358
+ # Units without a file_path are never pushed (see push_document), so
359
+ # their fallback repo-root URI must not be marked current either — a
360
+ # stale repo-root document from before this guard should purge.
361
+ return unless unit_data['file_path']
362
+
363
+ @current_uris << @builder.uri_for(unit_data)
364
+ end
365
+
366
+ def fingerprint(doc)
367
+ Digest::SHA256.hexdigest("#{doc[:title]}\n#{doc[:body]}")
368
+ end
369
+
370
+ # Records whether an error was a budget-exhaustion stop. Returns true when
371
+ # it was, so callers can break out of their loop. Class check first; the
372
+ # message match remains as a fallback for injected clients that raise
373
+ # plain Woods::Error.
374
+ def note_budget_exhaustion(error)
375
+ return false unless error.is_a?(BudgetExhaustedError) || error.message.include?('daily budget exhausted')
376
+
377
+ @budget_exhausted = true
179
378
  end
180
379
 
181
380
  def build_reader(index_dir)
@@ -183,6 +382,23 @@ module Woods
183
382
  Woods::MCP::IndexReader.new(index_dir)
184
383
  end
185
384
 
385
+ # Persist the manifest, downgrading failures to a warning: losing the
386
+ # manifest only costs a full re-check next run, which must not turn an
387
+ # otherwise-successful sync into a crash (this runs from an ensure, where
388
+ # a raise would also mask any in-flight exception).
389
+ def save_manifest
390
+ @manifest.save
391
+ rescue StandardError => e
392
+ log " WARNING: sync manifest not persisted (#{e.message}) — next run will re-push all documents"
393
+ end
394
+
395
+ def build_manifest(index_dir)
396
+ SyncManifest.new(
397
+ path: File.join(index_dir, 'unblocked_sync_manifest.json'),
398
+ collection_id: @collection_id
399
+ )
400
+ end
401
+
186
402
  def empty_stats
187
403
  { synced: 0, skipped: 0, errors: [] }
188
404
  end
@@ -1,7 +1,15 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'woods'
4
+
3
5
  module Woods
4
6
  module Unblocked
7
+ # Raised when the daily API call budget is exhausted. Subclasses
8
+ # Woods::Error so existing +rescue Woods::Error+ sites keep working;
9
+ # callers that need to branch on exhaustion rescue this class instead of
10
+ # matching the message string.
11
+ class BudgetExhaustedError < Woods::Error; end
12
+
5
13
  # Daily budget-based rate limiter for the Unblocked API (1000 calls/day).
6
14
  #
7
15
  # Unlike Notion's per-second throttling, Unblocked limits by daily call count.
@@ -35,13 +43,13 @@ module Woods
35
43
  #
36
44
  # @yield The API call to execute
37
45
  # @return [Object] The block's return value
38
- # @raise [Woods::Error] if daily budget is exhausted
46
+ # @raise [BudgetExhaustedError] if daily budget is exhausted
39
47
  def track
40
48
  raise ArgumentError, 'block required' unless block_given?
41
49
 
42
50
  @mutex.synchronize do
43
51
  if @calls_today >= @daily_budget
44
- raise Woods::Error,
52
+ raise BudgetExhaustedError,
45
53
  "Unblocked API daily budget exhausted (#{@daily_budget} calls). " \
46
54
  'Budget resets at midnight PST. Use UNBLOCKED_DAILY_BUDGET to adjust.'
47
55
  end