woods 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +186 -0
  3. data/README.md +20 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +69 -50
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +6 -0
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +210 -0
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +100 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +771 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +163 -0
  102. data/lib/woods/unblocked/document_builder.rb +326 -0
  103. data/lib/woods/unblocked/exporter.rb +201 -0
  104. data/lib/woods/unblocked/rate_limiter.rb +94 -0
  105. data/lib/woods/util/host_guard.rb +61 -0
  106. data/lib/woods/version.rb +1 -1
  107. data/lib/woods.rb +130 -6
  108. metadata +73 -4
@@ -0,0 +1,326 @@
1
+ # frozen_string_literal: true
2
+
3
module Woods
  module Unblocked
    # Converts extracted unit JSON into condensed Markdown documents
    # optimized for Unblocked's code review and Q&A context.
    #
    # Each unit type has a specialized formatting strategy that emphasizes
    # what matters for code review: associations, blast radius, entry points,
    # side effects, and structural complexity.
    #
    # NOTE(review): the document URI is derived from `file_path` only, so two
    # units extracted from the same file (or any units without a file path)
    # produce the same URI. If the destination upserts by URI, as the exporter
    # comments state, later documents would replace earlier ones — confirm
    # whether that is intended before changing the URI scheme.
    #
    # @example
    #   builder = DocumentBuilder.new(repo_url: "https://github.com/acme/myapp")
    #   doc = builder.build(unit_data)
    #   # => { title: "Order (model)", body: "# Order (model)\n...", uri: "https://..." }
    #
    class DocumentBuilder
      # Branch used in citation URIs when the caller does not supply one.
      DEFAULT_BRANCH = 'main'

      # Maximum number of route lines rendered for a controller.
      MAX_ROUTES = 20

      # Maximum number of view templates rendered for a controller.
      MAX_VIEWS = 10

      # Association macros rendered in the model document, in display order.
      ASSOCIATION_TYPES = %w[belongs_to has_many has_one has_and_belongs_to_many].freeze

      # Unit types rendered with the GraphQL layout.
      GRAPHQL_TYPES = %w[graphql graphql_type graphql_mutation graphql_resolver graphql_query].freeze

      # @param repo_url [String] GitHub repo base URL for citation URIs
      # @param branch [String] Branch name used in `/blob/<branch>/` citation
      #   URIs (defaults to "main", the previously hard-coded value)
      def initialize(repo_url:, branch: DEFAULT_BRANCH)
        @repo_url = repo_url.chomp('/')
        @branch = branch
      end

      # Build a document hash from a unit's extracted data.
      #
      # @param unit_data [Hash] Parsed unit JSON (from IndexReader)
      # @return [Hash] { title:, body:, uri: }
      def build(unit_data)
        type = unit_data['type']
        identifier = unit_data['identifier']
        file_path = unit_data['file_path']

        {
          title: "#{identifier} (#{type})",
          body: build_body(unit_data),
          uri: build_uri(file_path)
        }
      end

      private

      # Citation URI for a file. Falls back to the bare repo URL when the unit
      # has no usable file path (nil or empty) so we never emit a dangling
      # `/blob/<branch>/` link.
      #
      # @param file_path [String, nil]
      # @return [String]
      def build_uri(file_path)
        return @repo_url if file_path.nil? || file_path.to_s.empty?

        "#{@repo_url}/blob/#{@branch}/#{file_path}"
      end

      # Pick the formatter for the unit's type and scrub the result.
      #
      # Any type without a dedicated formatter (service, job, mailer, ...)
      # uses the generic layout.
      #
      # @param unit_data [Hash]
      # @return [String]
      def build_body(unit_data)
        body = case unit_data['type']
               when 'model' then build_model_body(unit_data)
               when 'controller' then build_controller_body(unit_data)
               when *GRAPHQL_TYPES then build_graphql_body(unit_data)
               else build_generic_body(unit_data)
               end
        # Defensive credential scrub — current builders only emit structured
        # metadata, but if a future formatter adds source_code or comments
        # (mirroring Notion's `ModelMapper#extract_description`) the scrub
        # keeps credential material from reaching Unblocked.
        redact_credentials(body)
      end

      # Run the assembled body through CredentialScanner. Fails closed (empty
      # body) if the scanner raises a StandardError, so a shipping failure
      # never leaks unredacted content. A failing `require` raises LoadError,
      # which is not a StandardError and therefore propagates to the caller.
      #
      # @param body [String]
      # @return [String]
      def redact_credentials(body)
        return body if body.nil? || body.empty?

        # Deliberate lazy load: the scanner is only needed once a body exists.
        require 'woods/console/credential_scanner'
        redacted, _counts = credential_scanner.scan(body)
        redacted
      rescue StandardError
        ''
      end

      # Memoized scanner instance shared across all documents of this builder.
      def credential_scanner
        @credential_scanner ||= Woods::Console::CredentialScanner.new
      end

      # ── Model formatting ─────────────────────────────────────────────

      # Assemble the model document; sections that return nil are omitted.
      def build_model_body(unit)
        meta = unit['metadata'] || {}
        sections = [
          model_header(unit, meta),
          model_associations(meta),
          model_dependents(unit),
          model_entry_points(unit),
          model_schema_highlights(meta),
          model_side_effects(unit)
        ]

        sections.compact.join("\n\n")
      end

      # Title line plus a ` | `-separated summary (file, LOC, table, columns).
      def model_header(unit, meta)
        parts = ["# #{unit['identifier']} (model)"]
        file_info = ["**File:** `#{unit['file_path']}`"]
        file_info << "**LOC:** #{meta['loc']}" if meta['loc']
        file_info << "**Table:** #{meta['table_name']}" if meta['table_name']
        # Prefer the precomputed count; otherwise count the column entries.
        column_count = meta['column_count'] || (meta['columns'] || []).size
        file_info << "(#{column_count} columns)" if column_count&.positive?
        parts << file_info.join(' | ')
        parts.join("\n")
      end

      # Associations grouped by macro. The heading counts every association,
      # but only the macros in ASSOCIATION_TYPES are listed.
      #
      # @return [String, nil] nil when the model has no associations
      def model_associations(meta)
        assocs = meta['associations'] || []
        return nil if assocs.empty?

        grouped = assocs.group_by { |a| a['type'] }
        lines = ["## Associations (#{assocs.size})"]

        ASSOCIATION_TYPES.each do |type|
          items = grouped[type]
          next unless items&.any?

          targets = items.map do |a|
            name = a['target'] || a['name']
            dep = a.dig('options', 'dependent')
            dep ? "#{name} (#{dep})" : name
          end
          lines << "**#{type}:** #{targets.join(', ')}"
        end

        lines.join("\n")
      end

      # Dependent counts per unit type, with a blast-radius callout above
      # 20 (moderate) and 50 (high) dependents.
      #
      # @return [String, nil] nil when nothing depends on the model
      def model_dependents(unit)
        deps = unit['dependents'] || []
        return nil if deps.empty?

        grouped = deps.group_by { |d| d['type'] }
        summary_parts = grouped.map { |type, items| "#{items.size} #{type}s" }

        lines = ["## Dependents (#{deps.size} units)"]
        lines << summary_parts.join(', ')

        # Blast radius assessment
        if deps.size > 50
          lines << '**High blast radius** — changes here affect many parts of the codebase'
        elsif deps.size > 20
          lines << '**Moderate blast radius** — changes may ripple to dependent code'
        end

        lines.join("\n")
      end

      # Controllers, GraphQL units and jobs that depend on the model.
      #
      # NOTE(review): the section is omitted when there are no controller or
      # GraphQL dependents even if job dependents exist, so jobs are only
      # listed alongside another entry point. Confirm whether that is intended.
      #
      # @return [String, nil]
      def model_entry_points(unit)
        deps = unit['dependents'] || []
        controllers = deps.select { |d| d['type'] == 'controller' }
        graphql = deps.select { |d| d['type']&.start_with?('graphql') }
        jobs = deps.select { |d| d['type'] == 'job' }

        return nil if controllers.empty? && graphql.empty?

        lines = ['## Entry Points']
        lines << "**Controllers:** #{controllers.map { |c| c['identifier'] }.join(', ')}" if controllers.any?
        lines << "**GraphQL:** #{graphql.map { |g| g['identifier'] }.join(', ')}" if graphql.any?
        lines << "**Jobs:** #{jobs.map { |j| j['identifier'] }.join(', ')}" if jobs.any?

        lines.join("\n")
      end

      # Enums, scopes, inlined concerns and callbacks, each rendered only when
      # the metadata value has the expected container type and is non-empty.
      #
      # @return [String, nil] nil when none of the four is present
      def model_schema_highlights(meta)
        parts = []

        enums = meta['enums']
        if enums.is_a?(Hash) && enums.any?
          enum_strs = enums.map { |name, values| "#{name} (#{format_enum_values(values)})" }
          parts << "**Enums:** #{enum_strs.join('; ')}"
        end

        scopes = meta['scopes']
        parts << "**Scopes:** #{scopes.map { |s| s['name'] }.join(', ')}" if scopes.is_a?(Array) && scopes.any?

        concerns = meta['inlined_concerns']
        parts << "**Concerns:** #{concerns.join(', ')}" if concerns.is_a?(Array) && concerns.any?

        callbacks = meta['callbacks']
        if callbacks.is_a?(Array) && callbacks.any?
          parts << "**Callbacks (#{callbacks.size}):** #{format_callbacks(callbacks)}"
        end

        return nil if parts.empty?

        (['## Schema Highlights'] + parts).join("\n")
      end

      # Job and mailer dependents of the model.
      #
      # @return [String, nil] nil when there are neither jobs nor mailers
      def model_side_effects(unit)
        deps = unit['dependents'] || []
        jobs = deps.select { |d| d['type'] == 'job' }
        mailers = deps.select { |d| d['type'] == 'mailer' }

        return nil if jobs.empty? && mailers.empty?

        lines = ['## Side Effects']
        lines << "**Jobs:** #{jobs.map { |j| j['identifier'] }.join(', ')}" if jobs.any?
        lines << "**Mailers:** #{mailers.map { |m| m['identifier'] }.join(', ')}" if mailers.any?

        lines.join("\n")
      end

      # ── Controller formatting ────────────────────────────────────────

      # Assemble the controller document; nil sections are omitted.
      def build_controller_body(unit)
        meta = unit['metadata'] || {}
        sections = []

        sections << "# #{unit['identifier']} (controller)"
        sections << "**File:** `#{unit['file_path']}`"

        # Index 0 is skipped; up to three following ancestors are shown.
        ancestors = meta['ancestors']
        sections << "**Inherits:** #{ancestors[1..3]&.join(' → ')}" if ancestors.is_a?(Array) && ancestors.size > 1

        sections << controller_routes(meta)
        sections << controller_dependencies(unit)
        sections << controller_dependents(unit)

        sections.compact.join("\n\n")
      end

      # Route list (`VERB path (action)`), capped at MAX_ROUTES entries. The
      # cap applies to the routes themselves, not to the heading line.
      # Malformed entries (non-Array lists, non-Hash routes) are ignored.
      #
      # @return [String, nil] nil when no well-formed route exists
      def controller_routes(meta)
        routes = meta['routes']
        return nil unless routes.is_a?(Hash) && routes.any?

        entries = routes.flat_map do |action, route_list|
          next [] unless route_list.is_a?(Array)

          route_list.grep(Hash).map { |route| "- `#{route['verb']} #{route['path']}` (#{action})" }
        end
        return nil if entries.empty?

        (['## Routes'] + entries.first(MAX_ROUTES)).join("\n")
      end

      # Models the controller depends on.
      #
      # @return [String, nil] nil when there are no model dependencies
      def controller_dependencies(unit)
        deps = unit['dependencies'] || []
        return nil if deps.empty?

        models = deps.select { |d| d['type'] == 'model' }.map { |d| d['target'] }
        return nil if models.empty?

        "## Dependencies\n**Models:** #{models.join(', ')}"
      end

      # View templates that depend on the controller, capped at MAX_VIEWS.
      #
      # @return [String, nil] nil when there are no view dependents
      def controller_dependents(unit)
        deps = unit['dependents'] || []
        views = deps.select { |d| d['type'] == 'view_template' }
        return nil if views.empty?

        "## Views\n#{views.map { |v| "- `#{v['identifier']}`" }.first(MAX_VIEWS).join("\n")}"
      end

      # ── GraphQL formatting ───────────────────────────────────────────

      # Title, file, model dependencies and a dependent count.
      def build_graphql_body(unit)
        sections = []

        sections << "# #{unit['identifier']} (#{unit['type']})"
        sections << "**File:** `#{unit['file_path']}`"

        deps = unit['dependencies'] || []
        models = deps.select { |d| d['type'] == 'model' }.map { |d| d['target'] }
        sections << "**Models:** #{models.join(', ')}" if models.any?

        dependents = unit['dependents'] || []
        sections << "**Referenced by:** #{dependents.size} units" if dependents.any?

        sections.compact.join("\n\n")
      end

      # ── Generic formatting (services, jobs, mailers, etc.) ──────────

      # Title, file, LOC, dependencies grouped by type and dependent counts.
      def build_generic_body(unit)
        meta = unit['metadata'] || {}
        sections = []

        sections << "# #{unit['identifier']} (#{unit['type']})"
        sections << "**File:** `#{unit['file_path']}`"
        sections << "**LOC:** #{meta['loc']}" if meta['loc']

        deps = unit['dependencies'] || []
        if deps.any?
          by_type = deps.group_by { |d| d['type'] }
          dep_parts = by_type.map { |type, items| "#{type}: #{items.map { |d| d['target'] }.join(', ')}" }
          sections << "## Dependencies\n#{dep_parts.join("\n")}"
        end

        dependents = unit['dependents'] || []
        if dependents.any?
          grouped = dependents.group_by { |d| d['type'] }
          summary = grouped.map { |type, items| "#{items.size} #{type}s" }
          sections << "## Dependents (#{dependents.size})\n#{summary.join(', ')}"
        end

        sections.compact.join("\n\n")
      end

      # ── Helpers ──────────────────────────────────────────────────────

      # First five enum values: the keys of a Hash, the entries of an Array,
      # or the string form of anything else.
      def format_enum_values(values)
        case values
        when Hash then values.keys.first(5).join(', ')
        when Array then values.first(5).join(', ')
        else values.to_s
        end
      end

      # First five callbacks rendered as `type: filter`.
      def format_callbacks(callbacks)
        callbacks.first(5).map do |cb|
          "#{cb['type']}: #{cb['filter']}"
        end.join(', ')
      end
    end
  end
end
@@ -0,0 +1,201 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'woods'
4
+ require_relative 'client'
5
+ require_relative 'rate_limiter'
6
+ require_relative 'document_builder'
7
+
8
module Woods
  module Unblocked
    # Orchestrates syncing Woods extraction data to an Unblocked collection.
    #
    # Reads extraction output from disk via IndexReader, converts units to
    # condensed Markdown documents, and pushes via the Unblocked Documents API.
    # All syncs are idempotent — documents are upserted by URI.
    #
    # @example
    #   exporter = Exporter.new(index_dir: "tmp/woods")
    #   stats = exporter.sync_all
    #   # => { synced: 940, skipped: 5060, errors: [] }
    #
    class Exporter
      # Maximum number of error messages returned from {#sync_all}.
      MAX_ERRORS = 100

      # Substring of the rate limiter's error message that signals the daily
      # budget is used up; once seen, the current batch stops pushing.
      BUDGET_EXHAUSTED_MARKER = 'daily budget exhausted'

      # Unit types to sync, in priority order.
      # All units are synced for these types.
      FULL_SYNC_TYPES = %w[
        model controller service job mailer manager decorator concern serializer
        graphql graphql_type graphql_mutation graphql_resolver graphql_query
      ].freeze

      # Unit types where only the most-connected units are synced.
      # Each entry: [type, max_count]
      PARTIAL_SYNC_TYPES = [
        ['poro', 100],
        ['lib', 50]
      ].freeze

      # @param index_dir [String] Path to extraction output directory
      # @param config [Configuration] Woods configuration (default: global config)
      # @param client [Client, nil] Unblocked API client (auto-created from config if nil)
      # @param reader [Object, nil] IndexReader instance (auto-created if nil)
      # @param output [IO] Progress output stream (default: $stdout)
      # @raise [ConfigurationError] if required config is missing
      def initialize(index_dir:, config: Woods.configuration, client: nil, reader: nil, output: $stdout)
        @collection_id = config.unblocked_collection_id
        raise ConfigurationError, 'unblocked_collection_id is required' unless @collection_id

        repo_url = config.unblocked_repo_url
        raise ConfigurationError, 'unblocked_repo_url is required' unless repo_url

        api_token = config.unblocked_api_token
        raise ConfigurationError, 'unblocked_api_token is required' unless api_token

        # The rate limiter is only built when we create the client ourselves,
        # so an injected client is not affected by UNBLOCKED_DAILY_BUDGET.
        @client = client || build_client(api_token)
        @reader = reader || build_reader(index_dir)
        @builder = DocumentBuilder.new(repo_url: repo_url)
        @output = output
      end

      # Sync all configured unit types to the Unblocked collection.
      #
      # @return [Hash] { synced: Integer, skipped: Integer, errors: Array<String> }
      def sync_all
        synced = 0
        skipped = 0
        errors = []

        FULL_SYNC_TYPES.each do |type|
          result = sync_type(type)
          synced += result[:synced]
          skipped += result[:skipped]
          errors.concat(result[:errors])
        end

        PARTIAL_SYNC_TYPES.each do |type, max_count|
          result = sync_type_partial(type, max_count)
          synced += result[:synced]
          skipped += result[:skipped]
          errors.concat(result[:errors])
        end

        { synced: synced, skipped: skipped, errors: cap_errors(errors) }
      end

      # Sync all units of a given type.
      #
      # @param type [String] Unit type (e.g. "model", "controller")
      # @return [Hash] { synced: Integer, skipped: Integer, errors: Array<String> }
      def sync_type(type)
        units = @reader.list_units(type: type)
        log "  #{type}: #{units.size} units"

        sync_units(units)
      end

      # Sync the top N most-connected units of a type (by dependent count).
      #
      # Accounting matches {#sync_type}: units whose data cannot be found and
      # units outside the top N are reported as skipped, and a unit whose data
      # fails to load is reported in `errors` instead of aborting the sync.
      #
      # @param type [String] Unit type
      # @param max_count [Integer] Maximum units to sync
      # @return [Hash] { synced: Integer, skipped: Integer, errors: Array<String> }
      def sync_type_partial(type, max_count)
        units = @reader.list_units(type: type)
        return empty_stats if units.empty?

        # Load full data to sort by dependent count
        load_errors = []
        units_with_data = units.filter_map do |entry|
          data = @reader.find_unit(entry['identifier'])
          next unless data

          dep_count = (data['dependents'] || []).size
          { entry: entry, data: data, dep_count: dep_count }
        rescue StandardError => e
          load_errors << "#{entry['identifier']}: #{e.message}"
          nil
        end

        top_units = units_with_data.sort_by { |u| -u[:dep_count] }.first(max_count)
        # Everything listed but neither attempted nor failed to load was skipped.
        skipped_count = units.size - top_units.size - load_errors.size

        log "  #{type}: #{top_units.size}/#{units.size} units (top by dependents)"

        result = sync_unit_data(top_units.map { |u| [u[:entry], u[:data]] })
        result[:skipped] += skipped_count
        result[:errors] = load_errors + result[:errors]
        result
      end

      private

      # Load and push each listed unit. A unit with no data is counted as
      # skipped; any failure is recorded per unit and the loop continues,
      # except that an exhausted daily budget ends the batch.
      #
      # @param units [Array<Hash>] Index entries responding to `['identifier']`
      # @return [Hash] { synced:, skipped:, errors: }
      def sync_units(units)
        synced = 0
        skipped = 0
        errors = []

        units.each do |entry|
          unit_data = @reader.find_unit(entry['identifier'])
          unless unit_data
            skipped += 1
            next
          end

          push_document(unit_data)
          synced += 1
        rescue Woods::Error => e
          errors << "#{entry['identifier']}: #{e.message}"
          break if budget_exhausted?(e)
        rescue StandardError => e
          errors << "#{entry['identifier']}: #{e.message}"
        end

        { synced: synced, skipped: skipped, errors: errors }
      end

      # Push units whose data is already loaded, with the same error handling
      # as {#sync_units}.
      #
      # @param entries_with_data [Array<Array(Hash, Hash)>] [entry, unit_data] pairs
      # @return [Hash] { synced:, skipped:, errors: }
      def sync_unit_data(entries_with_data)
        synced = 0
        skipped = 0
        errors = []

        entries_with_data.each do |entry, unit_data|
          push_document(unit_data)
          synced += 1
        rescue Woods::Error => e
          errors << "#{entry['identifier']}: #{e.message}"
          break if budget_exhausted?(e)
        rescue StandardError => e
          errors << "#{entry['identifier']}: #{e.message}"
        end

        { synced: synced, skipped: skipped, errors: errors }
      end

      # True when the error message carries the budget-exhausted marker.
      def budget_exhausted?(error)
        error.message.include?(BUDGET_EXHAUSTED_MARKER)
      end

      # Build the document for one unit and upsert it into the collection.
      def push_document(unit_data)
        doc = @builder.build(unit_data)
        @client.put_document(
          collection_id: @collection_id,
          title: doc[:title],
          body: doc[:body],
          uri: doc[:uri]
        )
      end

      # Default client, rate limited by UNBLOCKED_DAILY_BUDGET (falls back to
      # RateLimiter::DEFAULT_BUDGET when the variable is unset).
      def build_client(api_token)
        budget = ENV.fetch('UNBLOCKED_DAILY_BUDGET', RateLimiter::DEFAULT_BUDGET.to_s).to_i
        limiter = RateLimiter.new(daily_budget: budget)
        Client.new(api_token: api_token, rate_limiter: limiter)
      end

      # Default reader; required lazily so an injected reader avoids the load.
      def build_reader(index_dir)
        require_relative '../mcp/index_reader'
        Woods::MCP::IndexReader.new(index_dir)
      end

      def empty_stats
        { synced: 0, skipped: 0, errors: [] }
      end

      # Keep at most MAX_ERRORS messages and append a summary of the rest.
      def cap_errors(errors)
        return errors if errors.size <= MAX_ERRORS

        errors.first(MAX_ERRORS) + ["... and #{errors.size - MAX_ERRORS} more errors"]
      end

      # Write a progress line; a nil output stream silences logging.
      def log(message)
        @output&.puts(message)
      end
    end
  end
end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
module Woods
  module Unblocked
    # Call-count budget for the Unblocked API (1000 calls/day).
    #
    # Where Notion is throttled per second, Unblocked caps the number of calls
    # per day. This class counts calls against a configurable budget, prints a
    # single warning once usage reaches the warning threshold, and raises when
    # the budget is used up. The counter lives in memory and only goes back to
    # zero through {#reset!}.
    #
    # @example
    #   limiter = RateLimiter.new(daily_budget: 1000)
    #   limiter.track { client.put_document(...) }  # => result
    #   limiter.remaining  # => 999
    #
    class RateLimiter
      DEFAULT_BUDGET = 1000
      WARN_THRESHOLD = 0.8 # Warn at 80% usage

      # @param daily_budget [Integer] Maximum API calls per day
      # @param warn_io [IO] Where to write warnings (default: $stderr)
      # @raise [ArgumentError] unless daily_budget is a positive Integer
      def initialize(daily_budget: DEFAULT_BUDGET, warn_io: $stderr)
        valid_budget = daily_budget.is_a?(Integer) && daily_budget.positive?
        raise ArgumentError, 'daily_budget must be positive' unless valid_budget

        @mutex = Mutex.new
        @warn_io = warn_io
        @daily_budget = daily_budget
        @calls_today = 0
        @warned = false
      end

      # Count one API call against the budget, then run the block.
      #
      # The call is counted before the block runs, so it stays counted even if
      # the block raises. The block itself executes outside the lock.
      #
      # @yield The API call to execute
      # @return [Object] The block's return value
      # @raise [ArgumentError] if no block is given
      # @raise [Woods::Error] if daily budget is exhausted
      def track
        raise ArgumentError, 'block required' unless block_given?

        @mutex.synchronize { reserve_call }

        yield
      end

      # Calls still available in the budget.
      #
      # @return [Integer]
      def remaining
        @daily_budget - @calls_today
      end

      # Calls counted so far.
      #
      # @return [Integer]
      def used
        @calls_today
      end

      # Zero the counter and re-arm the warning (for testing or manual reset).
      #
      # @return [void]
      def reset!
        @mutex.synchronize do
          @warned = false
          @calls_today = 0
        end
      end

      private

      # Claim one call from the budget; must run while holding the mutex.
      def reserve_call
        if @calls_today >= @daily_budget
          message = "Unblocked API daily budget exhausted (#{@daily_budget} calls). " \
                    'Budget resets at midnight PST. Use UNBLOCKED_DAILY_BUDGET to adjust.'
          raise Woods::Error, message
        end

        @calls_today += 1
        warn_if_approaching_limit
      end

      # Emit the usage warning once, the first time the threshold is reached.
      def warn_if_approaching_limit
        warn_at = (@daily_budget * WARN_THRESHOLD).to_i
        return if @warned || @calls_today < warn_at

        @warned = true
        @warn_io&.puts(
          "WARNING: Unblocked API usage at #{@calls_today}/#{@daily_budget} " \
          "(#{remaining} calls remaining)"
        )
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
module Woods
  module Util
    # Shared host-header / URL-host canonicalization used by {MCP::OriginGuard}
    # and the {Storage::VectorStore::Qdrant} URL validator.
    #
    # Both components need to reject numeric IPv4 notations that `URI` and
    # `getaddrinfo` accept but `IPAddr` does not — hex (`0x7f000001`),
    # bare integer (`2130706433`), octal (`017700000001` or
    # `0177.0.0.1`), short-form (`127.1`), mixed-radix (`0x7f.0.0.1`,
    # `0x7f.1`). Keeping the logic in one place prevents drift between the
    # two defenses (which previously had slightly different regex lists).
    module HostGuard
      # One numeric component of a numbers-and-dots IPv4 form: hex with a
      # `0x` prefix, or a run of digits (decimal or leading-zero octal).
      NUMERIC_PART = /(?:0x[0-9a-f]+|\d+)/.freeze

      # Non-canonical numeric IPv4 forms that legitimate clients never
      # emit but `getaddrinfo` will happily resolve — rejecting the form
      # is safer than trying to intuit the intended IPv4.
      NUMERIC_HOST_BYPASS = Regexp.union(
        /\A0x[0-9a-f]+\z/,                                # hex: `0x7f000001`
        /\A\d+\z/,                                        # bare integer: `2130706433`
        /\A0[0-7]+\z/,                                    # bare octal: `017700000001`
        /\A#{NUMERIC_PART}(?:\.#{NUMERIC_PART}){1,2}\z/   # short-form, any radix: `127.1`, `127.0.1`, `0x7f.1`
      ).freeze

      # Octets inside a four-part dotted form that tag the form as
      # non-canonical: leading zero (octal interpretation), or `0x`
      # prefix (hex interpretation).
      SUSPICIOUS_OCTET = Regexp.union(
        /\A0\d+\z/,          # leading-zero octal: `0177`
        /\A0x[0-9a-f]+\z/    # hex octet: `0x7f`
      ).freeze

      module_function

      # Canonicalize a host string: downcase, strip port, strip the
      # FQDN trailing dot, drop IPv6 brackets. Returns a plain host.
      #
      # A bare (unbracketed) IPv6 literal such as `::1` contains several
      # colons and cannot carry a port, so its final group is left intact
      # rather than being mistaken for `:port`.
      #
      # @param host [String, nil]
      # @return [String] canonical host, lowercase, without port/brackets.
      def canonicalize(host)
        value = host.to_s.downcase
        bare_ipv6 = !value.start_with?('[') && value.count(':') > 1
        value = value.sub(/:\d+\z/, '') unless bare_ipv6
        value.sub(/\.\z/, '').delete('[]')
      end

      # Does this canonicalized host smuggle a private IP via a notation
      # that `IPAddr.new` won't parse? Callers should reject any match
      # rather than try to resolve it.
      #
      # Matching is case-sensitive, so the argument must already be
      # lowercased by {.canonicalize}.
      #
      # @param canonical [String] Output of {.canonicalize}.
      # @return [Boolean]
      def suspicious_numeric_host?(canonical)
        return true if canonical.match?(NUMERIC_HOST_BYPASS)

        # Four-part form: suspicious only when some octet is octal- or
        # hex-looking; a plain decimal dotted quad is canonical.
        four_octet = canonical.match(/\A(\w+)\.(\w+)\.(\w+)\.(\w+)\z/)
        return false unless four_octet

        four_octet.captures.any? { |octet| octet.match?(SUSPICIOUS_OCTET) }
      end
    end
  end
end
data/lib/woods/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Woods
4
- VERSION = '1.1.0'
4
+ VERSION = '1.3.0'
5
5
  end