woods 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +169 -0
  3. data/README.md +20 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +15 -50
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +6 -0
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +3 -4
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +737 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +1 -1
  102. data/lib/woods/unblocked/document_builder.rb +35 -10
  103. data/lib/woods/unblocked/exporter.rb +1 -1
  104. data/lib/woods/util/host_guard.rb +61 -0
  105. data/lib/woods/version.rb +1 -1
  106. data/lib/woods.rb +126 -6
  107. metadata +69 -4
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'set'
4
+
3
5
  module Woods
4
6
  module Storage
5
7
  # VectorStore provides an interface for storing and searching embedding vectors.
@@ -36,11 +38,39 @@ module Woods
36
38
  entries.each { |e| store(e[:id], e[:vector], e[:metadata] || {}) }
37
39
  end
38
40
 
41
+ # Iterate over every live entry, yielding `(id, vector, metadata)`.
42
+ #
43
+ # Persistence seam for Snapshotter and similar consumers. Default
44
+ # implementation falls through to `NotImplementedError`; adapters
45
+ # that need to support dumping must implement it. Persistent
46
+ # backends (pgvector, Qdrant) aren't expected to implement this —
47
+ # the Snapshotter only touches non-persistent stores.
48
+ #
49
+ # @yield [id, vector, metadata]
50
+ # @return [Enumerator] when no block given
51
+ def each_entry
52
+ raise NotImplementedError
53
+ end
54
+
55
+ # Bulk-load pre-computed entries. Dual of {#each_entry} — the
56
+ # Snapshotter hydrates a store by feeding this the dump contents.
57
+ #
58
+ # @param entries [Enumerable<Hash>] Each entry has :id, :vector, :metadata keys
59
+ def bulk_load(entries)
60
+ store_batch(entries.to_a)
61
+ end
62
+
39
63
  # Search for similar vectors using cosine similarity.
40
64
  #
65
+ # Filter values may be scalars (exact match) or Arrays (membership
66
+ # match — "value ∈ array"). Adapters implement the membership
67
+ # semantics natively: in-memory loops, pgvector IN (...), Qdrant
68
+ # `match: { any: [...] }`.
69
+ #
41
70
  # @param query_vector [Array<Float>] The query embedding vector
42
71
  # @param limit [Integer] Maximum number of results to return
43
- # @param filters [Hash] Optional metadata filters to apply
72
+ # @param filters [Hash] Optional metadata filters values may be
73
+ # scalars or Arrays
44
74
  # @return [Array<SearchResult>] Results sorted by descending similarity
45
75
  # @raise [NotImplementedError] if not implemented by adapter
46
76
  def search(query_vector, limit: 10, filters: {})
@@ -87,79 +117,198 @@ module Woods
87
117
  # store.search([1.0, 0.0], limit: 1)
88
118
  # # => [#<SearchResult id="doc1", score=1.0, metadata={type: "model"}>]
89
119
  #
90
- class InMemory
120
+ class InMemory # rubocop:disable Metrics/ClassLength
91
121
  include Interface
92
122
 
123
+ # Flat-buffer backing. One Array<Float> of length count*dim holds
124
+ # every vector contiguously; two parallel Arrays hold the ids and
125
+ # metadata at matching positions. Deleted entries are tombstoned
126
+ # (their index is added to @tombstones) rather than removed, so
127
+ # stored vector positions stay stable under concurrent iteration
128
+ # and dumps. Tombstones are compacted at next full-embed run.
129
+ #
130
+ # The flat buffer exists both for cache friendliness during the
131
+ # cosine kernel (all vectors live in one contiguous allocation)
132
+ # and to make dump/load via `pack("e*")` a single call rather
133
+ # than a per-vector concatenation.
93
134
  def initialize
94
- @entries = {} # id => { vector:, metadata: }
135
+ @dim = nil
136
+ @ids = [] # Array<String> (frozen)
137
+ @vectors_flat = [] # flat Array<Float>, length @ids.size * @dim
138
+ @metadata = [] # Array<Hash>, index-aligned with @ids
139
+ @id_to_index = {} # id => Integer for O(1) delete/overwrite
140
+ @tombstones = Set.new
95
141
  end
96
142
 
143
+ # @return [Integer, nil] dimension of stored vectors, nil if empty
144
+ attr_reader :dim
145
+
97
146
  # @see Interface#store
98
147
  def store(id, vector, metadata = {})
99
- @entries[id] = { vector: vector, metadata: metadata }
148
+ @dim ||= vector.length
149
+ unless vector.length == @dim
150
+ raise ArgumentError,
151
+ "Vector dimension mismatch (#{vector.length} vs #{@dim})"
152
+ end
153
+
154
+ frozen_id = id.frozen? ? id : id.dup.freeze
155
+ existing = @id_to_index[frozen_id]
156
+ if existing
157
+ overwrite(existing, vector, metadata)
158
+ else
159
+ append(frozen_id, vector, metadata)
160
+ end
161
+ end
162
+
163
+ # @see Interface#bulk_load
164
+ # Single-pass hydrate — more efficient than N store calls when
165
+ # the Snapshotter feeds a large dump at boot time.
166
+ def bulk_load(entries)
167
+ entries.each { |entry| store(entry[:id], entry[:vector], entry[:metadata] || {}) }
168
+ end
169
+
170
+ # Drop every stored entry, restoring the store to its post-+new+ state.
171
+ #
172
+ # Used by the MCP +reload+ tool to pick up a fresh embed run without
173
+ # restarting the process. A subsequent +#bulk_load+ then repopulates
174
+ # from disk. Safe on an already-empty store.
175
+ def clear!
176
+ @dim = nil
177
+ @ids = []
178
+ @vectors_flat = []
179
+ @metadata = []
180
+ @id_to_index = {}
181
+ @tombstones = Set.new
182
+ end
183
+
184
+ # @see Interface#each_entry
185
+ def each_entry(&block)
186
+ return enum_for(:each_entry) unless block
187
+
188
+ @ids.each_with_index do |id, idx|
189
+ next if @tombstones.include?(idx)
190
+
191
+ base = idx * @dim
192
+ yield(id, @vectors_flat[base, @dim], @metadata[idx])
193
+ end
100
194
  end
101
195
 
102
196
  # @see Interface#search
103
197
  def search(query_vector, limit: 10, filters: {})
104
- candidates = filter_entries(filters)
198
+ return [] if @dim.nil?
105
199
 
106
- scored = candidates.map do |id, entry|
107
- score = cosine_similarity(query_vector, entry[:vector])
108
- SearchResult.new(id: id, score: score, metadata: entry[:metadata])
200
+ unless query_vector.length == @dim
201
+ raise ArgumentError,
202
+ "Vector dimension mismatch (#{query_vector.length} vs #{@dim})"
109
203
  end
110
- scored.sort_by { |r| -r.score }.first(limit)
204
+
205
+ scored = gather_candidates(query_vector, filters)
206
+ scored.sort_by! { |r| -r.score }
207
+ scored.first(limit)
111
208
  end
112
209
 
113
210
  # @see Interface#delete
114
211
  def delete(id)
115
- @entries.delete(id)
212
+ idx = @id_to_index.delete(id)
213
+ @tombstones << idx if idx
116
214
  end
117
215
 
118
216
  # @see Interface#delete_by_filter
119
217
  def delete_by_filter(filters)
120
- @entries.reject! do |_id, entry|
121
- filters.all? { |key, value| entry[:metadata][key] == value }
218
+ @ids.each_with_index do |id, idx|
219
+ next if @tombstones.include?(idx)
220
+ next unless filters.all? { |key, value| @metadata[idx][key] == value }
221
+
222
+ @tombstones << idx
223
+ @id_to_index.delete(id)
122
224
  end
123
225
  end
124
226
 
125
227
  # @see Interface#count
126
228
  def count
127
- @entries.size
229
+ @ids.size - @tombstones.size
128
230
  end
129
231
 
130
232
  private
131
233
 
132
- # Filter entries by metadata key-value pairs.
133
- #
134
- # @param filters [Hash] Metadata filters
135
- # @return [Hash] Filtered entries
136
- def filter_entries(filters)
137
- return @entries if filters.empty?
234
+ # Match a filter value against a metadata value. Arrays are
235
+ # membership filters ("any of"); scalars are equality.
236
+ def filter_match?(filter_value, meta_value)
237
+ filter_value.is_a?(Array) ? filter_value.include?(meta_value) : filter_value == meta_value
238
+ end
239
+
240
+ # Append a new entry to the flat buffer.
241
+ def append(id, vector, metadata)
242
+ idx = @ids.size
243
+ @ids << id
244
+ @vectors_flat.concat(vector)
245
+ @metadata << metadata
246
+ @id_to_index[id] = idx
247
+ end
138
248
 
139
- @entries.select do |_id, entry|
140
- filters.all? { |key, value| entry[:metadata][key] == value }
249
+ # Overwrite an existing entry in place. Tombstones the old slot's
250
+ # deletion marker (if any) so the new vector is live again.
251
+ def overwrite(idx, vector, metadata)
252
+ base = idx * @dim
253
+ i = 0
254
+ while i < @dim
255
+ @vectors_flat[base + i] = vector[i]
256
+ i += 1
141
257
  end
258
+ @metadata[idx] = metadata
259
+ @tombstones.delete(idx)
142
260
  end
143
261
 
144
- # Compute cosine similarity between two vectors.
145
- #
146
- # @param vec_a [Array<Float>] First vector
147
- # @param vec_b [Array<Float>] Second vector
148
- # @return [Float] Cosine similarity between -1.0 and 1.0
149
- # @raise [ArgumentError] if vectors have different dimensions
150
- def cosine_similarity(vec_a, vec_b)
151
- unless vec_a.length == vec_b.length
152
- raise ArgumentError,
153
- "Vector dimension mismatch (#{vec_a.length} vs #{vec_b.length})"
262
+ # Walk every non-tombstoned index, apply filters, score survivors.
263
+ # Filter check runs BEFORE the cosine kernel — avoids computing
264
+ # 12k dot products only to discard most of them.
265
+ def gather_candidates(query_vector, filters)
266
+ scored = []
267
+ len = @ids.size
268
+ idx = 0
269
+ while idx < len
270
+ if @tombstones.include?(idx)
271
+ idx += 1
272
+ next
273
+ end
274
+ meta = @metadata[idx]
275
+ unless filters.empty? || filters.all? { |k, v| filter_match?(v, meta[k]) }
276
+ idx += 1
277
+ next
278
+ end
279
+
280
+ score = cosine_similarity_strided(query_vector, idx * @dim)
281
+ scored << SearchResult.new(id: @ids[idx], score: score, metadata: meta)
282
+ idx += 1
154
283
  end
284
+ scored
285
+ end
155
286
 
156
- dot = vec_a.zip(vec_b).sum { |x, y| x * y }
157
- mag_a = Math.sqrt(vec_a.sum { |x| x**2 })
158
- mag_b = Math.sqrt(vec_b.sum { |x| x**2 })
287
+ # Cosine similarity between a query Array<Float> and a vector
288
+ # that lives at @vectors_flat[base, @dim]. Strided access avoids
289
+ # allocating a copy of the stored vector on every comparison.
290
+ #
291
+ # See bench/vector_query_and_serialization.rb for the allocation
292
+ # story — the old Enumerable path allocated ~770 objects per pair;
293
+ # this loop allocates none inside the hot path.
294
+ def cosine_similarity_strided(query, base)
295
+ len = @dim
296
+ i = 0
297
+ dot = 0.0
298
+ mag_a = 0.0
299
+ mag_b = 0.0
300
+ while i < len
301
+ a = query[i]
302
+ b = @vectors_flat[base + i]
303
+ dot += a * b
304
+ mag_a += a * a
305
+ mag_b += b * b
306
+ i += 1
307
+ end
159
308
 
160
309
  return 0.0 if mag_a.zero? || mag_b.zero?
161
310
 
162
- dot / (mag_a * mag_b)
311
+ dot / (Math.sqrt(mag_a) * Math.sqrt(mag_b))
163
312
  end
164
313
  end
165
314
  end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'builder'
4
+ require_relative 'embedding/indexer'
5
+ require_relative 'embedding/text_preparer'
6
+ require_relative 'resolved_config'
7
+
8
+ module Woods
9
+ # Small helpers invoked from `lib/tasks/woods.rake`.
10
+ #
11
+ # Keeps rake task bodies to a couple of lines each so the real work lives in
12
+ # plain Ruby that can be unit-tested without Rake's global state.
13
+ module Tasks
14
+ module_function
15
+
16
+ # Build an {Embedding::Indexer} wired to the provider and stores described
17
+ # by {Woods.configuration}. Uses {Builder} so `config.embedding_provider`,
18
+ # `config.embedding_options`, and `config.vector_store(_options)` are all
19
+ # honoured — prior to this the rake tasks hardcoded Ollama + InMemory and
20
+ # silently ignored configuration, which was invisible until the provider
21
+ # tried to reach an unreachable default host.
22
+ #
23
+ # The TextPreparer and SemanticChunker are tuned to the selected
24
+ # provider so oversize units are split into chunks that fit the
25
+ # provider's input budget (e.g. Ollama's num_ctx, OpenAI's 8k cap).
26
+ #
27
+ # @return [Embedding::Indexer]
28
+ def build_embed_indexer
29
+ config = Woods.configuration
30
+ builder = Builder.new(config)
31
+ provider = builder.build_embedding_provider
32
+
33
+ # Wire the persistence-arc pieces (resolved_config, metadata_store,
34
+ # dump_retention_count) so Indexer#persist_snapshot can write
35
+ # woods.json, dump metadata, and honour the user's retention setting.
36
+ # Without these kwargs, embed writes vectors.bin + latest pointer but
37
+ # never writes woods.json — which breaks the standalone woods-mcp
38
+ # Shape-2 boot path entirely.
39
+ #
40
+ # metadata_store and resolved_config are nil-safe — hosts that don't
41
+ # configure metadata or that pre-date the persistence arc still work.
42
+ Embedding::Indexer.new(
43
+ provider: provider,
44
+ text_preparer: builder.build_text_preparer(provider),
45
+ vector_store: builder.build_vector_store,
46
+ metadata_store: config.metadata_store ? builder.build_metadata_store : nil,
47
+ resolved_config: build_resolved_config(config, provider: provider),
48
+ chunker: builder.build_chunker(provider),
49
+ dump_retention_count: config.dump_retention_count,
50
+ output_dir: ENV.fetch('WOODS_OUTPUT', config.output_dir)
51
+ )
52
+ end
53
+
54
+ # Build a ResolvedConfig snapshot from the live Woods::Configuration.
55
+ # Returns nil if the configuration doesn't have enough to produce one
56
+ # (pre-persistence-arc hosts) so the Indexer falls back to the legacy
57
+ # dump-without-woods.json behaviour.
58
+ #
59
+ # Passes the live +provider+ so {ResolvedConfig.from_configuration} can
60
+ # probe +provider.dimensions+ — without this, Ollama snapshots record
61
+ # +dimension: 0+ and every subsequent MCP boot fails a spurious
62
+ # dimension-mismatch check against the real stored vectors.
63
+ def build_resolved_config(config, provider: nil)
64
+ return nil unless config.embedding_provider
65
+
66
+ ResolvedConfig.from_configuration(config, provider: provider)
67
+ rescue StandardError
68
+ nil
69
+ end
70
+
71
+ # Print an indexer stats hash in the format the rake tasks have historically
72
+ # used. `mode:` only affects the header line.
73
+ #
74
+ # @param stats [Hash]
75
+ # @param mode [Symbol] :full or :incremental
76
+ def print_embed_stats(stats, mode:)
77
+ header = mode == :incremental ? 'Incremental embedding complete!' : 'Embedding complete!'
78
+ puts
79
+ puts header
80
+ puts " Processed: #{stats[:processed]}"
81
+ puts " Skipped: #{stats[:skipped]}"
82
+ puts " Errors: #{stats[:errors]}"
83
+ end
84
+ end
85
+ end
@@ -23,10 +23,58 @@ module Woods
23
23
  #
24
24
  class SnapshotStore # rubocop:disable Metrics/ClassLength
25
25
  # @param connection [Object] Database connection supporting #execute and #get_first_row
26
- def initialize(connection:)
26
+ # @param validate_schema [Boolean] If true (default), probe both required
27
+ # tables at construction time and raise a descriptive error pointing at
28
+ # migrations 004+005 when they are missing. Set false in tests that
29
+ # construct the store with a bare mock.
30
+ def initialize(connection:, validate_schema: true)
27
31
  @db = connection
32
+ validate_schema! if validate_schema
28
33
  end
29
34
 
35
+ REQUIRED_TABLES = %w[woods_snapshots woods_snapshot_units].freeze
36
+
37
+ # Probe that `woods_snapshots` and `woods_snapshot_units` exist. If
38
+ # they don't, raise with guidance to run migrations 004 + 005 —
39
+ # without this, the first call to {#capture}/{#find} raises a generic
40
+ # adapter error that doesn't tell operators why.
41
+ #
42
+ # When the connection responds to `#columns` (ActiveRecord-shaped) or
43
+ # `#table_exists?`, use that — these are hard to spoof from a test
44
+ # mock, so a partial mock can no longer silently pass. Falls back to
45
+ # the `SELECT 1 FROM t LIMIT 1` probe for minimal connections.
46
+ #
47
+ # @raise [Woods::Error]
48
+ def validate_schema!
49
+ REQUIRED_TABLES.each { |t| probe_table!(t) }
50
+ rescue Woods::Error
51
+ raise
52
+ rescue StandardError => e
53
+ raise Woods::Error, schema_error_message(e)
54
+ end
55
+
56
+ private
57
+
58
+ def probe_table!(table)
59
+ if @db.respond_to?(:table_exists?)
60
+ raise Woods::Error, schema_error_message("table `#{table}` does not exist") unless @db.table_exists?(table)
61
+ elsif @db.respond_to?(:columns)
62
+ cols = @db.columns(table)
63
+ raise Woods::Error, schema_error_message("no columns for `#{table}`") if cols.nil? || cols.empty?
64
+ else
65
+ @db.execute("SELECT 1 FROM #{table} LIMIT 1")
66
+ end
67
+ end
68
+
69
+ def schema_error_message(detail)
70
+ 'SnapshotStore requires the `woods_snapshots` and ' \
71
+ '`woods_snapshot_units` tables (migrations 004 + 005 under ' \
72
+ '`lib/woods/db/migrations/`). Run `rake woods:migrate` on the ' \
73
+ "metadata DB and retry. Underlying error: #{detail}"
74
+ end
75
+
76
+ public
77
+
30
78
  # Capture a snapshot after extraction completes.
31
79
  #
32
80
  # Stores the manifest metadata and per-unit content hashes.
@@ -1,19 +1,58 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Woods
4
- # Shared token estimation utility.
4
+ # Shared token estimation utility — the single source of truth for the
5
+ # chars-per-token ratio used across cost estimation, context assembly,
6
+ # and embedding budgeting.
5
7
  #
6
- # Uses project convention: (string.length / 4.0).ceil
7
- # See docs/TOKEN_BENCHMARK.md — conservative floor (~10.6% overestimate).
8
+ # Ratios:
9
+ # - `:openai` / default 4.0 chars/token. Benchmarked against tiktoken
10
+ # (cl100k_base) on 19 Ruby source files (mean 4.41 chars/token). We use
11
+ # 4.0 as a conservative floor (~10.6 % overestimate) so truncation never
12
+ # hands the model more tokens than it budgeted for. See
13
+ # `docs/TOKEN_BENCHMARK.md`.
14
+ # - `:ollama` — 1.5 chars/token. Matches the BERT WordPiece tokenizers
15
+ # used by nomic-embed-text and mxbai-embed-large. See
16
+ # `docs/EMBEDDING_MODELS.md` and `Woods::Builder#chars_per_token_for`.
17
+ #
18
+ # Callers should prefer {.chars_per_token_for} over hardcoding a divisor
19
+ # so future tokenizer changes propagate in one place instead of drifting
20
+ # between {ContextAssembler}, {Builder}, and cost-model components.
8
21
  module TokenUtils
22
+ CHARS_PER_TOKEN_BY_PROVIDER = {
23
+ openai: 4.0,
24
+ ollama: 1.5
25
+ }.freeze
26
+
27
+ DEFAULT_CHARS_PER_TOKEN = CHARS_PER_TOKEN_BY_PROVIDER[:openai]
28
+
9
29
  module_function
10
30
 
11
- # Estimate token count for a string.
31
+ # Chars-per-token ratio for the given embedding provider.
32
+ #
33
+ # @param provider [Symbol, String, nil] Provider identifier. Unknown or
34
+ # nil providers fall back to {DEFAULT_CHARS_PER_TOKEN}.
35
+ # @return [Float]
36
+ def chars_per_token_for(provider)
37
+ CHARS_PER_TOKEN_BY_PROVIDER.fetch(provider&.to_sym, DEFAULT_CHARS_PER_TOKEN)
38
+ end
39
+
40
+ # Estimate token count for a string using the default (OpenAI) ratio.
41
+ # Use {.estimate_tokens_for} when a specific provider is in play.
12
42
  #
13
43
  # @param text [String] Text to estimate
14
44
  # @return [Integer] Estimated token count
15
45
  def estimate_tokens(text)
16
- (text.length / 4.0).ceil
46
+ estimate_tokens_for(text, provider: nil)
47
+ end
48
+
49
+ # Estimate token count for a string using the provider's native ratio.
50
+ #
51
+ # @param text [String] Text to estimate
52
+ # @param provider [Symbol, String, nil] `:openai`, `:ollama`, or nil.
53
+ # @return [Integer] Estimated token count
54
+ def estimate_tokens_for(text, provider:)
55
+ (text.length / chars_per_token_for(provider)).ceil
17
56
  end
18
57
  end
19
58
  end
@@ -118,7 +118,7 @@ module Woods
118
118
  http.request(req)
119
119
  rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNRESET, Errno::ECONNREFUSED => e
120
120
  attempts += 1
121
- raise Woods::Error, "Network error after #{attempts} retries: #{e.message}" if attempts >= MAX_RETRIES
121
+ raise Woods::Error, "Network error after #{attempts} retries: #{e.message}" if attempts > MAX_RETRIES
122
122
 
123
123
  sleep(2**attempts)
124
124
  retry
@@ -10,7 +10,7 @@ module Woods
10
10
  # side effects, and structural complexity.
11
11
  #
12
12
  # @example
13
- # builder = DocumentBuilder.new(repo_url: "https://github.com/bigcartel/admin")
13
+ # builder = DocumentBuilder.new(repo_url: "https://github.com/acme/myapp")
14
14
  # doc = builder.build(unit_data)
15
15
  # # => { title: "Order (model)", body: "# Order (model)\n...", uri: "https://..." }
16
16
  #
@@ -46,15 +46,40 @@ module Woods
46
46
 
47
47
  def build_body(unit_data)
48
48
  type = unit_data['type']
49
- case type
50
- when 'model' then build_model_body(unit_data)
51
- when 'controller' then build_controller_body(unit_data)
52
- when 'service', 'job', 'mailer', 'manager', 'decorator', 'concern'
53
- build_generic_body(unit_data)
54
- when 'graphql', 'graphql_type', 'graphql_mutation', 'graphql_resolver', 'graphql_query'
55
- build_graphql_body(unit_data)
56
- else build_generic_body(unit_data)
57
- end
49
+ body = case type
50
+ when 'model' then build_model_body(unit_data)
51
+ when 'controller' then build_controller_body(unit_data)
52
+ when 'service', 'job', 'mailer', 'manager', 'decorator', 'concern'
53
+ build_generic_body(unit_data)
54
+ when 'graphql', 'graphql_type', 'graphql_mutation', 'graphql_resolver', 'graphql_query'
55
+ build_graphql_body(unit_data)
56
+ else build_generic_body(unit_data)
57
+ end
58
+ # Defensive credential scrub — current builders only emit structured
59
+ # metadata, but if a future formatter adds source_code or comments
60
+ # (mirroring Notion's `ModelMapper#extract_description`) the scrub
61
+ # keeps credential material from reaching Unblocked.
62
+ redact_credentials(body)
63
+ end
64
+
65
+ # Run the assembled body through CredentialScanner. Fails closed (empty
66
+ # body) if the scanner raises, so a shipping failure never leaks
67
+ # unredacted content.
68
+ #
69
+ # @param body [String]
70
+ # @return [String]
71
+ def redact_credentials(body)
72
+ return body if body.nil? || body.empty?
73
+
74
+ require 'woods/console/credential_scanner'
75
+ redacted, _counts = credential_scanner.scan(body)
76
+ redacted
77
+ rescue StandardError
78
+ ''
79
+ end
80
+
81
+ def credential_scanner
82
+ @credential_scanner ||= Woods::Console::CredentialScanner.new
58
83
  end
59
84
 
60
85
  # ── Model formatting ─────────────────────────────────────────────
@@ -51,7 +51,7 @@ module Woods
51
51
  api_token = config.unblocked_api_token
52
52
  raise ConfigurationError, 'unblocked_api_token is required' unless api_token
53
53
 
54
- budget = ENV.fetch('UNBLOCKED_DAILY_BUDGET', RateLimiter::DEFAULT_BUDGET).to_i
54
+ budget = ENV.fetch('UNBLOCKED_DAILY_BUDGET', RateLimiter::DEFAULT_BUDGET.to_s).to_i
55
55
  limiter = RateLimiter.new(daily_budget: budget)
56
56
 
57
57
  @client = client || Client.new(api_token: api_token, rate_limiter: limiter)
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Woods
4
+ module Util
5
+ # Shared host-header / URL-host canonicalization used by {MCP::OriginGuard}
6
+ # and the {Storage::VectorStore::Qdrant} URL validator.
7
+ #
8
+ # Both components need to reject numeric IPv4 notations that `URI` and
9
+ # `getaddrinfo` accept but `IPAddr` does not — hex (`0x7f000001`),
10
+ # bare integer (`2130706433`), octal (`017700000001` or
11
+ # `0177.0.0.1`), short-form (`127.1`), mixed-radix (`0x7f.0.0.1`).
12
+ # Keeping the logic in one place prevents drift between the two
13
+ # defenses (which previously had slightly different regex lists).
14
+ module HostGuard
15
+ # Non-canonical numeric IPv4 forms that legitimate clients never
16
+ # emit but `getaddrinfo` will happily resolve — rejecting the form
17
+ # is safer than trying to intuit the intended IPv4.
18
+ NUMERIC_HOST_BYPASS = Regexp.union(
19
+ /\A0x[0-9a-f]+\z/, # hex: `0x7f000001`
20
+ /\A\d+\z/, # bare integer: `2130706433`
21
+ /\A0[0-7]+\z/, # bare octal: `017700000001`
22
+ /\A\d+\.\d+\z/, # short-form two-part: `127.1`
23
+ /\A\d+\.\d+\.\d+\z/ # short-form three-part: `127.0.1`
24
+ ).freeze
25
+
26
+ # Octets inside a four-part dotted form that tag the form as
27
+ # non-canonical: leading zero (octal interpretation), or `0x`
28
+ # prefix (hex interpretation).
29
+ SUSPICIOUS_OCTET = Regexp.union(
30
+ /\A0\d+\z/, # leading-zero octal: `0177`
31
+ /\A0x[0-9a-f]+\z/ # hex octet: `0x7f`
32
+ ).freeze
33
+
34
+ module_function
35
+
36
+ # Canonicalize a host string: downcase, strip port, strip the
37
+ # FQDN trailing dot, drop IPv6 brackets. Returns a plain host.
38
+ #
39
+ # @param host [String, nil]
40
+ # @return [String] canonical host, lowercase, without port/brackets.
41
+ def canonicalize(host)
42
+ host.to_s.downcase.sub(/:\d+\z/, '').sub(/\.\z/, '').delete('[]')
43
+ end
44
+
45
+ # Does this canonicalized host smuggle a private IP via a notation
46
+ # that `IPAddr.new` won't parse? Callers should reject any match
47
+ # rather than try to resolve it.
48
+ #
49
+ # @param canonical [String] Output of {.canonicalize}.
50
+ # @return [Boolean]
51
+ def suspicious_numeric_host?(canonical)
52
+ return true if canonical.match?(NUMERIC_HOST_BYPASS)
53
+
54
+ four_octet = canonical.match(/\A(\w+)\.(\w+)\.(\w+)\.(\w+)\z/)
55
+ return false unless four_octet
56
+
57
+ four_octet.captures.any? { |octet| octet.match?(SUSPICIOUS_OCTET) }
58
+ end
59
+ end
60
+ end
61
+ end
data/lib/woods/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Woods
4
- VERSION = '1.2.0'
4
+ VERSION = '1.3.0'
5
5
  end