woods 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +229 -0
  3. data/README.md +24 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +37 -51
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +10 -4
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +3 -4
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +737 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +88 -7
  102. data/lib/woods/unblocked/document_builder.rb +75 -36
  103. data/lib/woods/unblocked/exporter.rb +234 -18
  104. data/lib/woods/unblocked/rate_limiter.rb +10 -2
  105. data/lib/woods/unblocked/sync_manifest.rb +135 -0
  106. data/lib/woods/util/host_guard.rb +61 -0
  107. data/lib/woods/version.rb +1 -1
  108. data/lib/woods.rb +126 -6
  109. metadata +70 -4
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'set'
4
+
3
5
  module Woods
4
6
  module Storage
5
7
  # VectorStore provides an interface for storing and searching embedding vectors.
@@ -36,11 +38,39 @@ module Woods
36
38
  entries.each { |e| store(e[:id], e[:vector], e[:metadata] || {}) }
37
39
  end
38
40
 
41
+ # Iterate over every live entry, yielding `(id, vector, metadata)`.
42
+ #
43
+ # Persistence seam for Snapshotter and similar consumers. Default
44
+ # implementation falls through to `NotImplementedError`; adapters
45
+ # that need to support dumping must implement it. Persistent
46
+ # backends (pgvector, Qdrant) aren't expected to implement this —
47
+ # the Snapshotter only touches non-persistent stores.
48
+ #
49
+ # @yield [id, vector, metadata]
50
+ # @return [Enumerator] when no block given
51
+ def each_entry
52
+ raise NotImplementedError
53
+ end
54
+
55
+ # Bulk-load pre-computed entries. Dual of {#each_entry} — the
56
+ # Snapshotter hydrates a store by feeding this the dump contents.
57
+ #
58
+ # @param entries [Enumerable<Hash>] Each entry has :id, :vector, :metadata keys
59
+ def bulk_load(entries)
60
+ store_batch(entries.to_a)
61
+ end
62
+
39
63
  # Search for similar vectors using cosine similarity.
40
64
  #
65
+ # Filter values may be scalars (exact match) or Arrays (membership
66
+ # match — "value ∈ array"). Adapters implement the membership
67
+ # semantics natively: in-memory loops, pgvector IN (...), Qdrant
68
+ # `match: { any: [...] }`.
69
+ #
41
70
  # @param query_vector [Array<Float>] The query embedding vector
42
71
  # @param limit [Integer] Maximum number of results to return
43
- # @param filters [Hash] Optional metadata filters to apply
72
+ # @param filters [Hash] Optional metadata filters — values may be
73
+ # scalars or Arrays
44
74
  # @return [Array<SearchResult>] Results sorted by descending similarity
45
75
  # @raise [NotImplementedError] if not implemented by adapter
46
76
  def search(query_vector, limit: 10, filters: {})
@@ -87,79 +117,198 @@ module Woods
87
117
  # store.search([1.0, 0.0], limit: 1)
88
118
  # # => [#<SearchResult id="doc1", score=1.0, metadata={type: "model"}>]
89
119
  #
90
- class InMemory
120
+ class InMemory # rubocop:disable Metrics/ClassLength
91
121
  include Interface
92
122
 
123
+ # Flat-buffer backing. One Array<Float> of length count*dim holds
124
+ # every vector contiguously; two parallel Arrays hold the ids and
125
+ # metadata at matching positions. Deleted entries are tombstoned
126
+ # (their index is added to @tombstones) rather than removed, so
127
+ # stored vector positions stay stable under concurrent iteration
128
+ # and dumps. Tombstones are compacted at next full-embed run.
129
+ #
130
+ # The flat buffer exists both for cache friendliness during the
131
+ # cosine kernel (all vectors live in one contiguous allocation)
132
+ # and to make dump/load via `pack("e*")` a single call rather
133
+ # than a per-vector concatenation.
93
134
  def initialize
94
- @entries = {} # id => { vector:, metadata: }
135
+ @dim = nil
136
+ @ids = [] # Array<String> (frozen)
137
+ @vectors_flat = [] # flat Array<Float>, length @ids.size * @dim
138
+ @metadata = [] # Array<Hash>, index-aligned with @ids
139
+ @id_to_index = {} # id => Integer for O(1) delete/overwrite
140
+ @tombstones = Set.new
95
141
  end
96
142
 
143
+ # @return [Integer, nil] dimension of stored vectors, nil if empty
144
+ attr_reader :dim
145
+
97
146
  # @see Interface#store
98
147
  def store(id, vector, metadata = {})
99
- @entries[id] = { vector: vector, metadata: metadata }
148
+ @dim ||= vector.length
149
+ unless vector.length == @dim
150
+ raise ArgumentError,
151
+ "Vector dimension mismatch (#{vector.length} vs #{@dim})"
152
+ end
153
+
154
+ frozen_id = id.frozen? ? id : id.dup.freeze
155
+ existing = @id_to_index[frozen_id]
156
+ if existing
157
+ overwrite(existing, vector, metadata)
158
+ else
159
+ append(frozen_id, vector, metadata)
160
+ end
161
+ end
162
+
163
+ # @see Interface#bulk_load
164
+ # Single-pass hydrate — more efficient than N store calls when
165
+ # the Snapshotter feeds a large dump at boot time.
166
+ def bulk_load(entries)
167
+ entries.each { |entry| store(entry[:id], entry[:vector], entry[:metadata] || {}) }
168
+ end
169
+
170
+ # Drop every stored entry, restoring the store to its post-+new+ state.
171
+ #
172
+ # Used by the MCP +reload+ tool to pick up a fresh embed run without
173
+ # restarting the process. A subsequent +#bulk_load+ then repopulates
174
+ # from disk. Safe on an already-empty store.
175
+ def clear!
176
+ @dim = nil
177
+ @ids = []
178
+ @vectors_flat = []
179
+ @metadata = []
180
+ @id_to_index = {}
181
+ @tombstones = Set.new
182
+ end
183
+
184
+ # @see Interface#each_entry
185
+ def each_entry(&block)
186
+ return enum_for(:each_entry) unless block
187
+
188
+ @ids.each_with_index do |id, idx|
189
+ next if @tombstones.include?(idx)
190
+
191
+ base = idx * @dim
192
+ yield(id, @vectors_flat[base, @dim], @metadata[idx])
193
+ end
100
194
  end
101
195
 
102
196
  # @see Interface#search
103
197
  def search(query_vector, limit: 10, filters: {})
104
- candidates = filter_entries(filters)
198
+ return [] if @dim.nil?
105
199
 
106
- scored = candidates.map do |id, entry|
107
- score = cosine_similarity(query_vector, entry[:vector])
108
- SearchResult.new(id: id, score: score, metadata: entry[:metadata])
200
+ unless query_vector.length == @dim
201
+ raise ArgumentError,
202
+ "Vector dimension mismatch (#{query_vector.length} vs #{@dim})"
109
203
  end
110
- scored.sort_by { |r| -r.score }.first(limit)
204
+
205
+ scored = gather_candidates(query_vector, filters)
206
+ scored.sort_by! { |r| -r.score }
207
+ scored.first(limit)
111
208
  end
112
209
 
113
210
  # @see Interface#delete
114
211
  def delete(id)
115
- @entries.delete(id)
212
+ idx = @id_to_index.delete(id)
213
+ @tombstones << idx if idx
116
214
  end
117
215
 
118
216
  # @see Interface#delete_by_filter
119
217
  def delete_by_filter(filters)
120
- @entries.reject! do |_id, entry|
121
- filters.all? { |key, value| entry[:metadata][key] == value }
218
+ @ids.each_with_index do |id, idx|
219
+ next if @tombstones.include?(idx)
220
+ next unless filters.all? { |key, value| @metadata[idx][key] == value }
221
+
222
+ @tombstones << idx
223
+ @id_to_index.delete(id)
122
224
  end
123
225
  end
124
226
 
125
227
  # @see Interface#count
126
228
  def count
127
- @entries.size
229
+ @ids.size - @tombstones.size
128
230
  end
129
231
 
130
232
  private
131
233
 
132
- # Filter entries by metadata key-value pairs.
133
- #
134
- # @param filters [Hash] Metadata filters
135
- # @return [Hash] Filtered entries
136
- def filter_entries(filters)
137
- return @entries if filters.empty?
234
+ # Match a filter value against a metadata value. Arrays are
235
+ # membership filters ("any of"); scalars are equality.
236
+ def filter_match?(filter_value, meta_value)
237
+ filter_value.is_a?(Array) ? filter_value.include?(meta_value) : filter_value == meta_value
238
+ end
239
+
240
+ # Append a new entry to the flat buffer.
241
+ def append(id, vector, metadata)
242
+ idx = @ids.size
243
+ @ids << id
244
+ @vectors_flat.concat(vector)
245
+ @metadata << metadata
246
+ @id_to_index[id] = idx
247
+ end
138
248
 
139
- @entries.select do |_id, entry|
140
- filters.all? { |key, value| entry[:metadata][key] == value }
249
+ # Overwrite an existing entry in place. Tombstones the old slot's
250
+ # deletion marker (if any) so the new vector is live again.
251
+ def overwrite(idx, vector, metadata)
252
+ base = idx * @dim
253
+ i = 0
254
+ while i < @dim
255
+ @vectors_flat[base + i] = vector[i]
256
+ i += 1
141
257
  end
258
+ @metadata[idx] = metadata
259
+ @tombstones.delete(idx)
142
260
  end
143
261
 
144
- # Compute cosine similarity between two vectors.
145
- #
146
- # @param vec_a [Array<Float>] First vector
147
- # @param vec_b [Array<Float>] Second vector
148
- # @return [Float] Cosine similarity between -1.0 and 1.0
149
- # @raise [ArgumentError] if vectors have different dimensions
150
- def cosine_similarity(vec_a, vec_b)
151
- unless vec_a.length == vec_b.length
152
- raise ArgumentError,
153
- "Vector dimension mismatch (#{vec_a.length} vs #{vec_b.length})"
262
+ # Walk every non-tombstoned index, apply filters, score survivors.
263
+ # Filter check runs BEFORE the cosine kernel — avoids computing
264
+ # 12k dot products only to discard most of them.
265
+ def gather_candidates(query_vector, filters)
266
+ scored = []
267
+ len = @ids.size
268
+ idx = 0
269
+ while idx < len
270
+ if @tombstones.include?(idx)
271
+ idx += 1
272
+ next
273
+ end
274
+ meta = @metadata[idx]
275
+ unless filters.empty? || filters.all? { |k, v| filter_match?(v, meta[k]) }
276
+ idx += 1
277
+ next
278
+ end
279
+
280
+ score = cosine_similarity_strided(query_vector, idx * @dim)
281
+ scored << SearchResult.new(id: @ids[idx], score: score, metadata: meta)
282
+ idx += 1
154
283
  end
284
+ scored
285
+ end
155
286
 
156
- dot = vec_a.zip(vec_b).sum { |x, y| x * y }
157
- mag_a = Math.sqrt(vec_a.sum { |x| x**2 })
158
- mag_b = Math.sqrt(vec_b.sum { |x| x**2 })
287
+ # Cosine similarity between a query Array<Float> and a vector
288
+ # that lives at @vectors_flat[base, @dim]. Strided access avoids
289
+ # allocating a copy of the stored vector on every comparison.
290
+ #
291
+ # See bench/vector_query_and_serialization.rb for the allocation
292
+ # story — the old Enumerable path allocated ~770 objects per pair;
293
+ # this loop allocates none inside the hot path.
294
+ def cosine_similarity_strided(query, base)
295
+ len = @dim
296
+ i = 0
297
+ dot = 0.0
298
+ mag_a = 0.0
299
+ mag_b = 0.0
300
+ while i < len
301
+ a = query[i]
302
+ b = @vectors_flat[base + i]
303
+ dot += a * b
304
+ mag_a += a * a
305
+ mag_b += b * b
306
+ i += 1
307
+ end
159
308
 
160
309
  return 0.0 if mag_a.zero? || mag_b.zero?
161
310
 
162
- dot / (mag_a * mag_b)
311
+ dot / (Math.sqrt(mag_a) * Math.sqrt(mag_b))
163
312
  end
164
313
  end
165
314
  end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'builder'
4
+ require_relative 'embedding/indexer'
5
+ require_relative 'embedding/text_preparer'
6
+ require_relative 'resolved_config'
7
+
8
+ module Woods
9
+ # Small helpers invoked from `lib/tasks/woods.rake`.
10
+ #
11
+ # Keeps rake task bodies to a couple of lines each so the real work lives in
12
+ # plain Ruby that can be unit-tested without Rake's global state.
13
+ module Tasks
14
+ module_function
15
+
16
+ # Build an {Embedding::Indexer} wired to the provider and stores described
17
+ # by {Woods.configuration}. Uses {Builder} so `config.embedding_provider`,
18
+ # `config.embedding_options`, and `config.vector_store(_options)` are all
19
+ # honoured — prior to this the rake tasks hardcoded Ollama + InMemory and
20
+ # silently ignored configuration, which was invisible until the provider
21
+ # tried to reach an unreachable default host.
22
+ #
23
+ # The TextPreparer and SemanticChunker are tuned to the selected
24
+ # provider so oversize units are split into chunks that fit the
25
+ # provider's input budget (e.g. Ollama's num_ctx, OpenAI's 8k cap).
26
+ #
27
+ # @return [Embedding::Indexer]
28
+ def build_embed_indexer
29
+ config = Woods.configuration
30
+ builder = Builder.new(config)
31
+ provider = builder.build_embedding_provider
32
+
33
+ # Wire the persistence-arc pieces (resolved_config, metadata_store,
34
+ # dump_retention_count) so Indexer#persist_snapshot can write
35
+ # woods.json, dump metadata, and honour the user's retention setting.
36
+ # Without these kwargs, embed writes vectors.bin + latest pointer but
37
+ # never writes woods.json — which breaks the standalone woods-mcp
38
+ # Shape-2 boot path entirely.
39
+ #
40
+ # metadata_store and resolved_config are nil-safe — hosts that don't
41
+ # configure metadata or that pre-date the persistence arc still work.
42
+ Embedding::Indexer.new(
43
+ provider: provider,
44
+ text_preparer: builder.build_text_preparer(provider),
45
+ vector_store: builder.build_vector_store,
46
+ metadata_store: config.metadata_store ? builder.build_metadata_store : nil,
47
+ resolved_config: build_resolved_config(config, provider: provider),
48
+ chunker: builder.build_chunker(provider),
49
+ dump_retention_count: config.dump_retention_count,
50
+ output_dir: ENV.fetch('WOODS_OUTPUT', config.output_dir)
51
+ )
52
+ end
53
+
54
+ # Build a ResolvedConfig snapshot from the live Woods::Configuration.
55
+ # Returns nil if the configuration doesn't have enough to produce one
56
+ # (pre-persistence-arc hosts) so the Indexer falls back to the legacy
57
+ # dump-without-woods.json behaviour.
58
+ #
59
+ # Passes the live +provider+ so {ResolvedConfig.from_configuration} can
60
+ # probe +provider.dimensions+ — without this, Ollama snapshots record
61
+ # +dimension: 0+ and every subsequent MCP boot fails a spurious
62
+ # dimension-mismatch check against the real stored vectors.
63
+ def build_resolved_config(config, provider: nil)
64
+ return nil unless config.embedding_provider
65
+
66
+ ResolvedConfig.from_configuration(config, provider: provider)
67
+ rescue StandardError
68
+ nil
69
+ end
70
+
71
+ # Print an indexer stats hash in the format the rake tasks have historically
72
+ # used. `mode:` only affects the header line.
73
+ #
74
+ # @param stats [Hash]
75
+ # @param mode [Symbol] :full or :incremental
76
+ def print_embed_stats(stats, mode:)
77
+ header = mode == :incremental ? 'Incremental embedding complete!' : 'Embedding complete!'
78
+ puts
79
+ puts header
80
+ puts " Processed: #{stats[:processed]}"
81
+ puts " Skipped: #{stats[:skipped]}"
82
+ puts " Errors: #{stats[:errors]}"
83
+ end
84
+ end
85
+ end
@@ -23,10 +23,58 @@ module Woods
23
23
  #
24
24
  class SnapshotStore # rubocop:disable Metrics/ClassLength
25
25
  # @param connection [Object] Database connection supporting #execute and #get_first_row
26
- def initialize(connection:)
26
+ # @param validate_schema [Boolean] If true (default), probe both required
27
+ # tables at construction time and raise a descriptive error pointing at
28
+ # migrations 004+005 when they are missing. Set false in tests that
29
+ # construct the store with a bare mock.
30
+ def initialize(connection:, validate_schema: true)
27
31
  @db = connection
32
+ validate_schema! if validate_schema
28
33
  end
29
34
 
35
+ REQUIRED_TABLES = %w[woods_snapshots woods_snapshot_units].freeze
36
+
37
+ # Probe that `woods_snapshots` and `woods_snapshot_units` exist. If
38
+ # they don't, raise with guidance to run migrations 004 + 005 —
39
+ # without this, the first call to {#capture}/{#find} raises a generic
40
+ # adapter error that doesn't tell operators why.
41
+ #
42
+ # When the connection responds to `#columns` (ActiveRecord-shaped) or
43
+ # `#table_exists?`, use that — these are hard to spoof from a test
44
+ # mock, so a partial mock can no longer silently pass. Falls back to
45
+ # the `SELECT 1 FROM t LIMIT 1` probe for minimal connections.
46
+ #
47
+ # @raise [Woods::Error]
48
+ def validate_schema!
49
+ REQUIRED_TABLES.each { |t| probe_table!(t) }
50
+ rescue Woods::Error
51
+ raise
52
+ rescue StandardError => e
53
+ raise Woods::Error, schema_error_message(e)
54
+ end
55
+
56
+ private
57
+
58
+ def probe_table!(table)
59
+ if @db.respond_to?(:table_exists?)
60
+ raise Woods::Error, schema_error_message("table `#{table}` does not exist") unless @db.table_exists?(table)
61
+ elsif @db.respond_to?(:columns)
62
+ cols = @db.columns(table)
63
+ raise Woods::Error, schema_error_message("no columns for `#{table}`") if cols.nil? || cols.empty?
64
+ else
65
+ @db.execute("SELECT 1 FROM #{table} LIMIT 1")
66
+ end
67
+ end
68
+
69
+ def schema_error_message(detail)
70
+ 'SnapshotStore requires the `woods_snapshots` and ' \
71
+ '`woods_snapshot_units` tables (migrations 004 + 005 under ' \
72
+ '`lib/woods/db/migrations/`). Run `rake woods:migrate` on the ' \
73
+ "metadata DB and retry. Underlying error: #{detail}"
74
+ end
75
+
76
+ public
77
+
30
78
  # Capture a snapshot after extraction completes.
31
79
  #
32
80
  # Stores the manifest metadata and per-unit content hashes.
@@ -1,19 +1,58 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Woods
4
- # Shared token estimation utility.
4
+ # Shared token estimation utility — the single source of truth for the
5
+ # chars-per-token ratio used across cost estimation, context assembly,
6
+ # and embedding budgeting.
5
7
  #
6
- # Uses project convention: (string.length / 4.0).ceil
7
- # See docs/TOKEN_BENCHMARK.md — conservative floor (~10.6% overestimate).
8
+ # Ratios:
9
+ # - `:openai` / default — 4.0 chars/token. Benchmarked against tiktoken
10
+ # (cl100k_base) on 19 Ruby source files (mean 4.41 chars/token). We use
11
+ # 4.0 as a conservative floor (~10.6 % overestimate) so truncation never
12
+ # hands the model more tokens than it budgeted for. See
13
+ # `docs/TOKEN_BENCHMARK.md`.
14
+ # - `:ollama` — 1.5 chars/token. Matches the BERT WordPiece tokenizers
15
+ # used by nomic-embed-text and mxbai-embed-large. See
16
+ # `docs/EMBEDDING_MODELS.md` and `Woods::Builder#chars_per_token_for`.
17
+ #
18
+ # Callers should prefer {.chars_per_token_for} over hardcoding a divisor
19
+ # so future tokenizer changes propagate in one place instead of drifting
20
+ # between {ContextAssembler}, {Builder}, and cost-model components.
8
21
  module TokenUtils
22
+ CHARS_PER_TOKEN_BY_PROVIDER = {
23
+ openai: 4.0,
24
+ ollama: 1.5
25
+ }.freeze
26
+
27
+ DEFAULT_CHARS_PER_TOKEN = CHARS_PER_TOKEN_BY_PROVIDER[:openai]
28
+
9
29
  module_function
10
30
 
11
- # Estimate token count for a string.
31
+ # Chars-per-token ratio for the given embedding provider.
32
+ #
33
+ # @param provider [Symbol, String, nil] Provider identifier. Unknown or
34
+ # nil providers fall back to {DEFAULT_CHARS_PER_TOKEN}.
35
+ # @return [Float]
36
+ def chars_per_token_for(provider)
37
+ CHARS_PER_TOKEN_BY_PROVIDER.fetch(provider&.to_sym, DEFAULT_CHARS_PER_TOKEN)
38
+ end
39
+
40
+ # Estimate token count for a string using the default (OpenAI) ratio.
41
+ # Use {.estimate_tokens_for} when a specific provider is in play.
12
42
  #
13
43
  # @param text [String] Text to estimate
14
44
  # @return [Integer] Estimated token count
15
45
  def estimate_tokens(text)
16
- (text.length / 4.0).ceil
46
+ estimate_tokens_for(text, provider: nil)
47
+ end
48
+
49
+ # Estimate token count for a string using the provider's native ratio.
50
+ #
51
+ # @param text [String] Text to estimate
52
+ # @param provider [Symbol, String, nil] `:openai`, `:ollama`, or nil.
53
+ # @return [Integer] Estimated token count
54
+ def estimate_tokens_for(text, provider:)
55
+ (text.length / chars_per_token_for(provider)).ceil
17
56
  end
18
57
  end
19
58
  end
@@ -3,10 +3,28 @@
3
3
  require 'json'
4
4
  require 'net/http'
5
5
  require 'uri'
6
+ require 'woods'
6
7
  require_relative 'rate_limiter'
7
8
 
8
9
  module Woods
9
10
  module Unblocked
11
+ # API error carrying the HTTP status code, so callers can branch on
12
+ # status (e.g. treat a 404 on delete as "already gone") instead of
13
+ # matching message strings. Subclasses Woods::Error, so existing
14
+ # +rescue Woods::Error+ sites keep working unchanged.
15
+ class ApiError < Woods::Error
16
+ # @return [Integer] HTTP status code of the failed response
17
+ attr_reader :status
18
+
19
+ # @param message [String] Error message
20
+ # @param status [Integer] HTTP status code — required, because callers
21
+ # branch on it (a nil status would silently miss every status check)
22
+ def initialize(message, status:)
23
+ super(message)
24
+ @status = Integer(status)
25
+ end
26
+ end
27
+
10
28
  # REST client for the Unblocked API v1.
11
29
  #
12
30
  # Handles document and collection CRUD with rate limiting, retries,
@@ -25,6 +43,12 @@ module Woods
25
43
  BASE_URL = 'https://getunblocked.com/api/v1'
26
44
  MAX_RETRIES = 3
27
45
  DEFAULT_TIMEOUT = 30
46
+ # Max page size the list endpoint accepts (per API docs).
47
+ PAGE_SIZE = 200
48
+ # Repo-hosted Woods mark, used as the collection icon when none is given.
49
+ # The live API rejects collection creation without an iconUrl (despite
50
+ # the API docs marking it optional), so a working default matters.
51
+ DEFAULT_ICON_URL = 'https://raw.githubusercontent.com/lost-in-the/woods/main/assets/woods-mark-black.svg'
28
52
 
29
53
  # @param api_token [String] Unblocked API token (Personal or Team)
30
54
  # @param rate_limiter [RateLimiter] Rate limiter instance
@@ -60,12 +84,17 @@ module Woods
60
84
  #
61
85
  # @param name [String] Collection name (1-32 chars)
62
86
  # @param description [String] Collection description (1-4096 chars)
63
- # @param icon_url [String, nil] Optional icon URL
87
+ # @param icon_url [String, nil] Icon URL. The live API rejects creation
88
+ # with a bare 400 when omitted (despite the API docs marking it
89
+ # optional), so nil falls back to DEFAULT_ICON_URL — the repo-hosted
90
+ # Woods mark.
64
91
  # @return [Hash] { "id" => "collection-uuid", "name" => "...", ... }
65
92
  def create_collection(name:, description:, icon_url: nil)
66
- body = { name: name, description: description }
67
- body[:iconUrl] = icon_url if icon_url
68
- request(:post, 'collections', body)
93
+ request(:post, 'collections', {
94
+ name: name,
95
+ description: description,
96
+ iconUrl: icon_url || DEFAULT_ICON_URL
97
+ })
69
98
  end
70
99
 
71
100
  # List all collections.
@@ -73,6 +102,10 @@ module Woods
73
102
  # @return [Array<Hash>] Collection objects
74
103
  def list_collections
75
104
  result = request(:get, 'collections')
105
+ # The live API returns a bare JSON array; the envelope fallbacks are
106
+ # defensive (calling ['items'] on an Array raises TypeError).
107
+ return result if result.is_a?(Array)
108
+
76
109
  result['items'] || result['data'] || [result].flatten.compact
77
110
  end
78
111
 
@@ -84,6 +117,52 @@ module Woods
84
117
  request(:delete, "documents/#{document_id}")
85
118
  end
86
119
 
120
+ # List a single page of documents.
121
+ #
122
+ # The endpoint returns a bare JSON array of document metadata (no body):
123
+ # `id, collectionId, title, uri, createdAt, updatedAt`. Pagination is
124
+ # cursor-based via `after`/`before` (opaque cursors); there is no
125
+ # server-side collection filter.
126
+ #
127
+ # @param limit [Integer] Page size (1-200)
128
+ # @param after [String, nil] Opaque forward cursor (typically the last id)
129
+ # @return [Array<Hash>] One page of document metadata
130
+ def list_documents(limit: PAGE_SIZE, after: nil)
131
+ query = "limit=#{limit}"
132
+ query += "&after=#{URI.encode_www_form_component(after)}" if after
133
+ result = request(:get, "documents?#{query}")
134
+ return result if result.is_a?(Array)
135
+
136
+ result['items'] || result['data'] || []
137
+ end
138
+
139
+ # List every document in a collection, paging until exhausted.
140
+ #
141
+ # Filters client-side on `collectionId` since the API has no collection
142
+ # filter. ~5 calls for ~1000 documents; each goes through the rate limiter.
143
+ #
144
+ # @param collection_id [String] Collection UUID to filter to
145
+ # @return [Array<Hash>] All matching document metadata
146
+ def all_documents(collection_id:)
147
+ docs = []
148
+ after = nil
149
+
150
+ loop do
151
+ page = list_documents(limit: PAGE_SIZE, after: after)
152
+ break if page.empty?
153
+
154
+ docs.concat(page)
155
+ break if page.size < PAGE_SIZE
156
+
157
+ after = page.last['id']
158
+ # A full page with no cursor id would refetch page 1 forever —
159
+ # stop with what we have rather than loop against the budget.
160
+ break if after.nil?
161
+ end
162
+
163
+ docs.select { |doc| doc['collectionId'] == collection_id }
164
+ end
165
+
87
166
  private
88
167
 
89
168
  def request(method, path, body = nil)
@@ -118,7 +197,7 @@ module Woods
118
197
  http.request(req)
119
198
  rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNRESET, Errno::ECONNREFUSED => e
120
199
  attempts += 1
121
- raise Woods::Error, "Network error after #{attempts} retries: #{e.message}" if attempts >= MAX_RETRIES
200
+ raise Woods::Error, "Network error after #{attempts} retries: #{e.message}" if attempts > MAX_RETRIES
122
201
 
123
202
  sleep(2**attempts)
124
203
  retry
@@ -155,8 +234,10 @@ module Woods
155
234
  rescue JSON::ParserError, TypeError
156
235
  { 'message' => response.body&.slice(0, 200) || 'Unknown error' }
157
236
  end
158
- message = parsed['message'] || parsed['error'] || 'Unknown error'
159
- raise Woods::Error, "Unblocked API error #{response.code}: #{message}"
237
+ # The Unblocked API returns RFC7807-style bodies ({ status, title, detail });
238
+ # older/other paths use message/error. Check all so failures stay legible.
239
+ message = parsed['message'] || parsed['error'] || parsed['detail'] || parsed['title'] || 'Unknown error'
240
+ raise ApiError.new("Unblocked API error #{response.code}: #{message}", status: response.code.to_i)
160
241
  end
161
242
  end
162
243
  end