woods 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +186 -0
  3. data/README.md +20 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +69 -50
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +6 -0
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +210 -0
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +100 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +771 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +163 -0
  102. data/lib/woods/unblocked/document_builder.rb +326 -0
  103. data/lib/woods/unblocked/exporter.rb +201 -0
  104. data/lib/woods/unblocked/rate_limiter.rb +94 -0
  105. data/lib/woods/util/host_guard.rb +61 -0
  106. data/lib/woods/version.rb +1 -1
  107. data/lib/woods.rb +130 -6
  108. metadata +73 -4
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'set'
4
+
3
5
  module Woods
4
6
  module Storage
5
7
  # VectorStore provides an interface for storing and searching embedding vectors.
@@ -36,11 +38,39 @@ module Woods
36
38
  entries.each { |e| store(e[:id], e[:vector], e[:metadata] || {}) }
37
39
  end
38
40
 
41
+ # Iterate over every live entry, yielding `(id, vector, metadata)`.
42
+ #
43
+ # Persistence seam for Snapshotter and similar consumers. Default
44
+ # implementation falls through to `NotImplementedError`; adapters
45
+ # that need to support dumping must implement it. Persistent
46
+ # backends (pgvector, Qdrant) aren't expected to implement this —
47
+ # the Snapshotter only touches non-persistent stores.
48
+ #
49
+ # @yield [id, vector, metadata]
50
+ # @return [Enumerator] when no block given
51
+ def each_entry
52
+ raise NotImplementedError
53
+ end
54
+
55
+ # Bulk-load pre-computed entries. Dual of {#each_entry} — the
56
+ # Snapshotter hydrates a store by feeding this the dump contents.
57
+ #
58
+ # @param entries [Enumerable<Hash>] Each entry has :id, :vector, :metadata keys
59
+ def bulk_load(entries)
60
+ store_batch(entries.to_a)
61
+ end
62
+
39
63
  # Search for similar vectors using cosine similarity.
40
64
  #
65
+ # Filter values may be scalars (exact match) or Arrays (membership
66
+ # match — "value ∈ array"). Adapters implement the membership
67
+ # semantics natively: in-memory loops, pgvector IN (...), Qdrant
68
+ # `match: { any: [...] }`.
69
+ #
41
70
  # @param query_vector [Array<Float>] The query embedding vector
42
71
  # @param limit [Integer] Maximum number of results to return
43
- # @param filters [Hash] Optional metadata filters to apply
72
+ # @param filters [Hash] Optional metadata filters — values may be
73
+ # scalars or Arrays
44
74
  # @return [Array<SearchResult>] Results sorted by descending similarity
45
75
  # @raise [NotImplementedError] if not implemented by adapter
46
76
  def search(query_vector, limit: 10, filters: {})
@@ -87,79 +117,198 @@ module Woods
87
117
  # store.search([1.0, 0.0], limit: 1)
88
118
  # # => [#<SearchResult id="doc1", score=1.0, metadata={type: "model"}>]
89
119
  #
90
- class InMemory
120
+ class InMemory # rubocop:disable Metrics/ClassLength
91
121
  include Interface
92
122
 
123
+ # Flat-buffer backing. One Array<Float> of length count*dim holds
124
+ # every vector contiguously; two parallel Arrays hold the ids and
125
+ # metadata at matching positions. Deleted entries are tombstoned
126
+ # (their index is added to @tombstones) rather than removed, so
127
+ # stored vector positions stay stable under concurrent iteration
128
+ # and dumps. Tombstones are compacted at next full-embed run.
129
+ #
130
+ # The flat buffer exists both for cache friendliness during the
131
+ # cosine kernel (all vectors live in one contiguous allocation)
132
+ # and to make dump/load via `pack("e*")` a single call rather
133
+ # than a per-vector concatenation.
93
134
  def initialize
94
- @entries = {} # id => { vector:, metadata: }
135
+ @dim = nil
136
+ @ids = [] # Array<String> (frozen)
137
+ @vectors_flat = [] # flat Array<Float>, length @ids.size * @dim
138
+ @metadata = [] # Array<Hash>, index-aligned with @ids
139
+ @id_to_index = {} # id => Integer for O(1) delete/overwrite
140
+ @tombstones = Set.new
95
141
  end
96
142
 
143
+ # @return [Integer, nil] dimension of stored vectors, nil if empty
144
+ attr_reader :dim
145
+
97
146
  # @see Interface#store
98
147
  def store(id, vector, metadata = {})
99
- @entries[id] = { vector: vector, metadata: metadata }
148
+ @dim ||= vector.length
149
+ unless vector.length == @dim
150
+ raise ArgumentError,
151
+ "Vector dimension mismatch (#{vector.length} vs #{@dim})"
152
+ end
153
+
154
+ frozen_id = id.frozen? ? id : id.dup.freeze
155
+ existing = @id_to_index[frozen_id]
156
+ if existing
157
+ overwrite(existing, vector, metadata)
158
+ else
159
+ append(frozen_id, vector, metadata)
160
+ end
161
+ end
162
+
163
+ # @see Interface#bulk_load
164
+ # Single-pass hydrate — more efficient than N store calls when
165
+ # the Snapshotter feeds a large dump at boot time.
166
+ def bulk_load(entries)
167
+ entries.each { |entry| store(entry[:id], entry[:vector], entry[:metadata] || {}) }
168
+ end
169
+
170
+ # Drop every stored entry, restoring the store to its post-+new+ state.
171
+ #
172
+ # Used by the MCP +reload+ tool to pick up a fresh embed run without
173
+ # restarting the process. A subsequent +#bulk_load+ then repopulates
174
+ # from disk. Safe on an already-empty store.
175
+ def clear!
176
+ @dim = nil
177
+ @ids = []
178
+ @vectors_flat = []
179
+ @metadata = []
180
+ @id_to_index = {}
181
+ @tombstones = Set.new
182
+ end
183
+
184
+ # @see Interface#each_entry
185
+ def each_entry(&block)
186
+ return enum_for(:each_entry) unless block
187
+
188
+ @ids.each_with_index do |id, idx|
189
+ next if @tombstones.include?(idx)
190
+
191
+ base = idx * @dim
192
+ yield(id, @vectors_flat[base, @dim], @metadata[idx])
193
+ end
100
194
  end
101
195
 
102
196
  # @see Interface#search
103
197
  def search(query_vector, limit: 10, filters: {})
104
- candidates = filter_entries(filters)
198
+ return [] if @dim.nil?
105
199
 
106
- scored = candidates.map do |id, entry|
107
- score = cosine_similarity(query_vector, entry[:vector])
108
- SearchResult.new(id: id, score: score, metadata: entry[:metadata])
200
+ unless query_vector.length == @dim
201
+ raise ArgumentError,
202
+ "Vector dimension mismatch (#{query_vector.length} vs #{@dim})"
109
203
  end
110
- scored.sort_by { |r| -r.score }.first(limit)
204
+
205
+ scored = gather_candidates(query_vector, filters)
206
+ scored.sort_by! { |r| -r.score }
207
+ scored.first(limit)
111
208
  end
112
209
 
113
210
  # @see Interface#delete
114
211
  def delete(id)
115
- @entries.delete(id)
212
+ idx = @id_to_index.delete(id)
213
+ @tombstones << idx if idx
116
214
  end
117
215
 
118
216
  # @see Interface#delete_by_filter
119
217
  def delete_by_filter(filters)
120
- @entries.reject! do |_id, entry|
121
- filters.all? { |key, value| entry[:metadata][key] == value }
218
+ @ids.each_with_index do |id, idx|
219
+ next if @tombstones.include?(idx)
220
+ next unless filters.all? { |key, value| @metadata[idx][key] == value }
221
+
222
+ @tombstones << idx
223
+ @id_to_index.delete(id)
122
224
  end
123
225
  end
124
226
 
125
227
  # @see Interface#count
126
228
  def count
127
- @entries.size
229
+ @ids.size - @tombstones.size
128
230
  end
129
231
 
130
232
  private
131
233
 
132
- # Filter entries by metadata key-value pairs.
133
- #
134
- # @param filters [Hash] Metadata filters
135
- # @return [Hash] Filtered entries
136
- def filter_entries(filters)
137
- return @entries if filters.empty?
234
+ # Match a filter value against a metadata value. Arrays are
235
+ # membership filters ("any of"); scalars are equality.
236
+ def filter_match?(filter_value, meta_value)
237
+ filter_value.is_a?(Array) ? filter_value.include?(meta_value) : filter_value == meta_value
238
+ end
239
+
240
+ # Append a new entry to the flat buffer.
241
+ def append(id, vector, metadata)
242
+ idx = @ids.size
243
+ @ids << id
244
+ @vectors_flat.concat(vector)
245
+ @metadata << metadata
246
+ @id_to_index[id] = idx
247
+ end
138
248
 
139
- @entries.select do |_id, entry|
140
- filters.all? { |key, value| entry[:metadata][key] == value }
249
+ # Overwrite an existing entry in place. Tombstones the old slot's
250
+ # deletion marker (if any) so the new vector is live again.
251
+ def overwrite(idx, vector, metadata)
252
+ base = idx * @dim
253
+ i = 0
254
+ while i < @dim
255
+ @vectors_flat[base + i] = vector[i]
256
+ i += 1
141
257
  end
258
+ @metadata[idx] = metadata
259
+ @tombstones.delete(idx)
142
260
  end
143
261
 
144
- # Compute cosine similarity between two vectors.
145
- #
146
- # @param vec_a [Array<Float>] First vector
147
- # @param vec_b [Array<Float>] Second vector
148
- # @return [Float] Cosine similarity between -1.0 and 1.0
149
- # @raise [ArgumentError] if vectors have different dimensions
150
- def cosine_similarity(vec_a, vec_b)
151
- unless vec_a.length == vec_b.length
152
- raise ArgumentError,
153
- "Vector dimension mismatch (#{vec_a.length} vs #{vec_b.length})"
262
+ # Walk every non-tombstoned index, apply filters, score survivors.
263
+ # Filter check runs BEFORE the cosine kernel — avoids computing
264
+ # 12k dot products only to discard most of them.
265
+ def gather_candidates(query_vector, filters)
266
+ scored = []
267
+ len = @ids.size
268
+ idx = 0
269
+ while idx < len
270
+ if @tombstones.include?(idx)
271
+ idx += 1
272
+ next
273
+ end
274
+ meta = @metadata[idx]
275
+ unless filters.empty? || filters.all? { |k, v| filter_match?(v, meta[k]) }
276
+ idx += 1
277
+ next
278
+ end
279
+
280
+ score = cosine_similarity_strided(query_vector, idx * @dim)
281
+ scored << SearchResult.new(id: @ids[idx], score: score, metadata: meta)
282
+ idx += 1
154
283
  end
284
+ scored
285
+ end
155
286
 
156
- dot = vec_a.zip(vec_b).sum { |x, y| x * y }
157
- mag_a = Math.sqrt(vec_a.sum { |x| x**2 })
158
- mag_b = Math.sqrt(vec_b.sum { |x| x**2 })
287
+ # Cosine similarity between a query Array<Float> and a vector
288
+ # that lives at @vectors_flat[base, @dim]. Strided access avoids
289
+ # allocating a copy of the stored vector on every comparison.
290
+ #
291
+ # See bench/vector_query_and_serialization.rb for the allocation
292
+ # story — the old Enumerable path allocated ~770 objects per pair;
293
+ # this loop allocates none inside the hot path.
294
+ def cosine_similarity_strided(query, base)
295
+ len = @dim
296
+ i = 0
297
+ dot = 0.0
298
+ mag_a = 0.0
299
+ mag_b = 0.0
300
+ while i < len
301
+ a = query[i]
302
+ b = @vectors_flat[base + i]
303
+ dot += a * b
304
+ mag_a += a * a
305
+ mag_b += b * b
306
+ i += 1
307
+ end
159
308
 
160
309
  return 0.0 if mag_a.zero? || mag_b.zero?
161
310
 
162
- dot / (mag_a * mag_b)
311
+ dot / (Math.sqrt(mag_a) * Math.sqrt(mag_b))
163
312
  end
164
313
  end
165
314
  end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'builder'
4
+ require_relative 'embedding/indexer'
5
+ require_relative 'embedding/text_preparer'
6
+ require_relative 'resolved_config'
7
+
8
+ module Woods
9
+ # Small helpers invoked from `lib/tasks/woods.rake`.
10
+ #
11
+ # Keeps rake task bodies to a couple of lines each so the real work lives in
12
+ # plain Ruby that can be unit-tested without Rake's global state.
13
+ module Tasks
14
+ module_function
15
+
16
+ # Build an {Embedding::Indexer} wired to the provider and stores described
17
+ # by {Woods.configuration}. Uses {Builder} so `config.embedding_provider`,
18
+ # `config.embedding_options`, and `config.vector_store(_options)` are all
19
+ # honoured — prior to this the rake tasks hardcoded Ollama + InMemory and
20
+ # silently ignored configuration, which was invisible until the provider
21
+ # tried to reach an unreachable default host.
22
+ #
23
+ # The TextPreparer and SemanticChunker are tuned to the selected
24
+ # provider so oversize units are split into chunks that fit the
25
+ # provider's input budget (e.g. Ollama's num_ctx, OpenAI's 8k cap).
26
+ #
27
+ # @return [Embedding::Indexer]
28
+ def build_embed_indexer
29
+ config = Woods.configuration
30
+ builder = Builder.new(config)
31
+ provider = builder.build_embedding_provider
32
+
33
+ # Wire the persistence-arc pieces (resolved_config, metadata_store,
34
+ # dump_retention_count) so Indexer#persist_snapshot can write
35
+ # woods.json, dump metadata, and honour the user's retention setting.
36
+ # Without these kwargs, embed writes vectors.bin + latest pointer but
37
+ # never writes woods.json — which breaks the standalone woods-mcp
38
+ # Shape-2 boot path entirely.
39
+ #
40
+ # metadata_store and resolved_config are nil-safe — hosts that don't
41
+ # configure metadata or that pre-date the persistence arc still work.
42
+ Embedding::Indexer.new(
43
+ provider: provider,
44
+ text_preparer: builder.build_text_preparer(provider),
45
+ vector_store: builder.build_vector_store,
46
+ metadata_store: config.metadata_store ? builder.build_metadata_store : nil,
47
+ resolved_config: build_resolved_config(config, provider: provider),
48
+ chunker: builder.build_chunker(provider),
49
+ dump_retention_count: config.dump_retention_count,
50
+ output_dir: ENV.fetch('WOODS_OUTPUT', config.output_dir)
51
+ )
52
+ end
53
+
54
+ # Build a ResolvedConfig snapshot from the live Woods::Configuration.
55
+ # Returns nil if the configuration doesn't have enough to produce one
56
+ # (pre-persistence-arc hosts) so the Indexer falls back to the legacy
57
+ # dump-without-woods.json behaviour.
58
+ #
59
+ # Passes the live +provider+ so {ResolvedConfig.from_configuration} can
60
+ # probe +provider.dimensions+ — without this, Ollama snapshots record
61
+ # +dimension: 0+ and every subsequent MCP boot fails a spurious
62
+ # dimension-mismatch check against the real stored vectors.
63
+ def build_resolved_config(config, provider: nil)
64
+ return nil unless config.embedding_provider
65
+
66
+ ResolvedConfig.from_configuration(config, provider: provider)
67
+ rescue StandardError
68
+ nil
69
+ end
70
+
71
+ # Print an indexer stats hash in the format the rake tasks have historically
72
+ # used. `mode:` only affects the header line.
73
+ #
74
+ # @param stats [Hash]
75
+ # @param mode [Symbol] :full or :incremental
76
+ def print_embed_stats(stats, mode:)
77
+ header = mode == :incremental ? 'Incremental embedding complete!' : 'Embedding complete!'
78
+ puts
79
+ puts header
80
+ puts " Processed: #{stats[:processed]}"
81
+ puts " Skipped: #{stats[:skipped]}"
82
+ puts " Errors: #{stats[:errors]}"
83
+ end
84
+ end
85
+ end
@@ -23,10 +23,58 @@ module Woods
23
23
  #
24
24
  class SnapshotStore # rubocop:disable Metrics/ClassLength
25
25
  # @param connection [Object] Database connection supporting #execute and #get_first_row
26
- def initialize(connection:)
26
+ # @param validate_schema [Boolean] If true (default), probe both required
27
+ # tables at construction time and raise a descriptive error pointing at
28
+ # migrations 004+005 when they are missing. Set false in tests that
29
+ # construct the store with a bare mock.
30
+ def initialize(connection:, validate_schema: true)
27
31
  @db = connection
32
+ validate_schema! if validate_schema
28
33
  end
29
34
 
35
+ REQUIRED_TABLES = %w[woods_snapshots woods_snapshot_units].freeze
36
+
37
+ # Probe that `woods_snapshots` and `woods_snapshot_units` exist. If
38
+ # they don't, raise with guidance to run migrations 004 + 005 —
39
+ # without this, the first call to {#capture}/{#find} raises a generic
40
+ # adapter error that doesn't tell operators why.
41
+ #
42
+ # When the connection responds to `#columns` (ActiveRecord-shaped) or
43
+ # `#table_exists?`, use that — these are hard to spoof from a test
44
+ # mock, so a partial mock can no longer silently pass. Falls back to
45
+ # the `SELECT 1 FROM t LIMIT 1` probe for minimal connections.
46
+ #
47
+ # @raise [Woods::Error]
48
+ def validate_schema!
49
+ REQUIRED_TABLES.each { |t| probe_table!(t) }
50
+ rescue Woods::Error
51
+ raise
52
+ rescue StandardError => e
53
+ raise Woods::Error, schema_error_message(e)
54
+ end
55
+
56
+ private
57
+
58
+ def probe_table!(table)
59
+ if @db.respond_to?(:table_exists?)
60
+ raise Woods::Error, schema_error_message("table `#{table}` does not exist") unless @db.table_exists?(table)
61
+ elsif @db.respond_to?(:columns)
62
+ cols = @db.columns(table)
63
+ raise Woods::Error, schema_error_message("no columns for `#{table}`") if cols.nil? || cols.empty?
64
+ else
65
+ @db.execute("SELECT 1 FROM #{table} LIMIT 1")
66
+ end
67
+ end
68
+
69
+ def schema_error_message(detail)
70
+ 'SnapshotStore requires the `woods_snapshots` and ' \
71
+ '`woods_snapshot_units` tables (migrations 004 + 005 under ' \
72
+ '`lib/woods/db/migrations/`). Run `rake woods:migrate` on the ' \
73
+ "metadata DB and retry. Underlying error: #{detail}"
74
+ end
75
+
76
+ public
77
+
30
78
  # Capture a snapshot after extraction completes.
31
79
  #
32
80
  # Stores the manifest metadata and per-unit content hashes.
@@ -1,19 +1,58 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Woods
4
- # Shared token estimation utility.
4
+ # Shared token estimation utility — the single source of truth for the
5
+ # chars-per-token ratio used across cost estimation, context assembly,
6
+ # and embedding budgeting.
5
7
  #
6
- # Uses project convention: (string.length / 4.0).ceil
7
- # See docs/TOKEN_BENCHMARK.md — conservative floor (~10.6% overestimate).
8
+ # Ratios:
9
+ # - `:openai` / default — 4.0 chars/token. Benchmarked against tiktoken
10
+ # (cl100k_base) on 19 Ruby source files (mean 4.41 chars/token). We use
11
+ # 4.0 as a conservative floor (~10.6 % overestimate) so truncation never
12
+ # hands the model more tokens than it budgeted for. See
13
+ # `docs/TOKEN_BENCHMARK.md`.
14
+ # - `:ollama` — 1.5 chars/token. Matches the BERT WordPiece tokenizers
15
+ # used by nomic-embed-text and mxbai-embed-large. See
16
+ # `docs/EMBEDDING_MODELS.md` and `Woods::Builder#chars_per_token_for`.
17
+ #
18
+ # Callers should prefer {.chars_per_token_for} over hardcoding a divisor
19
+ # so future tokenizer changes propagate in one place instead of drifting
20
+ # between {ContextAssembler}, {Builder}, and cost-model components.
8
21
  module TokenUtils
22
+ CHARS_PER_TOKEN_BY_PROVIDER = {
23
+ openai: 4.0,
24
+ ollama: 1.5
25
+ }.freeze
26
+
27
+ DEFAULT_CHARS_PER_TOKEN = CHARS_PER_TOKEN_BY_PROVIDER[:openai]
28
+
9
29
  module_function
10
30
 
11
- # Estimate token count for a string.
31
+ # Chars-per-token ratio for the given embedding provider.
32
+ #
33
+ # @param provider [Symbol, String, nil] Provider identifier. Unknown or
34
+ # nil providers fall back to {DEFAULT_CHARS_PER_TOKEN}.
35
+ # @return [Float]
36
+ def chars_per_token_for(provider)
37
+ CHARS_PER_TOKEN_BY_PROVIDER.fetch(provider&.to_sym, DEFAULT_CHARS_PER_TOKEN)
38
+ end
39
+
40
+ # Estimate token count for a string using the default (OpenAI) ratio.
41
+ # Use {.estimate_tokens_for} when a specific provider is in play.
12
42
  #
13
43
  # @param text [String] Text to estimate
14
44
  # @return [Integer] Estimated token count
15
45
  def estimate_tokens(text)
16
- (text.length / 4.0).ceil
46
+ estimate_tokens_for(text, provider: nil)
47
+ end
48
+
49
+ # Estimate token count for a string using the provider's native ratio.
50
+ #
51
+ # @param text [String] Text to estimate
52
+ # @param provider [Symbol, String, nil] `:openai`, `:ollama`, or nil.
53
+ # @return [Integer] Estimated token count
54
+ def estimate_tokens_for(text, provider:)
55
+ (text.length / chars_per_token_for(provider)).ceil
17
56
  end
18
57
  end
19
58
  end
@@ -0,0 +1,163 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'net/http'
5
+ require 'uri'
6
+ require_relative 'rate_limiter'
7
+
8
+ module Woods
9
+ module Unblocked
10
+ # REST client for the Unblocked API v1.
11
+ #
12
+ # Handles document and collection CRUD with rate limiting, retries,
13
+ # and error handling. Uses Net::HTTP for zero external dependencies.
14
+ #
15
+ # @example
16
+ # client = Client.new(api_token: "ubk_...")
17
+ # client.put_document(
18
+ # collection_id: "uuid",
19
+ # title: "Order (model)",
20
+ # body: "# Order\n...",
21
+ # uri: "https://github.com/org/repo/blob/main/app/models/order.rb"
22
+ # )
23
+ #
24
+ class Client
25
+ BASE_URL = 'https://getunblocked.com/api/v1'
26
+ MAX_RETRIES = 3
27
+ DEFAULT_TIMEOUT = 30
28
+
29
+ # @param api_token [String] Unblocked API token (Personal or Team)
30
+ # @param rate_limiter [RateLimiter] Rate limiter instance
31
+ # @raise [ArgumentError] if api_token is nil or empty
32
+ def initialize(api_token:, rate_limiter: RateLimiter.new)
33
+ raise ArgumentError, 'api_token is required' if api_token.nil? || api_token.to_s.strip.empty?
34
+
35
+ @api_token = api_token
36
+ @rate_limiter = rate_limiter
37
+ end
38
+
39
+ # Create or update a document (upsert by URI).
40
+ #
41
+ # Documents are unique by `uri` across the organization. If a document
42
+ # with the given URI exists, it is updated; otherwise it is created.
43
+ # Documents become available for queries within ~1 minute.
44
+ #
45
+ # @param collection_id [String] Target collection UUID
46
+ # @param title [String] Document title (plain text)
47
+ # @param body [String] Document body (Markdown preferred)
48
+ # @param uri [String] Source URL (used as unique identifier and citation link)
49
+ # @return [Hash] { "id" => "document-uuid" }
50
+ def put_document(collection_id:, title:, body:, uri:)
51
+ request(:put, 'documents', {
52
+ collectionId: collection_id,
53
+ title: title,
54
+ body: body,
55
+ uri: uri
56
+ })
57
+ end
58
+
59
+ # Create a new collection.
60
+ #
61
+ # @param name [String] Collection name (1-32 chars)
62
+ # @param description [String] Collection description (1-4096 chars)
63
+ # @param icon_url [String, nil] Optional icon URL
64
+ # @return [Hash] { "id" => "collection-uuid", "name" => "...", ... }
65
+ def create_collection(name:, description:, icon_url: nil)
66
+ body = { name: name, description: description }
67
+ body[:iconUrl] = icon_url if icon_url
68
+ request(:post, 'collections', body)
69
+ end
70
+
71
+ # List all collections.
72
+ #
73
+ # @return [Array<Hash>] Collection objects
74
+ def list_collections
75
+ result = request(:get, 'collections')
76
+ result['items'] || result['data'] || [result].flatten.compact
77
+ end
78
+
79
+ # Delete a document by ID.
80
+ #
81
+ # @param document_id [String] Document UUID
82
+ # @return [Hash] API response
83
+ def delete_document(document_id:)
84
+ request(:delete, "documents/#{document_id}")
85
+ end
86
+
87
+ private
88
+
89
+ def request(method, path, body = nil)
90
+ retries = 0
91
+
92
+ loop do
93
+ response = @rate_limiter.track { execute_http(method, path, body) }
94
+
95
+ return parse_response(response) if response.is_a?(Net::HTTPSuccess)
96
+
97
+ if response.code == '429' && retries < MAX_RETRIES
98
+ retries += 1
99
+ wait_time = (response['Retry-After'] || (retries * 2)).to_f
100
+ sleep(wait_time)
101
+ next
102
+ end
103
+
104
+ raise_api_error(response)
105
+ end
106
+ end
107
+
108
+ def execute_http(method, path, body)
109
+ attempts = 0
110
+ begin
111
+ uri = URI("#{BASE_URL}/#{path}")
112
+ http = Net::HTTP.new(uri.host, uri.port)
113
+ http.use_ssl = true
114
+ http.open_timeout = DEFAULT_TIMEOUT
115
+ http.read_timeout = DEFAULT_TIMEOUT
116
+
117
+ req = build_request(method, uri, body)
118
+ http.request(req)
119
+ rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNRESET, Errno::ECONNREFUSED => e
120
+ attempts += 1
121
+ raise Woods::Error, "Network error after #{attempts} retries: #{e.message}" if attempts > MAX_RETRIES
122
+
123
+ sleep(2**attempts)
124
+ retry
125
+ end
126
+ end
127
+
128
+ def build_request(method, uri, body)
129
+ req = case method
130
+ when :put then Net::HTTP::Put.new(uri)
131
+ when :post then Net::HTTP::Post.new(uri)
132
+ when :get then Net::HTTP::Get.new(uri)
133
+ when :delete then Net::HTTP::Delete.new(uri)
134
+ else raise ArgumentError, "Unsupported HTTP method: #{method}"
135
+ end
136
+
137
+ req['Authorization'] = "Bearer #{@api_token}"
138
+ req['Content-Type'] = 'application/json'
139
+ req.body = JSON.generate(body) if body
140
+
141
+ req
142
+ end
143
+
144
+ def parse_response(response)
145
+ return {} if response.body.nil? || response.body.strip.empty?
146
+
147
+ JSON.parse(response.body)
148
+ rescue JSON::ParserError
149
+ {}
150
+ end
151
+
152
+ def raise_api_error(response)
153
+ parsed = begin
154
+ JSON.parse(response.body)
155
+ rescue JSON::ParserError, TypeError
156
+ { 'message' => response.body&.slice(0, 200) || 'Unknown error' }
157
+ end
158
+ message = parsed['message'] || parsed['error'] || 'Unknown error'
159
+ raise Woods::Error, "Unblocked API error #{response.code}: #{message}"
160
+ end
161
+ end
162
+ end
163
+ end