woods 1.1.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +186 -0
- data/README.md +20 -8
- data/exe/woods-console +51 -6
- data/exe/woods-console-mcp +24 -4
- data/exe/woods-mcp +30 -7
- data/exe/woods-mcp-http +47 -6
- data/lib/generators/woods/install_generator.rb +13 -4
- data/lib/generators/woods/templates/woods.rb.tt +155 -0
- data/lib/tasks/woods.rake +69 -50
- data/lib/woods/builder.rb +174 -9
- data/lib/woods/cache/cache_middleware.rb +360 -31
- data/lib/woods/chunking/semantic_chunker.rb +334 -7
- data/lib/woods/console/adapters/job_adapter.rb +10 -4
- data/lib/woods/console/audit_logger.rb +76 -4
- data/lib/woods/console/bridge.rb +48 -15
- data/lib/woods/console/bridge_protocol.rb +44 -0
- data/lib/woods/console/confirmation.rb +3 -4
- data/lib/woods/console/console_response_renderer.rb +56 -18
- data/lib/woods/console/credential_index.rb +201 -0
- data/lib/woods/console/credential_scanner.rb +302 -0
- data/lib/woods/console/dispatch_pipeline.rb +138 -0
- data/lib/woods/console/embedded_executor.rb +682 -35
- data/lib/woods/console/eval_guard.rb +319 -0
- data/lib/woods/console/model_validator.rb +1 -3
- data/lib/woods/console/rack_middleware.rb +185 -29
- data/lib/woods/console/redactor.rb +161 -0
- data/lib/woods/console/response_context.rb +127 -0
- data/lib/woods/console/safe_context.rb +220 -23
- data/lib/woods/console/scope_predicate_parser.rb +131 -0
- data/lib/woods/console/server.rb +417 -486
- data/lib/woods/console/sql_noise_stripper.rb +87 -0
- data/lib/woods/console/sql_table_scanner.rb +213 -0
- data/lib/woods/console/sql_validator.rb +81 -31
- data/lib/woods/console/table_gate.rb +93 -0
- data/lib/woods/console/tool_specs.rb +552 -0
- data/lib/woods/console/tools/tier1.rb +3 -3
- data/lib/woods/console/tools/tier4.rb +7 -1
- data/lib/woods/dependency_graph.rb +66 -7
- data/lib/woods/embedding/indexer.rb +190 -6
- data/lib/woods/embedding/openai.rb +40 -4
- data/lib/woods/embedding/provider.rb +104 -8
- data/lib/woods/embedding/text_preparer.rb +23 -3
- data/lib/woods/embedding/token_counter.rb +133 -0
- data/lib/woods/evaluation/baseline_runner.rb +20 -2
- data/lib/woods/evaluation/metrics.rb +4 -1
- data/lib/woods/extracted_unit.rb +1 -0
- data/lib/woods/extractor.rb +7 -1
- data/lib/woods/extractors/controller_extractor.rb +6 -0
- data/lib/woods/extractors/mailer_extractor.rb +16 -2
- data/lib/woods/extractors/model_extractor.rb +6 -1
- data/lib/woods/extractors/phlex_extractor.rb +13 -4
- data/lib/woods/extractors/rails_source_extractor.rb +2 -0
- data/lib/woods/extractors/route_helper_resolver.rb +130 -0
- data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
- data/lib/woods/extractors/view_component_extractor.rb +12 -1
- data/lib/woods/extractors/view_engines/base.rb +141 -0
- data/lib/woods/extractors/view_engines/erb.rb +145 -0
- data/lib/woods/extractors/view_template_extractor.rb +92 -133
- data/lib/woods/flow_assembler.rb +23 -15
- data/lib/woods/flow_precomputer.rb +21 -2
- data/lib/woods/graph_analyzer.rb +210 -0
- data/lib/woods/index_artifact.rb +173 -0
- data/lib/woods/mcp/bearer_auth.rb +45 -0
- data/lib/woods/mcp/bootstrap_state.rb +94 -0
- data/lib/woods/mcp/bootstrapper.rb +337 -16
- data/lib/woods/mcp/config_resolver.rb +288 -0
- data/lib/woods/mcp/errors.rb +134 -0
- data/lib/woods/mcp/index_reader.rb +265 -30
- data/lib/woods/mcp/origin_guard.rb +132 -0
- data/lib/woods/mcp/provider_probe.rb +166 -0
- data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
- data/lib/woods/mcp/renderers/markdown_renderer.rb +100 -3
- data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
- data/lib/woods/mcp/server.rb +771 -137
- data/lib/woods/model_name_cache.rb +78 -2
- data/lib/woods/notion/client.rb +25 -2
- data/lib/woods/notion/mappers/model_mapper.rb +36 -2
- data/lib/woods/railtie.rb +55 -15
- data/lib/woods/resilience/circuit_breaker.rb +9 -2
- data/lib/woods/resilience/retryable_provider.rb +40 -3
- data/lib/woods/resolved_config.rb +299 -0
- data/lib/woods/retrieval/context_assembler.rb +112 -5
- data/lib/woods/retrieval/query_classifier.rb +1 -1
- data/lib/woods/retrieval/ranker.rb +55 -6
- data/lib/woods/retrieval/search_executor.rb +42 -13
- data/lib/woods/retriever.rb +330 -24
- data/lib/woods/session_tracer/middleware.rb +35 -1
- data/lib/woods/storage/graph_store.rb +39 -0
- data/lib/woods/storage/inapplicable_backend.rb +14 -0
- data/lib/woods/storage/metadata_store.rb +129 -1
- data/lib/woods/storage/pgvector.rb +70 -8
- data/lib/woods/storage/qdrant.rb +196 -5
- data/lib/woods/storage/snapshotter/metadata.rb +172 -0
- data/lib/woods/storage/snapshotter/vector.rb +238 -0
- data/lib/woods/storage/snapshotter.rb +24 -0
- data/lib/woods/storage/vector_store.rb +184 -35
- data/lib/woods/tasks.rb +85 -0
- data/lib/woods/temporal/snapshot_store.rb +49 -1
- data/lib/woods/token_utils.rb +44 -5
- data/lib/woods/unblocked/client.rb +163 -0
- data/lib/woods/unblocked/document_builder.rb +326 -0
- data/lib/woods/unblocked/exporter.rb +201 -0
- data/lib/woods/unblocked/rate_limiter.rb +94 -0
- data/lib/woods/util/host_guard.rb +61 -0
- data/lib/woods/version.rb +1 -1
- data/lib/woods.rb +130 -6
- metadata +73 -4
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'set'
|
|
4
|
+
|
|
3
5
|
module Woods
|
|
4
6
|
module Storage
|
|
5
7
|
# VectorStore provides an interface for storing and searching embedding vectors.
|
|
@@ -36,11 +38,39 @@ module Woods
|
|
|
36
38
|
entries.each { |e| store(e[:id], e[:vector], e[:metadata] || {}) }
|
|
37
39
|
end
|
|
38
40
|
|
|
41
|
+
# Iterate over every live entry, yielding `(id, vector, metadata)`.
|
|
42
|
+
#
|
|
43
|
+
# Persistence seam for Snapshotter and similar consumers. Default
|
|
44
|
+
# implementation falls through to `NotImplementedError`; adapters
|
|
45
|
+
# that need to support dumping must implement it. Persistent
|
|
46
|
+
# backends (pgvector, Qdrant) aren't expected to implement this —
|
|
47
|
+
# the Snapshotter only touches non-persistent stores.
|
|
48
|
+
#
|
|
49
|
+
# @yield [id, vector, metadata]
|
|
50
|
+
# @return [Enumerator] when no block given
|
|
51
|
+
# Default implementation for adapters that do not support iteration.
#
# Always raises; the message names the concrete class so a failure from a
# consumer points at the adapter that is missing the method.
#
# @yield [id, vector, metadata]
# @raise [NotImplementedError] unless the adapter overrides this method
def each_entry
  raise NotImplementedError, "#{self.class} must implement #each_entry"
end
|
|
54
|
+
|
|
55
|
+
# Bulk-load pre-computed entries. Dual of {#each_entry} — the
|
|
56
|
+
# Snapshotter hydrates a store by feeding this the dump contents.
|
|
57
|
+
#
|
|
58
|
+
# @param entries [Enumerable<Hash>] Each entry has :id, :vector, :metadata keys
|
|
59
|
+
# Hydrate the store from an enumerable of entry hashes.
#
# The enumerable is materialised with #to_a before being handed to
# #store_batch, so lazy sources are accepted.
#
# @param entries [Enumerable<Hash>] Each entry has :id, :vector, :metadata keys
# @return [Object] whatever #store_batch returns
def bulk_load(entries)
  materialized = entries.to_a
  store_batch(materialized)
end
|
|
62
|
+
|
|
39
63
|
# Search for similar vectors using cosine similarity.
|
|
40
64
|
#
|
|
65
|
+
# Filter values may be scalars (exact match) or Arrays (membership
|
|
66
|
+
# match — "value ∈ array"). Adapters implement the membership
|
|
67
|
+
# semantics natively: in-memory loops, pgvector IN (...), Qdrant
|
|
68
|
+
# `match: { any: [...] }`.
|
|
69
|
+
#
|
|
41
70
|
# @param query_vector [Array<Float>] The query embedding vector
|
|
42
71
|
# @param limit [Integer] Maximum number of results to return
|
|
43
|
-
# @param filters [Hash] Optional metadata filters
|
|
72
|
+
# @param filters [Hash] Optional metadata filters — values may be
|
|
73
|
+
# scalars or Arrays
|
|
44
74
|
# @return [Array<SearchResult>] Results sorted by descending similarity
|
|
45
75
|
# @raise [NotImplementedError] if not implemented by adapter
|
|
46
76
|
def search(query_vector, limit: 10, filters: {})
|
|
@@ -87,79 +117,198 @@ module Woods
|
|
|
87
117
|
# store.search([1.0, 0.0], limit: 1)
|
|
88
118
|
# # => [#<SearchResult id="doc1", score=1.0, metadata={type: "model"}>]
|
|
89
119
|
#
|
|
90
|
-
class InMemory
|
|
120
|
+
class InMemory # rubocop:disable Metrics/ClassLength
  include Interface

  # Storage layout: one flat Array<Float> (@vectors_flat) holds every
  # vector back to back, so entry `idx` occupies
  # @vectors_flat[idx * @dim, @dim]. @ids and @metadata are parallel
  # arrays aligned on the same index. Deleted entries are tombstoned
  # (their index is added to @tombstones) rather than removed, so slot
  # positions never shift. Nothing in this class reclaims tombstoned
  # slots other than #clear!.
  def initialize
    reset_storage
  end

  # @return [Integer, nil] dimension fixed by the first stored vector;
  #   nil until something is stored or after {#clear!}. Deleting entries
  #   does not reset it.
  attr_reader :dim

  # @see Interface#store
  # @raise [ArgumentError] if the vector length differs from {#dim}
  def store(id, vector, metadata = {})
    @dim ||= vector.length
    unless vector.length == @dim
      raise ArgumentError,
            "Vector dimension mismatch (#{vector.length} vs #{@dim})"
    end

    # Keys are frozen so later mutation of the caller's string cannot
    # corrupt the id => index lookup.
    frozen_id = id.frozen? ? id : id.dup.freeze
    existing = @id_to_index[frozen_id]
    if existing
      overwrite(existing, vector, metadata)
    else
      append(frozen_id, vector, metadata)
    end
  end

  # @see Interface#bulk_load
  # Stores each entry as it is enumerated, one #store call per entry.
  # A missing :metadata key is treated as an empty Hash.
  #
  # @param entries [Enumerable<Hash>] Each entry has :id, :vector, :metadata keys
  def bulk_load(entries)
    entries.each { |entry| store(entry[:id], entry[:vector], entry[:metadata] || {}) }
  end

  # Drop every stored entry, restoring the store to its post-+new+ state
  # (including an unset dimension). Safe on an already-empty store.
  def clear!
    reset_storage
  end

  # @see Interface#each_entry
  # Tombstoned slots are skipped. Each yielded vector is a fresh Array
  # sliced out of the flat buffer.
  #
  # @yield [id, vector, metadata]
  # @return [Enumerator] when no block given
  def each_entry
    return enum_for(:each_entry) unless block_given?

    @ids.each_with_index do |id, idx|
      next if @tombstones.include?(idx)

      yield(id, @vectors_flat[idx * @dim, @dim], @metadata[idx])
    end
  end

  # @see Interface#search
  # @raise [ArgumentError] if the query length differs from {#dim}
  def search(query_vector, limit: 10, filters: {})
    # Nothing has ever been stored, so there is nothing to compare against.
    return [] if @dim.nil?

    unless query_vector.length == @dim
      raise ArgumentError,
            "Vector dimension mismatch (#{query_vector.length} vs #{@dim})"
    end

    scored = gather_candidates(query_vector, filters)
    scored.sort_by! { |r| -r.score }
    scored.first(limit)
  end

  # @see Interface#delete
  def delete(id)
    idx = @id_to_index.delete(id)
    @tombstones << idx if idx
  end

  # @see Interface#delete_by_filter
  # Accepts the same filter shapes as #search: a scalar matches by
  # equality, an Array matches when the metadata value is one of its
  # elements. Plain equality is checked first so an Array filter that is
  # equal to an Array-valued metadata entry still matches.
  def delete_by_filter(filters)
    @ids.each_with_index do |id, idx|
      next if @tombstones.include?(idx)

      meta = @metadata[idx]
      matches = filters.all? do |key, value|
        meta[key] == value || filter_match?(value, meta[key])
      end
      next unless matches

      @tombstones << idx
      @id_to_index.delete(id)
    end
  end

  # @see Interface#count
  # @return [Integer] number of live (non-tombstoned) entries
  def count
    @ids.size - @tombstones.size
  end

  private

  # Reset every backing structure to the empty state shared by
  # #initialize and #clear!.
  def reset_storage
    @dim = nil
    @ids = []             # Array<String> (frozen)
    @vectors_flat = []    # flat Array<Float>, length @ids.size * @dim
    @metadata = []        # Array<Hash>, index-aligned with @ids
    @id_to_index = {}     # id => Integer for O(1) delete/overwrite
    @tombstones = Set.new # indices of deleted slots
  end

  # Match a filter value against a metadata value. Arrays are
  # membership filters ("any of"); scalars are equality.
  def filter_match?(filter_value, meta_value)
    filter_value.is_a?(Array) ? filter_value.include?(meta_value) : filter_value == meta_value
  end

  # Append a new entry at the end of the flat buffer and index it.
  def append(id, vector, metadata)
    idx = @ids.size
    @ids << id
    @vectors_flat.concat(vector)
    @metadata << metadata
    @id_to_index[id] = idx
  end

  # Overwrite an existing entry in place and clear any tombstone on the
  # slot so the entry is live.
  def overwrite(idx, vector, metadata)
    # #store has already checked vector.length == @dim, so this replaces
    # exactly one slot's worth of floats.
    @vectors_flat[idx * @dim, @dim] = vector
    @metadata[idx] = metadata
    @tombstones.delete(idx)
  end

  # Walk every non-tombstoned index, apply filters, score survivors.
  # The filter check runs BEFORE the cosine kernel so filtered-out
  # entries never cost a dot product.
  def gather_candidates(query_vector, filters)
    scored = []
    unfiltered = filters.empty?
    total = @ids.size
    idx = 0
    while idx < total
      unless @tombstones.include?(idx)
        meta = @metadata[idx]
        if unfiltered || filters.all? { |key, wanted| filter_match?(wanted, meta[key]) }
          score = cosine_similarity_strided(query_vector, idx * @dim)
          scored << SearchResult.new(id: @ids[idx], score: score, metadata: meta)
        end
      end
      idx += 1
    end
    scored
  end

  # Cosine similarity between a query Array<Float> and the stored vector
  # at @vectors_flat[base, @dim]. Reads the flat buffer by offset instead
  # of slicing, so no copy of the stored vector is made per comparison.
  # Returns 0.0 when either vector has zero magnitude.
  def cosine_similarity_strided(query, base)
    len = @dim
    i = 0
    dot = 0.0
    mag_a = 0.0
    mag_b = 0.0
    while i < len
      a = query[i]
      b = @vectors_flat[base + i]
      dot += a * b
      mag_a += a * a
      mag_b += b * b
      i += 1
    end

    return 0.0 if mag_a.zero? || mag_b.zero?

    dot / (Math.sqrt(mag_a) * Math.sqrt(mag_b))
  end
end
|
|
165
314
|
end
|
data/lib/woods/tasks.rb
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'builder'
|
|
4
|
+
require_relative 'embedding/indexer'
|
|
5
|
+
require_relative 'embedding/text_preparer'
|
|
6
|
+
require_relative 'resolved_config'
|
|
7
|
+
|
|
8
|
+
module Woods
  # Small helpers invoked from `lib/tasks/woods.rake`.
  #
  # Keeps rake task bodies to a couple of lines each so the real work lives in
  # plain Ruby that can be unit-tested without Rake's global state.
  module Tasks
    module_function

    # Build an {Embedding::Indexer} wired to the provider and stores described
    # by {Woods.configuration}. Everything is constructed through {Builder} so
    # the configured embedding provider and vector store are honoured.
    #
    # The text preparer and chunker are built from the selected provider.
    # `metadata_store` is only built when the configuration declares one, and
    # `resolved_config` may be nil (see {.build_resolved_config}).
    # The output directory comes from the WOODS_OUTPUT environment variable
    # when set, otherwise from `config.output_dir`.
    #
    # @return [Embedding::Indexer]
    def build_embed_indexer
      config = Woods.configuration
      builder = Builder.new(config)
      provider = builder.build_embedding_provider
      metadata_store = config.metadata_store ? builder.build_metadata_store : nil

      Embedding::Indexer.new(
        provider: provider,
        text_preparer: builder.build_text_preparer(provider),
        vector_store: builder.build_vector_store,
        metadata_store: metadata_store,
        resolved_config: build_resolved_config(config, provider: provider),
        chunker: builder.build_chunker(provider),
        dump_retention_count: config.dump_retention_count,
        output_dir: ENV.fetch('WOODS_OUTPUT', config.output_dir)
      )
    end

    # Build a ResolvedConfig snapshot from the live Woods::Configuration.
    #
    # Returns nil when no embedding provider is configured, or when
    # {ResolvedConfig.from_configuration} raises. In the latter case the
    # failure is reported on stderr instead of being dropped silently, so an
    # unexpectedly missing resolved config can be traced to its cause.
    #
    # @param config [Object] configuration responding to #embedding_provider
    # @param provider [Object, nil] live provider forwarded to
    #   {ResolvedConfig.from_configuration}
    # @return [Object, nil] the resolved config, or nil on fallback
    def build_resolved_config(config, provider: nil)
      return nil unless config.embedding_provider

      ResolvedConfig.from_configuration(config, provider: provider)
    rescue StandardError => e
      warn "[woods] Could not build ResolvedConfig (#{e.class}: #{e.message}); " \
           'continuing without it.'
      nil
    end

    # Print an indexer stats hash to stdout. `mode:` only affects the header
    # line.
    #
    # @param stats [Hash] expects :processed, :skipped and :errors keys
    # @param mode [Symbol] :full or :incremental
    def print_embed_stats(stats, mode:)
      header = mode == :incremental ? 'Incremental embedding complete!' : 'Embedding complete!'
      puts
      puts header
      puts " Processed: #{stats[:processed]}"
      puts " Skipped: #{stats[:skipped]}"
      puts " Errors: #{stats[:errors]}"
    end
  end
end
|
|
@@ -23,10 +23,58 @@ module Woods
|
|
|
23
23
|
#
|
|
24
24
|
class SnapshotStore # rubocop:disable Metrics/ClassLength
|
|
25
25
|
# @param connection [Object] Database connection supporting #execute and #get_first_row
|
|
26
|
-
|
|
26
|
+
# @param validate_schema [Boolean] If true (default), probe both required
|
|
27
|
+
# tables at construction time and raise a descriptive error pointing at
|
|
28
|
+
# migrations 004+005 when they are missing. Set false in tests that
|
|
29
|
+
# construct the store with a bare mock.
|
|
30
|
+
# Keep a reference to the connection and, unless the caller opts out,
# verify the schema immediately.
#
# @param connection [Object] Database connection
# @param validate_schema [Boolean] when true (default) run #validate_schema!
def initialize(connection:, validate_schema: true)
  @db = connection
  return unless validate_schema

  validate_schema!
end
|
|
29
34
|
|
|
35
|
+
REQUIRED_TABLES = %w[woods_snapshots woods_snapshot_units].freeze
|
|
36
|
+
|
|
37
|
+
# Probe that `woods_snapshots` and `woods_snapshot_units` exist. If
|
|
38
|
+
# they don't, raise with guidance to run migrations 004 + 005 —
|
|
39
|
+
# without this, the first call to {#capture}/{#find} raises a generic
|
|
40
|
+
# adapter error that doesn't tell operators why.
|
|
41
|
+
#
|
|
42
|
+
# When the connection responds to `#columns` (ActiveRecord-shaped) or
|
|
43
|
+
# `#table_exists?`, use that — these are hard to spoof from a test
|
|
44
|
+
# mock, so a partial mock can no longer silently pass. Falls back to
|
|
45
|
+
# the `SELECT 1 FROM t LIMIT 1` probe for minimal connections.
|
|
46
|
+
#
|
|
47
|
+
# @raise [Woods::Error]
|
|
48
|
+
# Probe every table in REQUIRED_TABLES.
#
# A Woods::Error raised by the probe passes through unchanged; any other
# StandardError is re-raised as a Woods::Error whose message comes from
# #schema_error_message.
#
# @raise [Woods::Error]
def validate_schema!
  REQUIRED_TABLES.each do |table_name|
    probe_table!(table_name)
  end
rescue StandardError => failure
  raise if failure.is_a?(Woods::Error)

  raise Woods::Error, schema_error_message(failure)
end
|
|
55
|
+
|
|
56
|
+
private
|
|
57
|
+
|
|
58
|
+
# Check that +table+ exists, using the most specific capability the
# connection offers: #table_exists? first, then #columns, and finally a
# `SELECT 1` probe via #execute.
#
# @param table [String] table name
# @raise [Woods::Error] when #table_exists? is false or #columns is nil/empty
def probe_table!(table)
  if @db.respond_to?(:table_exists?)
    return if @db.table_exists?(table)

    raise Woods::Error, schema_error_message("table `#{table}` does not exist")
  end

  if @db.respond_to?(:columns)
    cols = @db.columns(table)
    return unless cols.nil? || cols.empty?

    raise Woods::Error, schema_error_message("no columns for `#{table}`")
  end

  # Minimal connections: let the query itself fail if the table is missing.
  @db.execute("SELECT 1 FROM #{table} LIMIT 1")
end
|
|
68
|
+
|
|
69
|
+
# Compose the operator-facing message for a failed schema probe.
#
# @param detail [#to_s] underlying error or description, appended verbatim
# @return [String]
def schema_error_message(detail)
  guidance = [
    'SnapshotStore requires the `woods_snapshots` and',
    '`woods_snapshot_units` tables (migrations 004 + 005 under',
    '`lib/woods/db/migrations/`). Run `rake woods:migrate` on the',
    'metadata DB and retry. Underlying error:'
  ].join(' ')

  "#{guidance} #{detail}"
end
|
|
75
|
+
|
|
76
|
+
public
|
|
77
|
+
|
|
30
78
|
# Capture a snapshot after extraction completes.
|
|
31
79
|
#
|
|
32
80
|
# Stores the manifest metadata and per-unit content hashes.
|
data/lib/woods/token_utils.rb
CHANGED
|
@@ -1,19 +1,58 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Woods
  # Shared token estimation utility — the single place that maps an
  # embedding provider to a chars-per-token ratio.
  #
  # Ratios:
  # - `:openai` / default — 4.0 chars/token.
  # - `:ollama` — 1.5 chars/token.
  #
  # Callers should prefer {.chars_per_token_for} over hardcoding a divisor
  # so a ratio change propagates from one place.
  module TokenUtils
    CHARS_PER_TOKEN_BY_PROVIDER = {
      openai: 4.0,
      ollama: 1.5
    }.freeze

    DEFAULT_CHARS_PER_TOKEN = CHARS_PER_TOKEN_BY_PROVIDER[:openai]

    module_function

    # Chars-per-token ratio for the given embedding provider.
    #
    # @param provider [Symbol, String, nil] Provider identifier. Unknown
    #   providers, nil, and values that cannot be converted to a Symbol fall
    #   back to {DEFAULT_CHARS_PER_TOKEN}.
    # @return [Float]
    def chars_per_token_for(provider)
      # Anything without #to_sym (nil included) looks up a nil key, which
      # is never present, so the default applies.
      key = provider.respond_to?(:to_sym) ? provider.to_sym : nil
      CHARS_PER_TOKEN_BY_PROVIDER.fetch(key, DEFAULT_CHARS_PER_TOKEN)
    end

    # Estimate token count for a string using the default (OpenAI) ratio.
    # Use {.estimate_tokens_for} when a specific provider is in play.
    #
    # @param text [String, nil] Text to estimate
    # @return [Integer] Estimated token count
    def estimate_tokens(text)
      estimate_tokens_for(text, provider: nil)
    end

    # Estimate token count for a string using the provider's ratio,
    # rounding up.
    #
    # @param text [String, nil] Text to estimate; nil counts as empty
    # @param provider [Symbol, String, nil] `:openai`, `:ollama`, or nil.
    # @return [Integer] Estimated token count (0 for nil or empty text)
    def estimate_tokens_for(text, provider:)
      (text.to_s.length / chars_per_token_for(provider)).ceil
    end
  end
end
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'net/http'
|
|
5
|
+
require 'uri'
|
|
6
|
+
require_relative 'rate_limiter'
|
|
7
|
+
|
|
8
|
+
module Woods
  module Unblocked
    # REST client for the Unblocked API v1.
    #
    # Handles document and collection CRUD with rate limiting, retries,
    # and error handling. Uses Net::HTTP for zero external dependencies.
    #
    # @example
    #   client = Client.new(api_token: "ubk_...")
    #   client.put_document(
    #     collection_id: "uuid",
    #     title: "Order (model)",
    #     body: "# Order\n...",
    #     uri: "https://github.com/org/repo/blob/main/app/models/order.rb"
    #   )
    #
    class Client
      BASE_URL = 'https://getunblocked.com/api/v1'
      MAX_RETRIES = 3
      DEFAULT_TIMEOUT = 30

      # @param api_token [String] Unblocked API token (Personal or Team)
      # @param rate_limiter [RateLimiter] Rate limiter instance; must respond
      #   to #track and yield to the given block
      # @raise [ArgumentError] if api_token is nil or blank
      def initialize(api_token:, rate_limiter: RateLimiter.new)
        raise ArgumentError, 'api_token is required' if api_token.nil? || api_token.to_s.strip.empty?

        @api_token = api_token
        @rate_limiter = rate_limiter
      end

      # Create or update a document (upsert by URI) via PUT /documents.
      #
      # @param collection_id [String] Target collection UUID
      # @param title [String] Document title (plain text)
      # @param body [String] Document body (Markdown preferred)
      # @param uri [String] Source URL (used as unique identifier and citation link)
      # @return [Hash] parsed API response, e.g. { "id" => "document-uuid" }
      # @raise [Woods::Error] on a non-success response or network failure
      def put_document(collection_id:, title:, body:, uri:)
        request(:put, 'documents', {
                  collectionId: collection_id,
                  title: title,
                  body: body,
                  uri: uri
                })
      end

      # Create a new collection via POST /collections.
      #
      # @param name [String] Collection name (1-32 chars)
      # @param description [String] Collection description (1-4096 chars)
      # @param icon_url [String, nil] Optional icon URL; omitted from the
      #   payload when nil
      # @return [Hash] parsed API response
      # @raise [Woods::Error] on a non-success response or network failure
      def create_collection(name:, description:, icon_url: nil)
        body = { name: name, description: description }
        body[:iconUrl] = icon_url if icon_url
        request(:post, 'collections', body)
      end

      # List all collections.
      #
      # Accepts several response shapes: a top-level JSON array, an object
      # wrapping the list under "items" or "data", or a single collection
      # object. An empty or unparseable body yields an empty list.
      #
      # @return [Array<Hash>] Collection objects
      # @raise [Woods::Error] on a non-success response or network failure
      def list_collections
        result = request(:get, 'collections')

        case result
        when Array then result
        when Hash
          # {} is what #parse_response returns for a blank body — not a collection.
          result.empty? ? [] : (result['items'] || result['data'] || [result])
        else
          []
        end
      end

      # Delete a document by ID via DELETE /documents/:id.
      #
      # @param document_id [String] Document UUID
      # @return [Hash] API response
      # @raise [Woods::Error] on a non-success response or network failure
      def delete_document(document_id:)
        request(:delete, "documents/#{document_id}")
      end

      private

      # Issue a request through the rate limiter, retrying HTTP 429 up to
      # MAX_RETRIES times before surfacing the error.
      def request(method, path, body = nil)
        retries = 0

        loop do
          response = @rate_limiter.track { execute_http(method, path, body) }

          return parse_response(response) if response.is_a?(Net::HTTPSuccess)

          raise_api_error(response) unless response.code == '429' && retries < MAX_RETRIES

          retries += 1
          sleep(retry_delay(response, retries))
        end
      end

      # Seconds to wait before retrying a 429.
      #
      # Uses the Retry-After header when it is a non-negative number of
      # seconds. Any other value (absent, an HTTP-date, garbage) falls back
      # to a linear backoff of `retries * 2` seconds rather than being
      # coerced to 0.0 and retried immediately.
      def retry_delay(response, retries)
        fallback = (retries * 2).to_f
        header = response['Retry-After']
        return fallback if header.nil?

        seconds = Float(header.to_s.strip, exception: false)
        seconds && seconds >= 0 ? seconds : fallback
      end

      # Perform one HTTPS request, retrying transient network failures with
      # exponential backoff (2, 4, 8 seconds) up to MAX_RETRIES times.
      def execute_http(method, path, body)
        attempts = 0
        begin
          uri = URI("#{BASE_URL}/#{path}")
          http = Net::HTTP.new(uri.host, uri.port)
          http.use_ssl = true
          http.open_timeout = DEFAULT_TIMEOUT
          http.read_timeout = DEFAULT_TIMEOUT

          req = build_request(method, uri, body)
          http.request(req)
        rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNRESET, Errno::ECONNREFUSED => e
          attempts += 1
          # attempts counts failures (initial try included), so the number
          # of retries actually made is MAX_RETRIES when we give up.
          raise Woods::Error, "Network error after #{MAX_RETRIES} retries: #{e.message}" if attempts > MAX_RETRIES

          sleep(2**attempts)
          retry
        end
      end

      # Build the Net::HTTP request object with auth and JSON headers.
      #
      # @raise [ArgumentError] for an unsupported HTTP method
      def build_request(method, uri, body)
        req = case method
              when :put then Net::HTTP::Put.new(uri)
              when :post then Net::HTTP::Post.new(uri)
              when :get then Net::HTTP::Get.new(uri)
              when :delete then Net::HTTP::Delete.new(uri)
              else raise ArgumentError, "Unsupported HTTP method: #{method}"
              end

        req['Authorization'] = "Bearer #{@api_token}"
        req['Content-Type'] = 'application/json'
        req.body = JSON.generate(body) if body

        req
      end

      # Parse a success response body as JSON. Blank or unparseable bodies
      # are deliberately treated as an empty Hash (best effort).
      def parse_response(response)
        return {} if response.body.nil? || response.body.strip.empty?

        JSON.parse(response.body)
      rescue JSON::ParserError
        {}
      end

      # Raise Woods::Error describing a non-success response.
      #
      # The message comes from the "message" or "error" key of a JSON object
      # body; for any other body the first 200 characters are used.
      def raise_api_error(response)
        parsed = begin
          JSON.parse(response.body)
        rescue JSON::ParserError, TypeError
          { 'message' => response.body&.slice(0, 200) || 'Unknown error' }
        end
        # Valid JSON that is not an object (array, string, number) has no
        # keys to read; fall back to the raw body snippet.
        parsed = { 'message' => response.body.to_s.slice(0, 200) } unless parsed.is_a?(Hash)

        message = parsed['message'] || parsed['error'] || 'Unknown error'
        raise Woods::Error, "Unblocked API error #{response.code}: #{message}"
      end
    end
  end
end
|