woods 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +169 -0
- data/README.md +20 -8
- data/exe/woods-console +51 -6
- data/exe/woods-console-mcp +24 -4
- data/exe/woods-mcp +30 -7
- data/exe/woods-mcp-http +47 -6
- data/lib/generators/woods/install_generator.rb +13 -4
- data/lib/generators/woods/templates/woods.rb.tt +155 -0
- data/lib/tasks/woods.rake +15 -50
- data/lib/woods/builder.rb +174 -9
- data/lib/woods/cache/cache_middleware.rb +360 -31
- data/lib/woods/chunking/semantic_chunker.rb +334 -7
- data/lib/woods/console/adapters/job_adapter.rb +10 -4
- data/lib/woods/console/audit_logger.rb +76 -4
- data/lib/woods/console/bridge.rb +48 -15
- data/lib/woods/console/bridge_protocol.rb +44 -0
- data/lib/woods/console/confirmation.rb +3 -4
- data/lib/woods/console/console_response_renderer.rb +56 -18
- data/lib/woods/console/credential_index.rb +201 -0
- data/lib/woods/console/credential_scanner.rb +302 -0
- data/lib/woods/console/dispatch_pipeline.rb +138 -0
- data/lib/woods/console/embedded_executor.rb +682 -35
- data/lib/woods/console/eval_guard.rb +319 -0
- data/lib/woods/console/model_validator.rb +1 -3
- data/lib/woods/console/rack_middleware.rb +185 -29
- data/lib/woods/console/redactor.rb +161 -0
- data/lib/woods/console/response_context.rb +127 -0
- data/lib/woods/console/safe_context.rb +220 -23
- data/lib/woods/console/scope_predicate_parser.rb +131 -0
- data/lib/woods/console/server.rb +417 -486
- data/lib/woods/console/sql_noise_stripper.rb +87 -0
- data/lib/woods/console/sql_table_scanner.rb +213 -0
- data/lib/woods/console/sql_validator.rb +81 -31
- data/lib/woods/console/table_gate.rb +93 -0
- data/lib/woods/console/tool_specs.rb +552 -0
- data/lib/woods/console/tools/tier1.rb +3 -3
- data/lib/woods/console/tools/tier4.rb +7 -1
- data/lib/woods/dependency_graph.rb +66 -7
- data/lib/woods/embedding/indexer.rb +190 -6
- data/lib/woods/embedding/openai.rb +40 -4
- data/lib/woods/embedding/provider.rb +104 -8
- data/lib/woods/embedding/text_preparer.rb +23 -3
- data/lib/woods/embedding/token_counter.rb +133 -0
- data/lib/woods/evaluation/baseline_runner.rb +20 -2
- data/lib/woods/evaluation/metrics.rb +4 -1
- data/lib/woods/extracted_unit.rb +1 -0
- data/lib/woods/extractor.rb +7 -1
- data/lib/woods/extractors/controller_extractor.rb +6 -0
- data/lib/woods/extractors/mailer_extractor.rb +16 -2
- data/lib/woods/extractors/model_extractor.rb +6 -1
- data/lib/woods/extractors/phlex_extractor.rb +13 -4
- data/lib/woods/extractors/rails_source_extractor.rb +2 -0
- data/lib/woods/extractors/route_helper_resolver.rb +130 -0
- data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
- data/lib/woods/extractors/view_component_extractor.rb +12 -1
- data/lib/woods/extractors/view_engines/base.rb +141 -0
- data/lib/woods/extractors/view_engines/erb.rb +145 -0
- data/lib/woods/extractors/view_template_extractor.rb +92 -133
- data/lib/woods/flow_assembler.rb +23 -15
- data/lib/woods/flow_precomputer.rb +21 -2
- data/lib/woods/graph_analyzer.rb +3 -4
- data/lib/woods/index_artifact.rb +173 -0
- data/lib/woods/mcp/bearer_auth.rb +45 -0
- data/lib/woods/mcp/bootstrap_state.rb +94 -0
- data/lib/woods/mcp/bootstrapper.rb +337 -16
- data/lib/woods/mcp/config_resolver.rb +288 -0
- data/lib/woods/mcp/errors.rb +134 -0
- data/lib/woods/mcp/index_reader.rb +265 -30
- data/lib/woods/mcp/origin_guard.rb +132 -0
- data/lib/woods/mcp/provider_probe.rb +166 -0
- data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
- data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
- data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
- data/lib/woods/mcp/server.rb +737 -137
- data/lib/woods/model_name_cache.rb +78 -2
- data/lib/woods/notion/client.rb +25 -2
- data/lib/woods/notion/mappers/model_mapper.rb +36 -2
- data/lib/woods/railtie.rb +55 -15
- data/lib/woods/resilience/circuit_breaker.rb +9 -2
- data/lib/woods/resilience/retryable_provider.rb +40 -3
- data/lib/woods/resolved_config.rb +299 -0
- data/lib/woods/retrieval/context_assembler.rb +112 -5
- data/lib/woods/retrieval/query_classifier.rb +1 -1
- data/lib/woods/retrieval/ranker.rb +55 -6
- data/lib/woods/retrieval/search_executor.rb +42 -13
- data/lib/woods/retriever.rb +330 -24
- data/lib/woods/session_tracer/middleware.rb +35 -1
- data/lib/woods/storage/graph_store.rb +39 -0
- data/lib/woods/storage/inapplicable_backend.rb +14 -0
- data/lib/woods/storage/metadata_store.rb +129 -1
- data/lib/woods/storage/pgvector.rb +70 -8
- data/lib/woods/storage/qdrant.rb +196 -5
- data/lib/woods/storage/snapshotter/metadata.rb +172 -0
- data/lib/woods/storage/snapshotter/vector.rb +238 -0
- data/lib/woods/storage/snapshotter.rb +24 -0
- data/lib/woods/storage/vector_store.rb +184 -35
- data/lib/woods/tasks.rb +85 -0
- data/lib/woods/temporal/snapshot_store.rb +49 -1
- data/lib/woods/token_utils.rb +44 -5
- data/lib/woods/unblocked/client.rb +1 -1
- data/lib/woods/unblocked/document_builder.rb +35 -10
- data/lib/woods/unblocked/exporter.rb +1 -1
- data/lib/woods/util/host_guard.rb +61 -0
- data/lib/woods/version.rb +1 -1
- data/lib/woods.rb +126 -6
- metadata +69 -4
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'time'
|
|
5
|
+
require_relative 'mcp/errors'
|
|
6
|
+
|
|
7
|
+
module Woods
  # Immutable Whole Value representing the resolved embedding configuration
  # captured at embed time and read back by the MCP server at boot.
  #
  # This is NOT a declared-config bag of fields — it records what was
  # *actually used* during embedding (provider class, model, host, dimension,
  # store types). The MCP server compares a stored {ResolvedConfig} against
  # the current host config to detect incompatible re-deployments.
  #
  # Build via {.from_hash} (parses +woods.json+) or {.from_configuration}
  # (captures the current {Woods::Configuration} at embed time).
  #
  # @example Parsing woods.json
  #   config = Woods::ResolvedConfig.from_hash(JSON.parse(File.read("woods.json")))
  #   config.dimension          # => 768
  #   config.provider_signature # => "Ollama/nomic-embed-text@http://host.docker.internal:11434"
  #
  # @example Asserting compatibility before hydrating stores
  #   stored = Woods::ResolvedConfig.from_hash(snapshot)
  #   live   = Woods::ResolvedConfig.from_configuration(Woods.configuration)
  #   live.assert_compatible!(stored)
  class ResolvedConfig # rubocop:disable Metrics/ClassLength
    # The only schema version this gem release can read or write.
    SCHEMA_VERSION_SUPPORTED = 1

    # @return [Integer]
    attr_reader :schema_version

    # @return [String] Gem version at embed time (e.g. "1.2.0")
    attr_reader :gem_version

    # @return [Time]
    attr_reader :created_at

    # @return [Hash] Provider details — :class, :model, :host, :num_ctx, :read_timeout, :dimension
    attr_reader :embedding_provider

    # @return [Hash] Store types — :vector_store, :metadata_store, :graph_store (Symbols)
    attr_reader :stores

    # Parse a +woods.json+ hash into a {ResolvedConfig}.
    #
    # @param raw [Hash] Parsed JSON hash (string or symbol keys)
    # @return [ResolvedConfig]
    # @raise [Woods::MCP::UnsupportedArtifact] if schema_version is not supported
    def self.from_hash(raw)
      data = normalize_keys(raw)
      version = data[:schema_version].to_i
      validate_schema_version!(version)

      new(
        schema_version: version,
        gem_version: data[:gem_version].to_s,
        created_at: parse_time(data[:created_at]),
        embedding_provider: parse_provider(data[:embedding_provider] || {}),
        stores: parse_stores(data[:stores] || {})
      )
    end

    # Capture the current {Woods::Configuration} as a {ResolvedConfig}.
    #
    # Dimension resolution order:
    #
    # 1. +config.embedding_options[:dimension]+ (symbol or string key) when
    #    it is declared — a declared value always wins.
    # 2. Otherwise +provider.dimensions+, when a live +provider:+ responding
    #    to +#dimensions+ is passed (this is how a dimension that comes from
    #    the model rather than the config gets recorded).
    # 3. Otherwise +0+ (nothing declared and nothing to probe; handy for
    #    specs and offline construction).
    #
    # @param config [Woods::Configuration]
    # @param gem_version [String] Defaults to {Woods::VERSION}; the version
    #   file is only loaded when this argument is omitted.
    # @param provider [#dimensions, nil] Optional live provider to probe
    #   for dimension when +config.embedding_options[:dimension]+ is absent.
    # @return [ResolvedConfig]
    def self.from_configuration(config, gem_version: nil, provider: nil) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
      opts = config.embedding_options || {}
      declared_dim = opts[:dimension] || opts['dimension']
      dim = declared_dim || (provider.respond_to?(:dimensions) ? provider.dimensions : nil)

      # Optional fields that were never configured are dropped by +compact+;
      # +dimension+ always survives because +nil.to_i+ is 0, not nil.
      provider_hash = {
        class: resolve_provider_class(config.embedding_provider),
        model: (opts[:model] || opts['model'] || config.embedding_model).to_s,
        dimension: dim.to_i,
        host: opts[:host] || opts['host'],
        num_ctx: opts[:num_ctx] || opts['num_ctx'],
        read_timeout: opts[:read_timeout] || opts['read_timeout']
      }.compact

      new(
        schema_version: SCHEMA_VERSION_SUPPORTED,
        gem_version: (gem_version || current_gem_version).to_s,
        created_at: Time.now.utc,
        embedding_provider: provider_hash,
        stores: {
          vector_store: config.vector_store,
          metadata_store: config.metadata_store,
          graph_store: config.graph_store
        }
      )
    end

    # The +embedding_provider+ and +stores+ hashes are stored as deep-frozen
    # copies, so the caller's own hashes are neither retained nor frozen.
    #
    # @param schema_version [Integer]
    # @param gem_version [String]
    # @param created_at [Time]
    # @param embedding_provider [Hash]
    # @param stores [Hash]
    def initialize(schema_version:, gem_version:, created_at:, embedding_provider:, stores:)
      @schema_version = schema_version
      @gem_version = gem_version.to_s.freeze
      @created_at = created_at
      @embedding_provider = deep_freeze(embedding_provider)
      @stores = deep_freeze(stores)
      freeze
    end

    # @return [Integer] Embedding dimension recorded for the provider
    #   (0 when none was recorded)
    def dimension
      embedding_provider[:dimension].to_i
    end

    # Short string identifying this provider configuration, useful for log
    # messages and {ConfigMismatch} error text. Only the last segment of the
    # provider class name is used.
    #
    # @return [String] e.g. "Ollama/nomic-embed-text@http://host.docker.internal:11434"
    def provider_signature
      klass = embedding_provider[:class].to_s.split('::').last
      model = embedding_provider[:model]
      host = embedding_provider[:host]
      host ? "#{klass}/#{model}@#{host}" : "#{klass}/#{model}"
    end

    # Returns +true+ if +other+ uses the same provider class, model, dimension,
    # and store types. Ignores gem version, host, read_timeout, num_ctx, and
    # created_at.
    #
    # @param other [ResolvedConfig]
    # @return [Boolean]
    def matches?(other)
      same_provider?(other) &&
        dimension == other.dimension &&
        stores[:vector_store] == other.stores[:vector_store] &&
        stores[:metadata_store] == other.stores[:metadata_store] &&
        stores[:graph_store] == other.stores[:graph_store]
    end

    # Assert that +stored_config+ (the config captured at embed time) is
    # compatible with +self+ (the live host config). Raises typed errors
    # so the operator can diagnose the mismatch without reading source.
    # Dimensions are checked first, then provider class and model.
    #
    # @param stored_config [ResolvedConfig]
    # @raise [Woods::MCP::DimensionMismatch] if dimensions differ
    # @raise [Woods::MCP::ConfigMismatch] if provider class or model differs
    # @return [void]
    def assert_compatible!(stored_config)
      assert_dimensions_match!(stored_config)
      assert_provider_matches!(stored_config)
    end

    # Serialize to a Hash suitable for +JSON.generate+ and round-trippable
    # through {.from_hash}.
    #
    # A store that was never configured (+nil+) is emitted as +nil+ (JSON
    # +null+) rather than an empty string, so it reads back as +nil+ instead
    # of the bogus Symbol +:""+.
    #
    # @return [Hash]
    def to_snapshot_json
      {
        'schema_version' => schema_version,
        'gem_version' => gem_version,
        'created_at' => created_at.iso8601,
        'embedding_provider' => embedding_provider.transform_keys(&:to_s),
        'stores' => stores.transform_keys(&:to_s).transform_values { |store| store&.to_s }
      }
    end

    # @return [Hash] Same content as {#to_snapshot_json}, frozen at the top level
    def to_h
      to_snapshot_json.freeze
    end

    private

    # Build a frozen copy of +obj+: Hashes and Arrays are rebuilt with every
    # key/value/element recursively frozen, unfrozen Strings are duplicated
    # and frozen. The copy (not the argument) is what gets stored, so a caller
    # can neither mutate the snapshot through a retained reference nor have
    # its own objects frozen behind its back. Anything else is returned as-is.
    #
    # @param obj [Object]
    # @return [Object] frozen copy for Hash/Array/String, +obj+ otherwise
    def deep_freeze(obj)
      case obj
      when Hash
        obj.each_with_object({}) { |(key, value), copy| copy[deep_freeze(key)] = deep_freeze(value) }.freeze
      when Array
        obj.map { |item| deep_freeze(item) }.freeze
      when String
        obj.frozen? ? obj : obj.dup.freeze
      else
        obj
      end
    end

    # @param other [ResolvedConfig]
    # @return [Boolean] whether provider class and model are identical
    def same_provider?(other)
      embedding_provider[:class] == other.embedding_provider[:class] &&
        embedding_provider[:model] == other.embedding_provider[:model]
    end

    # @param stored_config [ResolvedConfig]
    # @raise [Woods::MCP::DimensionMismatch] if dimensions differ
    # @return [void]
    def assert_dimensions_match!(stored_config)
      return if dimension == stored_config.dimension

      raise Woods::MCP::DimensionMismatch.new(
        "Provider dimension #{dimension} does not match stored dimension #{stored_config.dimension}. " \
        'Re-run `rake woods:embed` to rebuild the index.',
        details: {
          expected: stored_config.dimension,
          actual: dimension,
          stored_at: stored_config.created_at.iso8601
        }
      )
    end

    # @param stored_config [ResolvedConfig]
    # @raise [Woods::MCP::ConfigMismatch] if provider class or model differs
    # @return [void]
    def assert_provider_matches!(stored_config)
      return if same_provider?(stored_config)

      raise Woods::MCP::ConfigMismatch.new(
        "Host provider #{provider_signature} does not match stored provider #{stored_config.provider_signature}. " \
        'Re-run `rake woods:embed` or align host configuration.',
        details: {
          host: provider_signature,
          stored: stored_config.provider_signature,
          stored_at: stored_config.created_at.iso8601
        }
      )
    end

    class << self
      private

      # @param version [Integer]
      # @raise [Woods::MCP::UnsupportedArtifact] unless 1 <= version <= supported
      # @return [void]
      def validate_schema_version!(version)
        # Backward-compatibility rule: accept any positive version at or below
        # the supported ceiling, so a dump written by an older gem still loads
        # after SCHEMA_VERSION_SUPPORTED is raised. Mirrors the `<=` check the
        # binary snapshotters (vectors.bin, metadata.msgpack) are said to use.
        # A missing or non-numeric version arrives here as 0 and is rejected.
        return if version.positive? && version <= SCHEMA_VERSION_SUPPORTED

        raise Woods::MCP::UnsupportedArtifact.new(
          "woods.json schema_version #{version} is not supported (supported: #{SCHEMA_VERSION_SUPPORTED})",
          details: { found: version, supported: SCHEMA_VERSION_SUPPORTED }
        )
      end

      # Coerce the provider section; optional fields that are absent are
      # dropped so they do not appear as nil entries.
      #
      # @param raw [Hash]
      # @return [Hash]
      def parse_provider(raw)
        data = normalize_keys(raw)
        {
          class: data[:class].to_s,
          model: data[:model].to_s,
          dimension: data[:dimension].to_i,
          host: data[:host],
          num_ctx: data[:num_ctx],
          read_timeout: data[:read_timeout]
        }.compact
      end

      # @param raw [Hash]
      # @return [Hash{Symbol => Symbol, nil}]
      def parse_stores(raw)
        data = normalize_keys(raw)
        {
          vector_store: parse_store_type(data[:vector_store]),
          metadata_store: parse_store_type(data[:metadata_store]),
          graph_store: parse_store_type(data[:graph_store])
        }
      end

      # Missing stores and the empty strings written by older snapshots
      # (which serialized nil as "") both read back as nil.
      #
      # @param value [String, Symbol, nil]
      # @return [Symbol, nil]
      def parse_store_type(value)
        name = value.to_s
        name.empty? ? nil : name.to_sym
      end

      # @param value [String, nil] timestamp string; nil falls back to "now" in UTC
      # @return [Time]
      def parse_time(value)
        value ? Time.parse(value.to_s) : Time.now.utc
      end

      # @param hash [Hash]
      # @return [Hash] copy with top-level keys symbolized
      def normalize_keys(hash)
        hash.transform_keys(&:to_sym)
      end

      # Map the configured provider (Symbol shorthand, class name String,
      # Class, or nil) to the class-name String stored in the snapshot.
      #
      # @param provider [Symbol, String, Class, nil]
      # @return [String]
      def resolve_provider_class(provider)
        case provider
        when :openai then 'Woods::Embedding::Provider::OpenAI'
        when :ollama then 'Woods::Embedding::Provider::Ollama'
        when String then provider
        when Class then provider.name
        when nil then ''
        else provider.to_s
        end
      end

      # Lazily load the gem version so callers that pass +gem_version:+
      # explicitly never touch the version file.
      #
      # @return [String]
      def current_gem_version
        require_relative 'version'
        Woods::VERSION
      end
    end
  end
end
|
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative 'search_executor'
|
|
4
|
+
require_relative '../token_utils'
|
|
5
|
+
|
|
3
6
|
module Woods
|
|
4
7
|
module Retrieval
|
|
5
8
|
# Transforms ranked search candidates into a token-budgeted context string
|
|
@@ -34,13 +37,48 @@ module Woods
|
|
|
34
37
|
# Minimum token count for a section to be worth including.
|
|
35
38
|
MIN_USEFUL_TOKENS = 200
|
|
36
39
|
|
|
40
|
+
# Default chars-per-token ratio. Delegates to {Woods::TokenUtils} —
|
|
41
|
+
# the single source of truth — which uses 4.0 (OpenAI / tiktoken
|
|
42
|
+
# cl100k_base average for Ruby source; see docs/TOKEN_BENCHMARK.md).
|
|
43
|
+
# Callers embedding with BERT/WordPiece tokenizers (nomic-embed-text,
|
|
44
|
+
# bge-*) should pass the tighter ratio from their TextPreparer
|
|
45
|
+
# (~1.5–2.5) so truncation stays honest for that provider — or use
|
|
46
|
+
# {TokenUtils.chars_per_token_for(:ollama)} for the shipped default.
|
|
47
|
+
DEFAULT_CHARS_PER_TOKEN = TokenUtils::DEFAULT_CHARS_PER_TOKEN
|
|
48
|
+
|
|
37
49
|
# @param metadata_store [#find] Store that resolves identifiers to unit data
|
|
38
50
|
# @param budget [Integer] Total token budget
|
|
39
|
-
|
|
51
|
+
# @param chars_per_token [Float] Tokenizer-calibrated char/token ratio used
|
|
52
|
+
# for truncation sizing. Match this to the embedding provider in use —
|
|
53
|
+
# {Woods::Embedding::TextPreparer#chars_per_token} exposes the live
|
|
54
|
+
# value from the indexing-time preparer.
|
|
55
|
+
# @param token_counter [#count, nil] Optional exact tokenizer (typically
|
|
56
|
+
# {Woods::Embedding::TokenCounter}). When provided, token estimation
|
|
57
|
+
# uses the model's real WordPiece/BPE output instead of the
|
|
58
|
+
# `chars / chars_per_token` heuristic, which matters most for the
|
|
59
|
+
# Ollama path (ratios vary widely across Rails source, 1.5–2.5).
|
|
60
|
+
# The heuristic remains the fallback when the counter is nil or the
|
|
61
|
+
# tokenizer gem isn't installed.
|
|
62
|
+
def initialize(metadata_store:, budget: DEFAULT_BUDGET,
chars_per_token: DEFAULT_CHARS_PER_TOKEN,
token_counter: nil)
# Store used to resolve candidate identifiers to unit data.
@metadata_store = metadata_store
# Total token budget for the assembled context.
@budget = budget
# Guard against 0 / negative / NaN / infinite ratios. Zero, negative and
# NaN would make `estimate_tokens` divide by zero or produce a negative
# budget; an infinite ratio would make every estimate 0 and make the
# truncation target un-convertible to an Integer. Fall back to the
# default ratio rather than propagate the bogus input.
ratio = chars_per_token.to_f
@chars_per_token = ratio.positive? && ratio.finite? ? ratio : DEFAULT_CHARS_PER_TOKEN
# Optional exact tokenizer; nil means the chars-per-token heuristic is used.
@token_counter = token_counter
end
|
|
43
75
|
|
|
76
|
+
# @return [Float] the configured chars-per-token ratio
|
|
77
|
+
attr_reader :chars_per_token
|
|
78
|
+
|
|
79
|
+
# @return [#count, nil] the exact tokenizer, if one was injected
|
|
80
|
+
attr_reader :token_counter
|
|
81
|
+
|
|
44
82
|
# Assemble context from ranked candidates within token budget.
|
|
45
83
|
#
|
|
46
84
|
# @param candidates [Array<Candidate>] Ranked search candidates
|
|
@@ -54,6 +92,13 @@ module Woods
|
|
|
54
92
|
sources = []
|
|
55
93
|
tokens_used = 0
|
|
56
94
|
|
|
95
|
+
# Collapse +User#chunk_0+, +User#chunk_1+, … back to their base unit
|
|
96
|
+
# BEFORE metadata lookup and section assembly. Chunk IDs are an
|
|
97
|
+
# embedding-side concern — the metadata store is keyed by the base
|
|
98
|
+
# identifier, and callers don't want the same unit formatted twice
|
|
99
|
+
# just because multiple chunks matched the query.
|
|
100
|
+
candidates = collapse_chunk_candidates(candidates)
|
|
101
|
+
|
|
57
102
|
# Pre-fetch all candidate metadata in one batch query
|
|
58
103
|
@unit_cache = @metadata_store.find_batch(candidates.map(&:identifier))
|
|
59
104
|
|
|
@@ -78,6 +123,46 @@ module Woods
|
|
|
78
123
|
|
|
79
124
|
private
|
|
80
125
|
|
|
126
|
+
# Suffix the Indexer appends when a single unit is split into multiple
|
|
127
|
+
# embedding vectors (rails_source and other large units). Separator
|
|
128
|
+
# is +#+ so it can never collide with a Ruby constant (+::+) or a
|
|
129
|
+
# method ref (+#instance_method+) in an identifier.
|
|
130
|
+
CHUNK_SUFFIX_PATTERN = /#chunk_\d+\z/
|
|
131
|
+
private_constant :CHUNK_SUFFIX_PATTERN
|
|
132
|
+
|
|
133
|
+
# Strip the +#chunk_N+ suffix from an identifier, if present.
|
|
134
|
+
# +User#chunk_3+ → +User+; +User+ stays +User+.
|
|
135
|
+
def base_identifier(identifier)
# CHUNK_SUFFIX_PATTERN is anchored at end-of-string, so whatever it matches
# is by construction a suffix; an identifier without one drops "" (nothing).
chunk_suffix = identifier[CHUNK_SUFFIX_PATTERN].to_s
identifier.delete_suffix(chunk_suffix)
end
|
|
138
|
+
|
|
139
|
+
# Rewrite every candidate to point at its base identifier and keep only
|
|
140
|
+
# the highest-scoring candidate per base unit. Preserves original score
|
|
141
|
+
# ordering on the output so downstream +sort_by(-score)+ gets the same
|
|
142
|
+
# input it would on an unchunked corpus.
|
|
143
|
+
def collapse_chunk_candidates(candidates)
# Keyed by base identifier. Hash insertion order means each unit keeps the
# position of its first-seen candidate, even if a later chunk replaces it.
winners = candidates.each_with_object({}) do |candidate, by_unit|
unit_id = base_identifier(candidate.identifier)
contender = candidate.identifier == unit_id ? candidate : rewrite_identifier(candidate, unit_id)
incumbent = by_unit[unit_id]
# Strictly-greater comparison: on a score tie the earlier candidate stays.
by_unit[unit_id] = contender if incumbent.nil? || contender.score > incumbent.score
end
winners.values
end
|
|
152
|
+
|
|
153
|
+
# Return a clone of +candidate+ with its identifier replaced. Kept as
|
|
154
|
+
# its own method so the Candidate struct shape is referenced in exactly
|
|
155
|
+
# one place — if SearchExecutor::Candidate grows fields, this is the
|
|
156
|
+
# only spot to update.
|
|
157
|
+
def rewrite_identifier(candidate, new_identifier)
# Everything except the identifier is carried over unchanged.
carried_over = {
score: candidate.score,
source: candidate.source,
metadata: candidate.metadata
}
SearchExecutor::Candidate.new(identifier: new_identifier, **carried_over)
end
|
|
165
|
+
|
|
81
166
|
# Add structural context section if provided.
|
|
82
167
|
#
|
|
83
168
|
# @return [Integer] Updated tokens_used count
|
|
@@ -224,17 +309,39 @@ module Woods
|
|
|
224
309
|
def truncate_to_budget(text, token_budget)
# Already fits: return the text untouched (no marker appended).
return text if estimate_tokens(text) <= token_budget

# Target-char sizing uses the effective ratio: the provider's live
# ratio when we have an exact counter, otherwise @chars_per_token.
# 10 % safety margin keeps us below the budget after the imprecise
# tokenizer runs again on the truncated output.
#
# Clamp at zero: a negative budget would otherwise yield a negative
# slice end, and `text[0...-k]` means "everything except the last k
# characters" — i.e. nearly the whole text instead of none of it.
target_chars = [(token_budget * effective_chars_per_token * 0.9).to_i, 0].max
"#{text[0...target_chars]}\n... [truncated]"
end
|
|
231
319
|
|
|
232
|
-
# Estimate token count
|
|
320
|
+
# Estimate token count. Prefers the injected {TokenCounter} — which
|
|
321
|
+
# loads the provider's real tokenizer and returns exact counts — and
|
|
322
|
+
# falls back to the configured chars-per-token ratio when no counter
|
|
323
|
+
# is wired.
|
|
233
324
|
#
|
|
234
325
|
# @param text [String]
|
|
235
326
|
# @return [Integer]
|
|
236
327
|
def estimate_tokens(text)
# Nothing to count for nil or empty text, with or without a counter.
nothing_to_count = text.nil? || text.empty?
return 0 if nothing_to_count

counter = @token_counter
# No counter wired: fall back to the chars-per-token heuristic, rounding
# up so a partial token still counts as one.
return (text.length / @chars_per_token).ceil unless counter

counter.count(text)
end
|
|
333
|
+
|
|
334
|
+
# Effective chars-per-token for chunk-size sizing. When an exact
|
|
335
|
+
# counter is present, prefer its native ratio (e.g. 1.2 for
|
|
336
|
+
# nomic-embed-text) so truncation and estimation agree. Falls back
|
|
337
|
+
# to the configured ratio if the counter reports 0 or a non-positive
|
|
338
|
+
# value (which would make truncation target zero chars).
|
|
339
|
+
def effective_chars_per_token
counter = @token_counter
# No counter, or one that does not expose a ratio: use the configured one.
return @chars_per_token unless counter.respond_to?(:chars_per_token)
return @chars_per_token unless counter.chars_per_token

# A counter-reported ratio only wins when it is strictly positive;
# zero or negative would make the truncation target zero chars.
native_ratio = counter.chars_per_token.to_f
native_ratio.positive? ? native_ratio : @chars_per_token
end
|
|
239
346
|
|
|
240
347
|
# Build the final AssembledContext result.
|
|
@@ -31,7 +31,7 @@ module Woods
|
|
|
31
31
|
implement: /\b(implement|add|create|build|write|make|generate)\b/i,
|
|
32
32
|
compare: /\b(compare|difference|vs|versus|between|contrast)\b/i,
|
|
33
33
|
# rubocop:disable Layout/LineLength
|
|
34
|
-
framework: /\b(how does rails|what does rails|rails .+ work|work.+\brails\b|in rails\b|activerecord|actioncontroller|activejob)\b/i,
|
|
34
|
+
framework: /\b(how does rails|what does rails|rails .+ work|work.+\brails\b|in rails\b|activerecord|actioncontroller|activejob|actionmailer|actioncable|actiontext|activestorage|solid_queue|solid_cache|solid_cable|kamal|propshaft|importmap|hotwire|turbo|stimulus|zeitwerk)\b/i,
|
|
35
35
|
# rubocop:enable Layout/LineLength
|
|
36
36
|
reference: /\b(show me|what is|what are|list|options for|api|interface|signature)\b/i,
|
|
37
37
|
understand: /\b(how|why|explain|understand|what happens|describe|overview)\b/i
|
|
@@ -35,8 +35,11 @@ module Woods
|
|
|
35
35
|
RRF_K = 60
|
|
36
36
|
|
|
37
37
|
# @param metadata_store [#find] Store that resolves identifiers to unit metadata
|
|
38
|
-
|
|
38
|
+
# @param graph_store [#pagerank, nil] Optional graph store exposing PageRank scores.
|
|
39
|
+
# When present, PageRank rank-percentile replaces the bucketed importance signal.
|
|
40
|
+
def initialize(metadata_store:, graph_store: nil)
# graph_store is optional; it stays nil when no graph store is supplied.
@metadata_store, @graph_store = metadata_store, graph_store
end
|
|
41
44
|
|
|
42
45
|
# Rank candidates by weighted signal scoring with diversity adjustment.
|
|
@@ -89,8 +92,9 @@ module Woods
|
|
|
89
92
|
|
|
90
93
|
candidates.group_by(&:source).each_value do |source_candidates|
|
|
91
94
|
ranked = source_candidates.sort_by { |c| -c.score }
|
|
92
|
-
ranked.each_with_index do |candidate,
|
|
93
|
-
|
|
95
|
+
ranked.each_with_index do |candidate, idx|
|
|
96
|
+
# RRF is 1-based (Cormack et al., 2009): top-ranked doc uses rank 1, not 0.
|
|
97
|
+
rrf_scores[candidate.identifier] += 1.0 / (RRF_K + idx + 1)
|
|
94
98
|
metadata_map[candidate.identifier] ||= candidate.metadata
|
|
95
99
|
end
|
|
96
100
|
end
|
|
@@ -102,7 +106,14 @@ module Woods
|
|
|
102
106
|
#
|
|
103
107
|
# @return [Array<Candidate>]
|
|
104
108
|
def rebuild_rrf_candidates(candidates, rrf_scores, metadata_map)
|
|
105
|
-
|
|
109
|
+
# Plain-Ruby `index_by` substitute — the ActiveSupport version
|
|
110
|
+
# isn't loaded when the gem runs outside a Rails boot. Preserve
|
|
111
|
+
# last-wins semantics to match ActiveSupport's `Enumerable#index_by`
|
|
112
|
+
# so the merged candidate's `source` continues to reflect the
|
|
113
|
+
# final source a given identifier appeared in (relevant when
|
|
114
|
+
# observability/debug tools read `.source` on an RRF result).
|
|
115
|
+
original_by_id = {}
|
|
116
|
+
candidates.each { |c| original_by_id[c.identifier] = c }
|
|
106
117
|
rrf_scores.sort_by { |_id, score| -score }.map do |identifier, score|
|
|
107
118
|
original = original_by_id[identifier]
|
|
108
119
|
build_candidate(
|
|
@@ -133,7 +144,7 @@ module Woods
|
|
|
133
144
|
semantic: candidate.score.to_f,
|
|
134
145
|
keyword: keyword_score(candidate),
|
|
135
146
|
recency: recency_score(unit),
|
|
136
|
-
importance: importance_score(unit),
|
|
147
|
+
importance: importance_score(unit, candidate.identifier),
|
|
137
148
|
type_match: type_match_score(unit, classification),
|
|
138
149
|
diversity: 1.0 # Adjusted after initial sort
|
|
139
150
|
}
|
|
@@ -184,9 +195,18 @@ module Woods
|
|
|
184
195
|
|
|
185
196
|
# Importance score based on PageRank / structural importance.
|
|
186
197
|
#
|
|
198
|
+
# Prefers live PageRank from the graph store (rank-percentile 0.0–1.0) when
|
|
199
|
+
# available. Falls back to bucketed importance metadata (`:high`/`:medium`/`:low`)
|
|
200
|
+
# when there is no graph store, the PageRank map is empty, or the identifier
|
|
201
|
+
# is not yet indexed (e.g., a new unit since the last extraction).
|
|
202
|
+
#
|
|
187
203
|
# @param unit [Hash, nil] Unit metadata from store
|
|
204
|
+
# @param identifier [String] Candidate identifier (matched against PageRank keys)
|
|
188
205
|
# @return [Float] 0.0 to 1.0
|
|
189
|
-
def importance_score(unit)
|
|
206
|
+
def importance_score(unit, identifier)
|
|
207
|
+
pagerank = pagerank_importance_map[identifier]
|
|
208
|
+
return pagerank if pagerank
|
|
209
|
+
|
|
190
210
|
return 0.5 unless unit
|
|
191
211
|
|
|
192
212
|
importance = dig_metadata(unit, :importance)
|
|
@@ -198,6 +218,35 @@ module Woods
|
|
|
198
218
|
end
|
|
199
219
|
end
|
|
200
220
|
|
|
221
|
+
# Lazily-computed rank-percentile map derived from the graph store's PageRank.
|
|
222
|
+
#
|
|
223
|
+
# Top-ranked identifier gets 1.0, bottom-ranked gets 1/n. Identifiers absent
|
|
224
|
+
# from PageRank (new units, ephemeral candidates) return nil and fall back
|
|
225
|
+
# to the bucketed importance signal.
|
|
226
|
+
#
|
|
227
|
+
# @return [Hash{String => Float}]
|
|
228
|
+
def pagerank_importance_map
# Computed on first use and reused afterwards; an empty map is still a
# truthy Hash, so it is cached like any other result.
@pagerank_importance_map || (@pagerank_importance_map = compute_pagerank_importance_map)
end
|
|
231
|
+
|
|
232
|
+
# Compute rank-percentile scores from the graph store's PageRank hash.
|
|
233
|
+
#
|
|
234
|
+
# @return [Hash{String => Float}] Empty hash when no graph store or no scores.
|
|
235
|
+
def compute_pagerank_importance_map
|
|
236
|
+
return {} unless @graph_store.respond_to?(:pagerank)
|
|
237
|
+
|
|
238
|
+
scores = @graph_store.pagerank
|
|
239
|
+
return {} if scores.nil? || scores.empty?
|
|
240
|
+
|
|
241
|
+
ranked = scores.sort_by { |_id, score| -score }
|
|
242
|
+
total = ranked.size.to_f
|
|
243
|
+
ranked.each_with_index.to_h do |(identifier, _score), rank|
|
|
244
|
+
[identifier, 1.0 - (rank / total)]
|
|
245
|
+
end
|
|
246
|
+
rescue StandardError
|
|
247
|
+
{}
|
|
248
|
+
end
|
|
249
|
+
|
|
201
250
|
# Type match score — bonus when result type matches query target_type.
|
|
202
251
|
#
|
|
203
252
|
# @param unit [Hash, nil] Unit metadata from store
|
|
@@ -56,10 +56,27 @@ module Woods
|
|
|
56
56
|
# @param query [String] The original query text
|
|
57
57
|
# @param classification [QueryClassifier::Classification] Classified query
|
|
58
58
|
# @param limit [Integer] Maximum candidates to return
|
|
59
|
+
# @param type_filter [Array<String>, nil] When set, vector and hybrid
|
|
60
|
+
# strategies push this down into the vector store's metadata
|
|
61
|
+
# filter — used by {Retriever#retrieve} to rank-within-type when
|
|
62
|
+
# the unfiltered global top-K had no candidate of the requested type.
|
|
63
|
+
# Overrides the classifier-derived +target_type+ in filter construction.
|
|
64
|
+
# @param strategy [Symbol, nil] Override the classifier-selected strategy.
|
|
65
|
+
# {Retriever#within_type_fallback} passes +:vector+ here because the
|
|
66
|
+
# vector path is the only one that honors +type_filter+; if the
|
|
67
|
+
# classifier picked +:keyword+ / +:graph+ / +:direct+ the fallback
|
|
68
|
+
# would otherwise silently re-run the same strategy, get filtered to
|
|
69
|
+
# empty, and violate the "never empty when units exist" contract.
|
|
59
70
|
# @return [ExecutionResult] Candidates with strategy metadata
|
|
60
|
-
def execute(query:, classification:, limit: 20)
|
|
61
|
-
strategy
|
|
62
|
-
candidates = run_strategy(
|
|
71
|
+
def execute(query:, classification:, limit: 20, type_filter: nil, strategy: nil)
|
|
72
|
+
strategy ||= select_strategy(classification)
|
|
73
|
+
candidates = run_strategy(
|
|
74
|
+
strategy,
|
|
75
|
+
query: query,
|
|
76
|
+
classification: classification,
|
|
77
|
+
limit: limit,
|
|
78
|
+
type_filter: type_filter
|
|
79
|
+
)
|
|
63
80
|
|
|
64
81
|
ExecutionResult.new(
|
|
65
82
|
candidates: candidates.first(limit),
|
|
@@ -104,17 +121,18 @@ module Woods
|
|
|
104
121
|
# @param query [String] Original query text
|
|
105
122
|
# @param classification [QueryClassifier::Classification]
|
|
106
123
|
# @param limit [Integer] Max results
|
|
124
|
+
# @param type_filter [Array<String>, nil] Pushed into vector filters
|
|
107
125
|
# @return [Array<Candidate>]
|
|
108
|
-
def run_strategy(strategy, query:, classification:, limit:)
|
|
126
|
+
def run_strategy(strategy, query:, classification:, limit:, type_filter: nil)
|
|
109
127
|
case strategy
|
|
110
128
|
when :vector
|
|
111
|
-
execute_vector(query, classification: classification, limit: limit)
|
|
129
|
+
execute_vector(query, classification: classification, limit: limit, type_filter: type_filter)
|
|
112
130
|
when :keyword
|
|
113
131
|
execute_keyword(classification: classification, limit: limit)
|
|
114
132
|
when :graph
|
|
115
133
|
execute_graph(classification: classification, limit: limit)
|
|
116
134
|
when :hybrid
|
|
117
|
-
execute_hybrid(query, classification: classification, limit: limit)
|
|
135
|
+
execute_hybrid(query, classification: classification, limit: limit, type_filter: type_filter)
|
|
118
136
|
when :direct
|
|
119
137
|
execute_direct(classification: classification, limit: limit)
|
|
120
138
|
end
|
|
@@ -123,9 +141,9 @@ module Woods
|
|
|
123
141
|
# Vector strategy: embed the query and search by similarity.
|
|
124
142
|
#
|
|
125
143
|
# @return [Array<Candidate>]
|
|
126
|
-
def execute_vector(query, classification:, limit:)
|
|
144
|
+
def execute_vector(query, classification:, limit:, type_filter: nil)
|
|
127
145
|
query_vector = @embedding_provider.embed(query)
|
|
128
|
-
filters = build_vector_filters(classification)
|
|
146
|
+
filters = build_vector_filters(classification, type_filter: type_filter)
|
|
129
147
|
|
|
130
148
|
results = @vector_store.search(query_vector, limit: limit, filters: filters)
|
|
131
149
|
results.map do |r|
|
|
@@ -209,9 +227,10 @@ module Woods
|
|
|
209
227
|
# Hybrid strategy: combine vector, keyword, and graph expansion.
|
|
210
228
|
#
|
|
211
229
|
# @return [Array<Candidate>]
|
|
212
|
-
def execute_hybrid(query, classification:, limit:)
|
|
230
|
+
def execute_hybrid(query, classification:, limit:, type_filter: nil)
|
|
213
231
|
# Gather from all three sources
|
|
214
|
-
vector_candidates = execute_vector(query, classification: classification, limit: limit
|
|
232
|
+
vector_candidates = execute_vector(query, classification: classification, limit: limit,
|
|
233
|
+
type_filter: type_filter)
|
|
215
234
|
keyword_candidates = execute_keyword(classification: classification, limit: limit)
|
|
216
235
|
|
|
217
236
|
# Graph expansion on top vector results
|
|
@@ -266,13 +285,23 @@ module Woods
|
|
|
266
285
|
candidates
|
|
267
286
|
end
|
|
268
287
|
|
|
269
|
-
# Build metadata filters for vector search based on classification
|
|
288
|
+
# Build metadata filters for vector search based on classification
|
|
289
|
+
# and an optional explicit type filter from the caller.
|
|
290
|
+
#
|
|
291
|
+
# The caller's explicit +type_filter+ overrides classifier-derived
|
|
292
|
+
# +target_type+ when both are present — the caller opted into a
|
|
293
|
+
# specific set of types and that intent beats a heuristic.
|
|
270
294
|
#
|
|
271
295
|
# @param classification [QueryClassifier::Classification]
|
|
296
|
+
# @param type_filter [Array<String>, nil]
|
|
272
297
|
# @return [Hash]
|
|
273
|
-
def build_vector_filters(classification)
|
|
298
|
+
def build_vector_filters(classification, type_filter: nil)
|
|
274
299
|
filters = {}
|
|
275
|
-
|
|
300
|
+
if type_filter && !type_filter.empty?
|
|
301
|
+
filters[:type] = type_filter.map(&:to_s)
|
|
302
|
+
elsif classification.target_type
|
|
303
|
+
filters[:type] = classification.target_type.to_s
|
|
304
|
+
end
|
|
276
305
|
filters
|
|
277
306
|
end
|
|
278
307
|
|