woods 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +169 -0
- data/README.md +20 -8
- data/exe/woods-console +51 -6
- data/exe/woods-console-mcp +24 -4
- data/exe/woods-mcp +30 -7
- data/exe/woods-mcp-http +47 -6
- data/lib/generators/woods/install_generator.rb +13 -4
- data/lib/generators/woods/templates/woods.rb.tt +155 -0
- data/lib/tasks/woods.rake +15 -50
- data/lib/woods/builder.rb +174 -9
- data/lib/woods/cache/cache_middleware.rb +360 -31
- data/lib/woods/chunking/semantic_chunker.rb +334 -7
- data/lib/woods/console/adapters/job_adapter.rb +10 -4
- data/lib/woods/console/audit_logger.rb +76 -4
- data/lib/woods/console/bridge.rb +48 -15
- data/lib/woods/console/bridge_protocol.rb +44 -0
- data/lib/woods/console/confirmation.rb +3 -4
- data/lib/woods/console/console_response_renderer.rb +56 -18
- data/lib/woods/console/credential_index.rb +201 -0
- data/lib/woods/console/credential_scanner.rb +302 -0
- data/lib/woods/console/dispatch_pipeline.rb +138 -0
- data/lib/woods/console/embedded_executor.rb +682 -35
- data/lib/woods/console/eval_guard.rb +319 -0
- data/lib/woods/console/model_validator.rb +1 -3
- data/lib/woods/console/rack_middleware.rb +185 -29
- data/lib/woods/console/redactor.rb +161 -0
- data/lib/woods/console/response_context.rb +127 -0
- data/lib/woods/console/safe_context.rb +220 -23
- data/lib/woods/console/scope_predicate_parser.rb +131 -0
- data/lib/woods/console/server.rb +417 -486
- data/lib/woods/console/sql_noise_stripper.rb +87 -0
- data/lib/woods/console/sql_table_scanner.rb +213 -0
- data/lib/woods/console/sql_validator.rb +81 -31
- data/lib/woods/console/table_gate.rb +93 -0
- data/lib/woods/console/tool_specs.rb +552 -0
- data/lib/woods/console/tools/tier1.rb +3 -3
- data/lib/woods/console/tools/tier4.rb +7 -1
- data/lib/woods/dependency_graph.rb +66 -7
- data/lib/woods/embedding/indexer.rb +190 -6
- data/lib/woods/embedding/openai.rb +40 -4
- data/lib/woods/embedding/provider.rb +104 -8
- data/lib/woods/embedding/text_preparer.rb +23 -3
- data/lib/woods/embedding/token_counter.rb +133 -0
- data/lib/woods/evaluation/baseline_runner.rb +20 -2
- data/lib/woods/evaluation/metrics.rb +4 -1
- data/lib/woods/extracted_unit.rb +1 -0
- data/lib/woods/extractor.rb +7 -1
- data/lib/woods/extractors/controller_extractor.rb +6 -0
- data/lib/woods/extractors/mailer_extractor.rb +16 -2
- data/lib/woods/extractors/model_extractor.rb +6 -1
- data/lib/woods/extractors/phlex_extractor.rb +13 -4
- data/lib/woods/extractors/rails_source_extractor.rb +2 -0
- data/lib/woods/extractors/route_helper_resolver.rb +130 -0
- data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
- data/lib/woods/extractors/view_component_extractor.rb +12 -1
- data/lib/woods/extractors/view_engines/base.rb +141 -0
- data/lib/woods/extractors/view_engines/erb.rb +145 -0
- data/lib/woods/extractors/view_template_extractor.rb +92 -133
- data/lib/woods/flow_assembler.rb +23 -15
- data/lib/woods/flow_precomputer.rb +21 -2
- data/lib/woods/graph_analyzer.rb +3 -4
- data/lib/woods/index_artifact.rb +173 -0
- data/lib/woods/mcp/bearer_auth.rb +45 -0
- data/lib/woods/mcp/bootstrap_state.rb +94 -0
- data/lib/woods/mcp/bootstrapper.rb +337 -16
- data/lib/woods/mcp/config_resolver.rb +288 -0
- data/lib/woods/mcp/errors.rb +134 -0
- data/lib/woods/mcp/index_reader.rb +265 -30
- data/lib/woods/mcp/origin_guard.rb +132 -0
- data/lib/woods/mcp/provider_probe.rb +166 -0
- data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
- data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
- data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
- data/lib/woods/mcp/server.rb +737 -137
- data/lib/woods/model_name_cache.rb +78 -2
- data/lib/woods/notion/client.rb +25 -2
- data/lib/woods/notion/mappers/model_mapper.rb +36 -2
- data/lib/woods/railtie.rb +55 -15
- data/lib/woods/resilience/circuit_breaker.rb +9 -2
- data/lib/woods/resilience/retryable_provider.rb +40 -3
- data/lib/woods/resolved_config.rb +299 -0
- data/lib/woods/retrieval/context_assembler.rb +112 -5
- data/lib/woods/retrieval/query_classifier.rb +1 -1
- data/lib/woods/retrieval/ranker.rb +55 -6
- data/lib/woods/retrieval/search_executor.rb +42 -13
- data/lib/woods/retriever.rb +330 -24
- data/lib/woods/session_tracer/middleware.rb +35 -1
- data/lib/woods/storage/graph_store.rb +39 -0
- data/lib/woods/storage/inapplicable_backend.rb +14 -0
- data/lib/woods/storage/metadata_store.rb +129 -1
- data/lib/woods/storage/pgvector.rb +70 -8
- data/lib/woods/storage/qdrant.rb +196 -5
- data/lib/woods/storage/snapshotter/metadata.rb +172 -0
- data/lib/woods/storage/snapshotter/vector.rb +238 -0
- data/lib/woods/storage/snapshotter.rb +24 -0
- data/lib/woods/storage/vector_store.rb +184 -35
- data/lib/woods/tasks.rb +85 -0
- data/lib/woods/temporal/snapshot_store.rb +49 -1
- data/lib/woods/token_utils.rb +44 -5
- data/lib/woods/unblocked/client.rb +1 -1
- data/lib/woods/unblocked/document_builder.rb +35 -10
- data/lib/woods/unblocked/exporter.rb +1 -1
- data/lib/woods/util/host_guard.rb +61 -0
- data/lib/woods/version.rb +1 -1
- data/lib/woods.rb +126 -6
- metadata +69 -4
|
@@ -23,8 +23,9 @@ module Woods
|
|
|
23
23
|
class DependencyGraph
|
|
24
24
|
def initialize
|
|
25
25
|
@nodes = {} # identifier => { type:, file_path: }
|
|
26
|
-
@edges = {} # identifier => [
|
|
26
|
+
@edges = {} # identifier => [{ target:, via: }]
|
|
27
27
|
@reverse = {} # identifier => Set of dependent identifiers
|
|
28
|
+
@reverse_via = {} # [target, via] => Set of dependent identifiers
|
|
28
29
|
@file_map = {} # file_path => identifier
|
|
29
30
|
@type_index = {} # type => Set of identifiers
|
|
30
31
|
@to_h = nil
|
|
@@ -42,7 +43,7 @@ module Woods
|
|
|
42
43
|
namespace: unit.namespace
|
|
43
44
|
}
|
|
44
45
|
|
|
45
|
-
@edges[unit.identifier] = unit.dependencies.map { |d| d[:target] }
|
|
46
|
+
@edges[unit.identifier] = unit.dependencies.map { |d| { target: d[:target], via: d[:via] } }
|
|
46
47
|
@file_map[unit.file_path] = unit.identifier if unit.file_path
|
|
47
48
|
|
|
48
49
|
# Type index for filtering (Set-based for O(1) insert)
|
|
@@ -51,6 +52,7 @@ module Woods
|
|
|
51
52
|
# Build reverse edges (Set-based for O(1) insert)
|
|
52
53
|
unit.dependencies.each do |dep|
|
|
53
54
|
(@reverse[dep[:target]] ||= Set.new).add(unit.identifier)
|
|
55
|
+
(@reverse_via[[dep[:target], dep[:via]]] ||= Set.new).add(unit.identifier)
|
|
54
56
|
end
|
|
55
57
|
end
|
|
56
58
|
|
|
@@ -107,17 +109,28 @@ module Woods
|
|
|
107
109
|
# Get direct dependencies of a unit
|
|
108
110
|
#
|
|
109
111
|
# @param identifier [String] Unit identifier
|
|
112
|
+
# @param via [Symbol, Array<Symbol>, nil] Filter by relationship type(s)
|
|
110
113
|
# @return [Array<String>] List of dependency identifiers
|
|
111
|
-
def dependencies_of(identifier)
|
|
112
|
-
@edges[identifier] || []
|
|
114
|
+
def dependencies_of(identifier, via: nil)
|
|
115
|
+
edges = @edges[identifier] || []
|
|
116
|
+
if via
|
|
117
|
+
via_set = Array(via)
|
|
118
|
+
edges = edges.select { |e| via_set.include?(e[:via]) }
|
|
119
|
+
end
|
|
120
|
+
edges.map { |e| e[:target] }
|
|
113
121
|
end
|
|
114
122
|
|
|
115
123
|
# Get direct dependents of a unit (what depends on it)
|
|
116
124
|
#
|
|
117
125
|
# @param identifier [String] Unit identifier
|
|
126
|
+
# @param via [Symbol, Array<Symbol>, nil] Filter by relationship type(s)
|
|
118
127
|
# @return [Array<String>] List of dependent identifiers
|
|
119
|
-
def dependents_of(identifier)
|
|
120
|
-
@reverse.fetch(identifier, Set.new).to_a
|
|
128
|
+
def dependents_of(identifier, via: nil)
|
|
129
|
+
return @reverse.fetch(identifier, Set.new).to_a unless via
|
|
130
|
+
|
|
131
|
+
Array(via).each_with_object(Set.new) do |v, result|
|
|
132
|
+
@reverse_via.fetch([identifier, v], Set.new).each { |dep| result.add(dep) }
|
|
133
|
+
end.to_a
|
|
121
134
|
end
|
|
122
135
|
|
|
123
136
|
# Get all units of a specific type
|
|
@@ -204,7 +217,8 @@ module Woods
|
|
|
204
217
|
raw_nodes = data[:nodes] || data['nodes'] || {}
|
|
205
218
|
graph.instance_variable_set(:@nodes, raw_nodes.transform_values { |v| symbolize_node(v) })
|
|
206
219
|
|
|
207
|
-
|
|
220
|
+
raw_edges = data[:edges] || data['edges'] || {}
|
|
221
|
+
graph.instance_variable_set(:@edges, raw_edges.transform_values { |edges| normalize_edges(edges) })
|
|
208
222
|
|
|
209
223
|
raw_reverse = data[:reverse] || data['reverse'] || {}
|
|
210
224
|
graph.instance_variable_set(:@reverse, raw_reverse.transform_values { |v| v.is_a?(Set) ? v : Set.new(v) })
|
|
@@ -216,6 +230,15 @@ module Woods
|
|
|
216
230
|
v.is_a?(Set) ? v : Set.new(v)
|
|
217
231
|
end)
|
|
218
232
|
|
|
233
|
+
# Rebuild reverse_via index from edges
|
|
234
|
+
reverse_via = {}
|
|
235
|
+
graph.instance_variable_get(:@edges).each do |source_id, edges|
|
|
236
|
+
edges.each do |edge|
|
|
237
|
+
(reverse_via[[edge[:target], edge[:via]]] ||= Set.new).add(source_id)
|
|
238
|
+
end
|
|
239
|
+
end
|
|
240
|
+
graph.instance_variable_set(:@reverse_via, reverse_via)
|
|
241
|
+
|
|
219
242
|
graph
|
|
220
243
|
end
|
|
221
244
|
|
|
@@ -232,5 +255,41 @@ module Woods
|
|
|
232
255
|
namespace: node[:namespace] || node['namespace']
|
|
233
256
|
}
|
|
234
257
|
end
|
|
258
|
+
|
|
259
|
+
# Normalize edge data from either old format (bare strings) or new format (hashes).
|
|
260
|
+
#
|
|
261
|
+
# ROUND-TRIP INVARIANT (do not break when refactoring):
|
|
262
|
+
# DependencyGraph#to_h -> JSON.generate -> JSON.parse -> DependencyGraph.from_h
|
|
263
|
+
# must always yield the same in-memory shape. The two normalizers that
|
|
264
|
+
# sit at either end of this round trip are INTENTIONALLY SEPARATE — do
|
|
265
|
+
# not merge them:
|
|
266
|
+
#
|
|
267
|
+
# - This method ({.normalize_edges}) runs on Ruby objects. It produces
|
|
268
|
+
# `{ target:, via: }` with SYMBOL keys because consumers
|
|
269
|
+
# ({DependencyGraph#dependencies_of}, {GraphAnalyzer}) key on symbols.
|
|
270
|
+
# - {Woods::MCP::IndexReader.normalize_all_edges} runs on parsed JSON,
|
|
271
|
+
# producing `{ 'target' => ..., 'via' => ... }` with STRING keys,
|
|
272
|
+
# because the MCP tools serialize straight through to the client and
|
|
273
|
+
# symbol keys would become `:target` on the wire.
|
|
274
|
+
#
|
|
275
|
+
# This method also accepts OLD-format bare-string edges so graphs
|
|
276
|
+
# serialized before the `{target, via}` migration still load without
|
|
277
|
+
# explicit data conversion.
|
|
278
|
+
#
|
|
279
|
+
# @param edges [Array] Edge entries — either strings or hashes
|
|
280
|
+
# @return [Array<Hash>] Normalized edges with :target and :via keys
|
|
281
|
+
def self.normalize_edges(edges)
|
|
282
|
+
return [] unless edges.is_a?(Array)
|
|
283
|
+
|
|
284
|
+
edges.map do |edge|
|
|
285
|
+
if edge.is_a?(String)
|
|
286
|
+
{ target: edge, via: nil }
|
|
287
|
+
elsif edge.is_a?(Hash)
|
|
288
|
+
{ target: edge[:target] || edge['target'], via: (edge[:via] || edge['via'])&.to_sym }
|
|
289
|
+
else
|
|
290
|
+
{ target: edge.to_s, via: nil }
|
|
291
|
+
end
|
|
292
|
+
end
|
|
293
|
+
end
|
|
235
294
|
end
|
|
236
295
|
end
|
|
@@ -2,27 +2,65 @@
|
|
|
2
2
|
|
|
3
3
|
require 'json'
|
|
4
4
|
require 'digest'
|
|
5
|
+
require 'fileutils'
|
|
6
|
+
|
|
7
|
+
require_relative '../extracted_unit'
|
|
8
|
+
require_relative '../chunking/semantic_chunker'
|
|
5
9
|
|
|
6
10
|
module Woods
|
|
7
11
|
module Embedding
|
|
8
12
|
# Orchestrates the indexing pipeline: reads extracted units, prepares text,
|
|
9
13
|
# generates embeddings, and stores vectors. Supports full and incremental
|
|
10
14
|
# modes with checkpoint-based resumability.
|
|
11
|
-
|
|
15
|
+
#
|
|
16
|
+
# When the vector store is an in-memory adapter (responds to +#each_entry+
|
|
17
|
+
# and +#bulk_load+) and +output_dir+ is set, a successful {#index_all} run
|
|
18
|
+
# also persists the stores to disk via the Snapshotter pair and atomically
|
|
19
|
+
# flips the +dumps/latest+ pointer. Persistent backends (pgvector, Qdrant)
|
|
20
|
+
# see zero behaviour change — no Snapshotter is invoked.
|
|
21
|
+
class Indexer # rubocop:disable Metrics/ClassLength
|
|
22
|
+
# @param chunker [Chunking::SemanticChunker, nil] Splits oversize units
|
|
23
|
+
# into semantically coherent chunks before embedding. +nil+ disables
|
|
24
|
+
# chunking — units go to the provider whole (useful in tests).
|
|
12
25
|
# @param checkpoint_interval [Integer] Save checkpoint every N batches (default: 10)
|
|
13
|
-
|
|
26
|
+
# @param metadata_store [#each_entry, #bulk_load, nil] Optional metadata store.
|
|
27
|
+
# When present alongside an in-memory vector store, both are persisted
|
|
28
|
+
# at the end of a successful {#index_all} run.
|
|
29
|
+
# @param resolved_config [Woods::ResolvedConfig, nil] Captured config for
|
|
30
|
+
# +woods.json+ — written to +output_dir+ on {#index_all} completion.
|
|
31
|
+
# @param dump_retention_count [Integer] Number of completed dump directories
|
|
32
|
+
# to keep under +output_dir/dumps/+. Older dumps are removed after a
|
|
33
|
+
# successful {#index_all} run (default: 3).
|
|
34
|
+
def initialize(provider:, text_preparer:, vector_store:, output_dir:, # rubocop:disable Metrics/ParameterLists
|
|
35
|
+
chunker: Chunking::SemanticChunker.new,
|
|
36
|
+
batch_size: 32, checkpoint_interval: 10,
|
|
37
|
+
metadata_store: nil,
|
|
38
|
+
resolved_config: nil,
|
|
39
|
+
dump_retention_count: 3)
|
|
14
40
|
@provider = provider
|
|
15
41
|
@text_preparer = text_preparer
|
|
16
42
|
@vector_store = vector_store
|
|
17
43
|
@output_dir = output_dir
|
|
44
|
+
@chunker = chunker
|
|
18
45
|
@batch_size = batch_size
|
|
19
46
|
@checkpoint_interval = checkpoint_interval
|
|
47
|
+
@metadata_store = metadata_store
|
|
48
|
+
@resolved_config = resolved_config
|
|
49
|
+
@dump_retention_count = dump_retention_count
|
|
20
50
|
end
|
|
21
51
|
|
|
22
52
|
# Index all extracted units (full mode). Returns stats hash.
|
|
53
|
+
#
|
|
54
|
+
# When the vector store is an in-memory adapter, persists the embedded
|
|
55
|
+
# vectors (and metadata, if a metadata store was provided) to disk under
|
|
56
|
+
# +output_dir/dumps/<timestamp>/+ and atomically flips the +latest+
|
|
57
|
+
# pointer. Writes +woods.json+ when +resolved_config+ was supplied.
|
|
58
|
+
#
|
|
23
59
|
# @return [Hash] Stats with :processed, :skipped, :errors counts
|
|
24
60
|
def index_all
|
|
25
|
-
process_units(load_units, incremental: false)
|
|
61
|
+
stats = process_units(load_units, incremental: false)
|
|
62
|
+
persist_snapshot if persistable?
|
|
63
|
+
stats
|
|
26
64
|
end
|
|
27
65
|
|
|
28
66
|
# Index only changed units (incremental mode). Returns stats hash.
|
|
@@ -37,7 +75,11 @@ module Woods
|
|
|
37
75
|
Dir.glob(File.join(@output_dir, '**', '*.json')).filter_map do |path|
|
|
38
76
|
next if File.basename(path) == 'checkpoint.json'
|
|
39
77
|
|
|
40
|
-
JSON.parse(File.read(path))
|
|
78
|
+
data = JSON.parse(File.read(path))
|
|
79
|
+
# Extraction output also contains index listings (_index.json arrays) and
|
|
80
|
+
# summary files (manifest.json, dependency_graph.json, graph_analysis.json)
|
|
81
|
+
# that live alongside per-unit JSON. Filter to the unit shape.
|
|
82
|
+
data if data.is_a?(Hash) && data.key?('type') && data.key?('identifier')
|
|
41
83
|
rescue JSON::ParserError
|
|
42
84
|
nil
|
|
43
85
|
end
|
|
@@ -62,6 +104,12 @@ module Woods
|
|
|
62
104
|
|
|
63
105
|
def process_batch(batch, checkpoint, stats, incremental:)
|
|
64
106
|
to_embed = batch.each_with_object([]) do |unit_data, items|
|
|
107
|
+
persist_unit_metadata(unit_data)
|
|
108
|
+
# Incremental skip uses `source_hash`, which the extractor derives
|
|
109
|
+
# from the unit's *source_code string only* (see ExtractedUnit#to_h
|
|
110
|
+
# and Extractor#dump_units). It is NOT a hash of the serialized
|
|
111
|
+
# unit_data JSON — so key ordering or whitespace in the _index.json
|
|
112
|
+
# does not invalidate checkpoints across Ruby-minor upgrades.
|
|
65
113
|
if incremental && checkpoint[unit_data['identifier']] == unit_data['source_hash']
|
|
66
114
|
stats[:skipped] += 1
|
|
67
115
|
next
|
|
@@ -72,6 +120,20 @@ module Woods
|
|
|
72
120
|
embed_and_store(to_embed, checkpoint, stats)
|
|
73
121
|
end
|
|
74
122
|
|
|
123
|
+
# Persist a unit's metadata under its base identifier so retrieval can
|
|
124
|
+
# resolve vector-search hits back to their unit data. Without this,
|
|
125
|
+
# the metadata store is left empty at end of run — Snapshotter::Metadata
|
|
126
|
+
# dumps a header with record_count: 0 and every MCP +codebase_retrieve+
|
|
127
|
+
# call silently returns empty text, because ContextAssembler#find_batch
|
|
128
|
+
# misses every candidate identifier. No-op when metadata_store is nil
|
|
129
|
+
# (hosts that don't configure one). Stored under the base identifier,
|
|
130
|
+
# not the chunk-suffixed id — chunks are an embedding-side concern only.
|
|
131
|
+
def persist_unit_metadata(unit_data)
|
|
132
|
+
return unless @metadata_store
|
|
133
|
+
|
|
134
|
+
@metadata_store.store(unit_data['identifier'], unit_data)
|
|
135
|
+
end
|
|
136
|
+
|
|
75
137
|
def collect_embed_items(unit_data, items)
|
|
76
138
|
texts = prepare_texts(unit_data)
|
|
77
139
|
identifier = unit_data['identifier']
|
|
@@ -83,9 +145,71 @@ module Woods
|
|
|
83
145
|
end
|
|
84
146
|
end
|
|
85
147
|
|
|
86
|
-
def prepare_texts(unit_data)
|
|
148
|
+
def prepare_texts(unit_data) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
87
149
|
unit = build_unit(unit_data)
|
|
88
|
-
unit
|
|
150
|
+
apply_chunking(unit) if @chunker && unit.chunks.empty? && needs_chunking?(unit)
|
|
151
|
+
# Extraction may have emitted chunks larger than the provider's
|
|
152
|
+
# budget (rails_source in particular). Enforce the ceiling on
|
|
153
|
+
# whatever chunks we have before handing off to the provider.
|
|
154
|
+
@chunker&.enforce_chunk_limits!(unit) if unit.chunks.any?
|
|
155
|
+
texts = unit.chunks.any? ? @text_preparer.prepare_chunks(unit) : [@text_preparer.prepare(unit)]
|
|
156
|
+
# Drop empty/whitespace-only texts — embedding providers reject
|
|
157
|
+
# them with 400 and retrying never succeeds. Unit is effectively
|
|
158
|
+
# skipped when every text is empty (zero-source unit).
|
|
159
|
+
texts.reject { |t| t.nil? || t.strip.empty? || content_portion_empty?(t, unit) }
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
# True when a prepared text is just the metadata prefix with no
|
|
163
|
+
# underlying source content (empty source_code + empty chunks).
|
|
164
|
+
# Avoids embedding prefix-only stubs that have no semantic value
|
|
165
|
+
# and would poison the vector space with identical headers.
|
|
166
|
+
def content_portion_empty?(text, unit)
|
|
167
|
+
return false unless unit.chunks.empty?
|
|
168
|
+
return false unless unit.source_code.nil? || unit.source_code.strip.empty?
|
|
169
|
+
|
|
170
|
+
!text.nil?
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
# Does this unit exceed the embedding provider's single-input
|
|
174
|
+
# budget? Returns false when the provider reports no budget, when
|
|
175
|
+
# the TextPreparer has no calibrated chars-per-token ratio, or when
|
|
176
|
+
# the unit's source fits.
|
|
177
|
+
#
|
|
178
|
+
# When the configured chunker carries a real tokenizer
|
|
179
|
+
# (Embedding::TokenCounter) we also consult it — dense Ruby source
|
|
180
|
+
# produces more tokens than chars/token averages suggest, and Ollama
|
|
181
|
+
# rejects over-budget input outright (see ollama/ollama#14186).
|
|
182
|
+
def needs_chunking?(unit)
|
|
183
|
+
budget_tokens = @provider.respond_to?(:max_input_tokens) ? @provider.max_input_tokens : nil
|
|
184
|
+
return false if budget_tokens.nil?
|
|
185
|
+
return false unless @text_preparer.respond_to?(:chars_per_token)
|
|
186
|
+
|
|
187
|
+
source = unit.source_code || ''
|
|
188
|
+
return true if chunker_token_oversize?(source)
|
|
189
|
+
|
|
190
|
+
# Subtract a small prefix allowance — the TextPreparer adds a few
|
|
191
|
+
# hundred characters of context header ([type] identifier / file /
|
|
192
|
+
# dependencies) that count toward the budget too.
|
|
193
|
+
char_budget = (budget_tokens * @text_preparer.chars_per_token).floor - PREFIX_CHAR_ALLOWANCE
|
|
194
|
+
char_budget.positive? && source.length > char_budget
|
|
195
|
+
end
|
|
196
|
+
|
|
197
|
+
# Ask the chunker's real tokenizer whether +source+ already exceeds
|
|
198
|
+
# the token budget. Returns false when the chunker wasn't built with
|
|
199
|
+
# one (e.g., OpenAI path), leaving the char-based check in charge.
|
|
200
|
+
def chunker_token_oversize?(source)
|
|
201
|
+
return false unless @chunker&.token_counter && @chunker.max_tokens
|
|
202
|
+
|
|
203
|
+
@chunker.token_counter.count(source) > @chunker.max_tokens
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
# Populate unit.chunks from the configured chunker. The chunker's
|
|
207
|
+
# own +max_chars+ safety net is what guarantees each chunk fits,
|
|
208
|
+
# so no separate char budget is passed at this call site.
|
|
209
|
+
def apply_chunking(unit)
|
|
210
|
+
unit.chunks = @chunker.chunk(unit).map do |chunk|
|
|
211
|
+
{ content: chunk.content, chunk_type: chunk.chunk_type }
|
|
212
|
+
end
|
|
89
213
|
end
|
|
90
214
|
|
|
91
215
|
def build_unit(data)
|
|
@@ -98,6 +222,12 @@ module Woods
|
|
|
98
222
|
unit
|
|
99
223
|
end
|
|
100
224
|
|
|
225
|
+
# Character budget reserved for the TextPreparer context prefix
|
|
226
|
+
# ("[type] id / namespace / file / dependencies: …"). Typical
|
|
227
|
+
# prefixes run ~200–400 chars; 512 gives room to spare.
|
|
228
|
+
PREFIX_CHAR_ALLOWANCE = 512
|
|
229
|
+
private_constant :PREFIX_CHAR_ALLOWANCE
|
|
230
|
+
|
|
101
231
|
def embed_and_store(items, checkpoint, stats)
|
|
102
232
|
return if items.empty?
|
|
103
233
|
|
|
@@ -135,6 +265,60 @@ module Woods
|
|
|
135
265
|
def save_checkpoint(checkpoint)
|
|
136
266
|
File.write(File.join(@output_dir, 'checkpoint.json'), JSON.generate(checkpoint))
|
|
137
267
|
end
|
|
268
|
+
|
|
269
|
+
# Returns true when the vector store is an in-memory adapter that supports
|
|
270
|
+
# the persistence seam (+#each_entry+ / +#bulk_load+) and output_dir is set.
|
|
271
|
+
# Persistent backends (pgvector, Qdrant) never respond to +#each_entry+.
|
|
272
|
+
def persistable?
|
|
273
|
+
@output_dir &&
|
|
274
|
+
@vector_store.respond_to?(:each_entry) &&
|
|
275
|
+
@vector_store.respond_to?(:bulk_load)
|
|
276
|
+
end
|
|
277
|
+
|
|
278
|
+
# Persist stores to a timestamped dump directory, write +woods.json+,
|
|
279
|
+
# flip the +latest+ pointer, then prune old dumps.
|
|
280
|
+
def persist_snapshot
|
|
281
|
+
require_relative '../index_artifact'
|
|
282
|
+
require_relative '../storage/snapshotter'
|
|
283
|
+
|
|
284
|
+
artifact = IndexArtifact.new(@output_dir)
|
|
285
|
+
dump_dir = artifact.new_dump_dir
|
|
286
|
+
|
|
287
|
+
Storage::Snapshotter::Vector.dump(@vector_store, artifact, dump_dir)
|
|
288
|
+
|
|
289
|
+
if @metadata_store.respond_to?(:each_entry) && @metadata_store.respond_to?(:bulk_load)
|
|
290
|
+
Storage::Snapshotter::Metadata.dump(@metadata_store, artifact, dump_dir)
|
|
291
|
+
end
|
|
292
|
+
|
|
293
|
+
artifact.write_config(@resolved_config) if @resolved_config
|
|
294
|
+
|
|
295
|
+
artifact.promote(dump_dir)
|
|
296
|
+
|
|
297
|
+
prune_old_dumps(artifact)
|
|
298
|
+
end
|
|
299
|
+
|
|
300
|
+
# Remove old dump directories beyond the retention window.
|
|
301
|
+
#
|
|
302
|
+
# Keeps the +@dump_retention_count+ most-recently-created directories
|
|
303
|
+
# (sorted by name, which is a UTC timestamp so lexicographic order equals
|
|
304
|
+
# chronological order). The current +latest+ directory is always kept.
|
|
305
|
+
def prune_old_dumps(artifact)
|
|
306
|
+
return if @dump_retention_count.nil? || @dump_retention_count <= 0
|
|
307
|
+
|
|
308
|
+
dumps_root = artifact.dumps_root
|
|
309
|
+
return unless dumps_root.exist?
|
|
310
|
+
|
|
311
|
+
dirs = sorted_dump_dirs(dumps_root)
|
|
312
|
+
excess = dirs.length - @dump_retention_count
|
|
313
|
+
dirs.first(excess).each { |dir| FileUtils.rm_rf(dir) } if excess.positive?
|
|
314
|
+
end
|
|
315
|
+
|
|
316
|
+
def sorted_dump_dirs(dumps_root)
|
|
317
|
+
dumps_root.children
|
|
318
|
+
.select(&:directory?)
|
|
319
|
+
.sort_by(&:basename)
|
|
320
|
+
.map(&:to_s)
|
|
321
|
+
end
|
|
138
322
|
end
|
|
139
323
|
end
|
|
140
324
|
end
|
|
@@ -24,6 +24,12 @@ module Woods
|
|
|
24
24
|
'text-embedding-3-small' => 1536,
|
|
25
25
|
'text-embedding-3-large' => 3072
|
|
26
26
|
}.freeze
|
|
27
|
+
# OpenAI embedding models share an 8191-token input cap across
|
|
28
|
+
# text-embedding-3-small / -3-large / ada-002. The chunker uses
|
|
29
|
+
# this as a hard ceiling — the actual chunk size lands well
|
|
30
|
+
# below it once chars-per-token estimation and the prefix
|
|
31
|
+
# allowance are factored in (see Builder#build_chunker).
|
|
32
|
+
MAX_INPUT_TOKENS = 8191
|
|
27
33
|
|
|
28
34
|
# @param api_key [String] OpenAI API key
|
|
29
35
|
# @param model [String] OpenAI embedding model name (default: text-embedding-3-small)
|
|
@@ -37,7 +43,10 @@ module Woods
|
|
|
37
43
|
# @param text [String] the text to embed
|
|
38
44
|
# @return [Array<Float>] the embedding vector
|
|
39
45
|
# @raise [Woods::Error] if the API returns an error
|
|
46
|
+
# @raise [ArgumentError] if the text is nil or empty (OpenAI rejects these with 400)
|
|
40
47
|
def embed(text)
|
|
48
|
+
raise ArgumentError, 'embed(text) requires a non-empty string' if text.nil? || text.to_s.strip.empty?
|
|
49
|
+
|
|
41
50
|
response = post_request({ model: @model, input: text })
|
|
42
51
|
response['data'].first['embedding']
|
|
43
52
|
end
|
|
@@ -49,7 +58,13 @@ module Woods
|
|
|
49
58
|
# @param texts [Array<String>] the texts to embed
|
|
50
59
|
# @return [Array<Array<Float>>] array of embedding vectors
|
|
51
60
|
# @raise [Woods::Error] if the API returns an error
|
|
52
|
-
|
|
61
|
+
# @raise [ArgumentError] if the array is empty or any element is nil/empty
|
|
62
|
+
def embed_batch(texts) # rubocop:disable Metrics/CyclomaticComplexity
|
|
63
|
+
raise ArgumentError, 'embed_batch(texts) requires a non-empty array' if texts.nil? || texts.empty?
|
|
64
|
+
if texts.any? { |t| t.nil? || t.to_s.strip.empty? }
|
|
65
|
+
raise ArgumentError, 'embed_batch(texts) rejects nil/empty entries (OpenAI returns 400)'
|
|
66
|
+
end
|
|
67
|
+
|
|
53
68
|
response = post_request({ model: @model, input: texts })
|
|
54
69
|
response['data']
|
|
55
70
|
.sort_by { |item| item['index'] }
|
|
@@ -73,14 +88,35 @@ module Woods
|
|
|
73
88
|
@model
|
|
74
89
|
end
|
|
75
90
|
|
|
91
|
+
# Maximum input length OpenAI will accept for a single embedding
|
|
92
|
+
# text. All current text-embedding-* models cap at ~8k tokens.
|
|
93
|
+
#
|
|
94
|
+
# @return [Integer]
|
|
95
|
+
def max_input_tokens
|
|
96
|
+
MAX_INPUT_TOKENS
|
|
97
|
+
end
|
|
98
|
+
|
|
76
99
|
private
|
|
77
100
|
|
|
101
|
+
# Cap interpolated response bodies so misconfigured API errors
|
|
102
|
+
# (which occasionally echo request metadata, including headers) don't
|
|
103
|
+
# leak without bound into logs or re-raised messages.
|
|
104
|
+
#
|
|
105
|
+
# @param body [String, nil]
|
|
106
|
+
# @return [String]
|
|
107
|
+
def truncate_response_body(body)
|
|
108
|
+
return '' if body.nil?
|
|
109
|
+
|
|
110
|
+
s = body.to_s
|
|
111
|
+
s.length > 500 ? "#{s[0, 500]}... [truncated]" : s
|
|
112
|
+
end
|
|
113
|
+
|
|
78
114
|
# Send a POST request to the OpenAI embeddings API.
|
|
79
115
|
#
|
|
80
116
|
# @param body [Hash] request body
|
|
81
117
|
# @return [Hash] parsed JSON response
|
|
82
118
|
# @raise [Woods::Error] if the API returns a non-success status
|
|
83
|
-
def post_request(body)
|
|
119
|
+
def post_request(body) # rubocop:disable Metrics/AbcSize
|
|
84
120
|
request = Net::HTTP::Post.new(ENDPOINT.path)
|
|
85
121
|
request['Content-Type'] = 'application/json'
|
|
86
122
|
request['Authorization'] = "Bearer #{@api_key}"
|
|
@@ -89,7 +125,7 @@ module Woods
|
|
|
89
125
|
response = http_client.request(request)
|
|
90
126
|
|
|
91
127
|
unless response.is_a?(Net::HTTPSuccess)
|
|
92
|
-
raise Woods::Error, "OpenAI API error: #{response.code} #{response.body}"
|
|
128
|
+
raise Woods::Error, "OpenAI API error: #{response.code} #{truncate_response_body(response.body)}"
|
|
93
129
|
end
|
|
94
130
|
|
|
95
131
|
JSON.parse(response.body)
|
|
@@ -98,7 +134,7 @@ module Woods
|
|
|
98
134
|
@http_client = nil
|
|
99
135
|
response = http_client.request(request)
|
|
100
136
|
unless response.is_a?(Net::HTTPSuccess)
|
|
101
|
-
raise Woods::Error, "OpenAI API error: #{response.code} #{response.body}"
|
|
137
|
+
raise Woods::Error, "OpenAI API error: #{response.code} #{truncate_response_body(response.body)}"
|
|
102
138
|
end
|
|
103
139
|
|
|
104
140
|
JSON.parse(response.body)
|
|
@@ -49,6 +49,16 @@ module Woods
|
|
|
49
49
|
def model_name
|
|
50
50
|
raise NotImplementedError
|
|
51
51
|
end
|
|
52
|
+
|
|
53
|
+
# Return the maximum input length the provider will accept for a
|
|
54
|
+
# single text, in tokens. Used by the indexer to decide when a unit
|
|
55
|
+
# must be chunked before embedding.
|
|
56
|
+
#
|
|
57
|
+
# @return [Integer, nil] token budget, or nil if the provider has no hard cap
|
|
58
|
+
# @raise [NotImplementedError] if not implemented by the provider
|
|
59
|
+
def max_input_tokens
|
|
60
|
+
raise NotImplementedError
|
|
61
|
+
end
|
|
52
62
|
end
|
|
53
63
|
|
|
54
64
|
# Ollama adapter for local embeddings via the Ollama HTTP API.
|
|
@@ -66,11 +76,56 @@ module Woods
|
|
|
66
76
|
DEFAULT_MODEL = 'nomic-embed-text'
|
|
67
77
|
DEFAULT_HOST = 'http://localhost:11434'
|
|
68
78
|
|
|
69
|
-
#
|
|
79
|
+
# Ollama enforces the model's native context length on `/api/embed`
|
|
80
|
+
# regardless of the `num_ctx` override — we've validated this
|
|
81
|
+
# against 0.15.x for nomic-embed-text (rejects >2048) and bge-m3
|
|
82
|
+
# (accepts up to 8192, silently truncates above). Advertise the
|
|
83
|
+
# native ceiling so the chunker can size inputs correctly. Models
|
|
84
|
+
# outside this registry fall back to Ollama's conservative 2048
|
|
85
|
+
# default.
|
|
86
|
+
#
|
|
87
|
+
# See `docs/EMBEDDING_MODELS.md` for the tradeoff matrix and
|
|
88
|
+
# instructions for adding a new model here.
|
|
89
|
+
MODEL_CONTEXT_LENGTHS = {
|
|
90
|
+
'nomic-embed-text' => 2048,
|
|
91
|
+
'bge-m3' => 8192,
|
|
92
|
+
'mxbai-embed-large' => 512,
|
|
93
|
+
'snowflake-arctic-embed' => 512,
|
|
94
|
+
'snowflake-arctic-embed2' => 8192,
|
|
95
|
+
# all-minilm: 512 is the model's context length, NOT the 384
|
|
96
|
+
# embedding dimension and NOT the 256 some sources confuse with
|
|
97
|
+
# the dimension. With a 256-token budget the chunker formula
|
|
98
|
+
# produces a negative max_chars and silently drops every chunk.
|
|
99
|
+
'all-minilm' => 512
|
|
100
|
+
}.freeze
|
|
101
|
+
|
|
102
|
+
# Fallback when the configured model isn't in the registry.
|
|
103
|
+
FALLBACK_NUM_CTX = 2048
|
|
104
|
+
|
|
105
|
+
# Default read timeout for /api/embed. The previous 30s default
|
|
106
|
+
# was too short for batched embed calls on cold models — Ollama
|
|
107
|
+
# has to load the model on first call, and an N-item batch can
|
|
108
|
+
# easily exceed 30s on a CPU-only host. 120s leaves headroom
|
|
109
|
+
# without wedging the whole pipeline on a genuinely dead server.
|
|
110
|
+
DEFAULT_READ_TIMEOUT = 120
|
|
111
|
+
|
|
112
|
+
# @param model [String] Ollama model name (default: nomic-embed-text).
|
|
113
|
+
# Set to `"bge-m3"` or `"snowflake-arctic-embed2"` for an 8192-token
|
|
114
|
+
# context and skip most chunking for dense Rails units.
|
|
70
115
|
# @param host [String] Ollama server URL (default: http://localhost:11434)
|
|
71
|
-
|
|
116
|
+
# @param num_ctx [Integer, nil] Ollama context window in tokens. When
|
|
117
|
+
# `nil` (the default), the provider picks the model's native
|
|
118
|
+
# context from `MODEL_CONTEXT_LENGTHS`, falling back to 2048 for
|
|
119
|
+
# unknown models. Set explicitly only if running a model with a
|
|
120
|
+
# known-larger native context that isn't in the registry yet.
|
|
121
|
+
# @param read_timeout [Integer] HTTP read timeout in seconds.
|
|
122
|
+
# Bump this for slow / cold-start hosts or very large batches.
|
|
123
|
+
def initialize(model: DEFAULT_MODEL, host: DEFAULT_HOST, num_ctx: nil,
|
|
124
|
+
read_timeout: DEFAULT_READ_TIMEOUT)
|
|
72
125
|
@model = model
|
|
73
126
|
@host = host
|
|
127
|
+
@num_ctx = num_ctx || MODEL_CONTEXT_LENGTHS.fetch(model, FALLBACK_NUM_CTX)
|
|
128
|
+
@read_timeout = read_timeout
|
|
74
129
|
@uri = URI("#{host}/api/embed")
|
|
75
130
|
end
|
|
76
131
|
|
|
@@ -79,8 +134,11 @@ module Woods
|
|
|
79
134
|
# @param text [String] the text to embed
|
|
80
135
|
# @return [Array<Float>] the embedding vector
|
|
81
136
|
# @raise [Woods::Error] if the API returns an error
|
|
137
|
+
# @raise [ArgumentError] if the text is nil or empty (avoids provider 400)
|
|
82
138
|
def embed(text)
|
|
83
|
-
|
|
139
|
+
raise ArgumentError, 'embed(text) requires a non-empty string' if text.nil? || text.to_s.strip.empty?
|
|
140
|
+
|
|
141
|
+
response = post_request(build_body(text))
|
|
84
142
|
response['embeddings'].first
|
|
85
143
|
end
|
|
86
144
|
|
|
@@ -89,8 +147,14 @@ module Woods
|
|
|
89
147
|
# @param texts [Array<String>] the texts to embed
|
|
90
148
|
# @return [Array<Array<Float>>] array of embedding vectors
|
|
91
149
|
# @raise [Woods::Error] if the API returns an error
|
|
150
|
+
# @raise [ArgumentError] if the array is empty or any element is nil/empty
|
|
92
151
|
def embed_batch(texts)
|
|
93
|
-
|
|
152
|
+
raise ArgumentError, 'embed_batch(texts) requires a non-empty array' if texts.nil? || texts.empty?
|
|
153
|
+
if texts.any? { |t| t.nil? || t.to_s.strip.empty? }
|
|
154
|
+
raise ArgumentError, 'embed_batch(texts) rejects nil/empty entries'
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
response = post_request(build_body(texts))
|
|
94
158
|
response['embeddings']
|
|
95
159
|
end
|
|
96
160
|
|
|
@@ -110,20 +174,52 @@ module Woods
|
|
|
110
174
|
@model
|
|
111
175
|
end
|
|
112
176
|
|
|
177
|
+
# Maximum input length Ollama will accept — tracks the configured
|
|
178
|
+
# context window. Always populated: the constructor resolves
|
|
179
|
+
# `num_ctx` to the model's registry entry or {FALLBACK_NUM_CTX},
|
|
180
|
+
# so this method never returns nil for an Ollama provider.
|
|
181
|
+
#
|
|
182
|
+
# @return [Integer]
|
|
183
|
+
def max_input_tokens
|
|
184
|
+
@num_ctx
|
|
185
|
+
end
|
|
186
|
+
|
|
113
187
|
private
|
|
114
188
|
|
|
189
|
+
# Cap interpolated response bodies so misconfigured Ollama responses
|
|
190
|
+
# (e.g. proxied HTML error pages) don't leak unbounded into logs or
|
|
191
|
+
# re-raised error messages.
|
|
192
|
+
#
|
|
193
|
+
# @param body [String, nil]
|
|
194
|
+
# @return [String]
|
|
195
|
+
def truncate_response_body(body)
|
|
196
|
+
return '' if body.nil?
|
|
197
|
+
|
|
198
|
+
s = body.to_s
|
|
199
|
+
s.length > 500 ? "#{s[0, 500]}... [truncated]" : s
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
# Build the JSON body for an `/api/embed` call. Adds `options.num_ctx`
|
|
203
|
+
# when configured — without it, Ollama silently truncates to 2048
|
|
204
|
+
# tokens and returns 400 when the input exceeds that default.
|
|
205
|
+
def build_body(input)
|
|
206
|
+
body = { model: @model, input: input }
|
|
207
|
+
body[:options] = { num_ctx: @num_ctx } if @num_ctx
|
|
208
|
+
body
|
|
209
|
+
end
|
|
210
|
+
|
|
115
211
|
# Send a POST request to the Ollama API.
|
|
116
212
|
#
|
|
117
213
|
# @param body [Hash] request body
|
|
118
214
|
# @return [Hash] parsed JSON response
|
|
119
215
|
# @raise [Woods::Error] if the API returns a non-success status
|
|
120
|
-
def post_request(body)
|
|
216
|
+
def post_request(body) # rubocop:disable Metrics/AbcSize
|
|
121
217
|
request = Net::HTTP::Post.new(@uri.path, 'Content-Type' => 'application/json')
|
|
122
218
|
request.body = body.to_json
|
|
123
219
|
response = http_client.request(request)
|
|
124
220
|
|
|
125
221
|
unless response.is_a?(Net::HTTPSuccess)
|
|
126
|
-
raise Woods::Error, "Ollama API error: #{response.code} #{response.body}"
|
|
222
|
+
raise Woods::Error, "Ollama API error: #{response.code} #{truncate_response_body(response.body)}"
|
|
127
223
|
end
|
|
128
224
|
|
|
129
225
|
JSON.parse(response.body)
|
|
@@ -136,7 +232,7 @@ module Woods
|
|
|
136
232
|
raise Woods::Error, "Ollama API error (retry failed): #{retry_error.message}"
|
|
137
233
|
end
|
|
138
234
|
unless response.is_a?(Net::HTTPSuccess)
|
|
139
|
-
raise Woods::Error, "Ollama API error: #{response.code} #{response.body}"
|
|
235
|
+
raise Woods::Error, "Ollama API error: #{response.code} #{truncate_response_body(response.body)}"
|
|
140
236
|
end
|
|
141
237
|
|
|
142
238
|
JSON.parse(response.body)
|
|
@@ -151,7 +247,7 @@ module Woods
|
|
|
151
247
|
http = Net::HTTP.new(@uri.host, @uri.port)
|
|
152
248
|
http.use_ssl = @uri.scheme == 'https'
|
|
153
249
|
http.open_timeout = 10
|
|
154
|
-
http.read_timeout = 30
|
|
250
|
+
http.read_timeout = @read_timeout
|
|
155
251
|
http.keep_alive_timeout = 30
|
|
156
252
|
http.start
|
|
157
253
|
@http_client = http
|