woods 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +169 -0
  3. data/README.md +20 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +15 -50
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +6 -0
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +3 -4
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +737 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +1 -1
  102. data/lib/woods/unblocked/document_builder.rb +35 -10
  103. data/lib/woods/unblocked/exporter.rb +1 -1
  104. data/lib/woods/util/host_guard.rb +61 -0
  105. data/lib/woods/version.rb +1 -1
  106. data/lib/woods.rb +126 -6
  107. metadata +69 -4
@@ -23,8 +23,9 @@ module Woods
23
23
  class DependencyGraph
24
24
  def initialize
25
25
  @nodes = {} # identifier => { type:, file_path: }
26
- @edges = {} # identifier => [dependency identifiers]
26
+ @edges = {} # identifier => [{ target:, via: }]
27
27
  @reverse = {} # identifier => Set of dependent identifiers
28
+ @reverse_via = {} # [target, via] => Set of dependent identifiers
28
29
  @file_map = {} # file_path => identifier
29
30
  @type_index = {} # type => Set of identifiers
30
31
  @to_h = nil
@@ -42,7 +43,7 @@ module Woods
42
43
  namespace: unit.namespace
43
44
  }
44
45
 
45
- @edges[unit.identifier] = unit.dependencies.map { |d| d[:target] }
46
+ @edges[unit.identifier] = unit.dependencies.map { |d| { target: d[:target], via: d[:via] } }
46
47
  @file_map[unit.file_path] = unit.identifier if unit.file_path
47
48
 
48
49
  # Type index for filtering (Set-based for O(1) insert)
@@ -51,6 +52,7 @@ module Woods
51
52
  # Build reverse edges (Set-based for O(1) insert)
52
53
  unit.dependencies.each do |dep|
53
54
  (@reverse[dep[:target]] ||= Set.new).add(unit.identifier)
55
+ (@reverse_via[[dep[:target], dep[:via]]] ||= Set.new).add(unit.identifier)
54
56
  end
55
57
  end
56
58
 
@@ -107,17 +109,28 @@ module Woods
107
109
  # Get direct dependencies of a unit
108
110
  #
109
111
  # @param identifier [String] Unit identifier
112
+ # @param via [Symbol, Array<Symbol>, nil] Filter by relationship type(s)
110
113
  # @return [Array<String>] List of dependency identifiers
111
- def dependencies_of(identifier)
112
- @edges[identifier] || []
114
+ def dependencies_of(identifier, via: nil)
115
+ edges = @edges[identifier] || []
116
+ if via
117
+ via_set = Array(via)
118
+ edges = edges.select { |e| via_set.include?(e[:via]) }
119
+ end
120
+ edges.map { |e| e[:target] }
113
121
  end
114
122
 
115
123
  # Get direct dependents of a unit (what depends on it)
116
124
  #
117
125
  # @param identifier [String] Unit identifier
126
+ # @param via [Symbol, Array<Symbol>, nil] Filter by relationship type(s)
118
127
  # @return [Array<String>] List of dependent identifiers
119
- def dependents_of(identifier)
120
- @reverse.fetch(identifier, Set.new).to_a
128
+ def dependents_of(identifier, via: nil)
129
+ return @reverse.fetch(identifier, Set.new).to_a unless via
130
+
131
+ Array(via).each_with_object(Set.new) do |v, result|
132
+ @reverse_via.fetch([identifier, v], Set.new).each { |dep| result.add(dep) }
133
+ end.to_a
121
134
  end
122
135
 
123
136
  # Get all units of a specific type
@@ -204,7 +217,8 @@ module Woods
204
217
  raw_nodes = data[:nodes] || data['nodes'] || {}
205
218
  graph.instance_variable_set(:@nodes, raw_nodes.transform_values { |v| symbolize_node(v) })
206
219
 
207
- graph.instance_variable_set(:@edges, data[:edges] || data['edges'] || {})
220
+ raw_edges = data[:edges] || data['edges'] || {}
221
+ graph.instance_variable_set(:@edges, raw_edges.transform_values { |edges| normalize_edges(edges) })
208
222
 
209
223
  raw_reverse = data[:reverse] || data['reverse'] || {}
210
224
  graph.instance_variable_set(:@reverse, raw_reverse.transform_values { |v| v.is_a?(Set) ? v : Set.new(v) })
@@ -216,6 +230,15 @@ module Woods
216
230
  v.is_a?(Set) ? v : Set.new(v)
217
231
  end)
218
232
 
233
+ # Rebuild reverse_via index from edges
234
+ reverse_via = {}
235
+ graph.instance_variable_get(:@edges).each do |source_id, edges|
236
+ edges.each do |edge|
237
+ (reverse_via[[edge[:target], edge[:via]]] ||= Set.new).add(source_id)
238
+ end
239
+ end
240
+ graph.instance_variable_set(:@reverse_via, reverse_via)
241
+
219
242
  graph
220
243
  end
221
244
 
@@ -232,5 +255,41 @@ module Woods
232
255
  namespace: node[:namespace] || node['namespace']
233
256
  }
234
257
  end
258
+
259
# Normalize edge data from either old format (bare strings) or new format (hashes).
#
# ROUND-TRIP INVARIANT (do not break when refactoring):
#   DependencyGraph#to_h -> JSON.generate -> JSON.parse -> DependencyGraph.from_h
# must always yield the same in-memory shape. The two normalizers that
# sit at either end of this round trip are INTENTIONALLY SEPARATE — do
# not merge them:
#
# - This method ({.normalize_edges}) runs on Ruby objects. It produces
#   `{ target:, via: }` with SYMBOL keys because consumers
#   ({DependencyGraph#dependencies_of}, {GraphAnalyzer}) key on symbols.
# - {Woods::MCP::IndexReader.normalize_all_edges} runs on parsed JSON,
#   producing `{ 'target' => ..., 'via' => ... }` with STRING keys,
#   because the MCP tools serialize straight through to the client and
#   symbol keys would become `:target` on the wire.
#
# This method also accepts OLD-format bare-string edges so graphs
# serialized before the `{target, via}` migration still load without
# explicit data conversion.
#
# @param edges [Array] Edge entries — either strings or hashes
# @return [Array<Hash>] Normalized edges with :target and :via keys;
#   an empty array when +edges+ is not an Array
def self.normalize_edges(edges)
  return [] unless edges.is_a?(Array)

  edges.map do |edge|
    case edge
    when String
      # Old format: a bare target identifier with no relationship type.
      { target: edge, via: nil }
    when Hash
      # Accept symbol keys (in-memory) and string keys (parsed JSON);
      # JSON turns the via symbol into a string, so convert it back.
      { target: edge[:target] || edge['target'], via: (edge[:via] || edge['via'])&.to_sym }
    else
      { target: edge.to_s, via: nil }
    end
  end
end
235
294
  end
236
295
  end
@@ -2,27 +2,65 @@
2
2
 
3
3
  require 'json'
4
4
  require 'digest'
5
+ require 'fileutils'
6
+
7
+ require_relative '../extracted_unit'
8
+ require_relative '../chunking/semantic_chunker'
5
9
 
6
10
  module Woods
7
11
  module Embedding
8
12
  # Orchestrates the indexing pipeline: reads extracted units, prepares text,
9
13
  # generates embeddings, and stores vectors. Supports full and incremental
10
14
  # modes with checkpoint-based resumability.
11
- class Indexer
15
+ #
16
+ # When the vector store is an in-memory adapter (responds to +#each_entry+
17
+ # and +#bulk_load+) and +output_dir+ is set, a successful {#index_all} run
18
+ # also persists the stores to disk via the Snapshotter pair and atomically
19
+ # flips the +dumps/latest+ pointer. Persistent backends (pgvector, Qdrant)
20
+ # see zero behaviour change — no Snapshotter is invoked.
21
+ class Indexer # rubocop:disable Metrics/ClassLength
22
# Build an indexer around the collaborators it orchestrates.
#
# @param provider [Object] Embedding provider used to turn text into vectors
# @param text_preparer [Object] Turns a unit (or its chunks) into embeddable text
# @param vector_store [Object] Destination for the generated vectors
# @param output_dir [String] Directory holding extracted unit JSON and the
#   checkpoint file; also the root for persisted dumps
# @param chunker [Chunking::SemanticChunker, nil] Splits oversize units
#   into semantically coherent chunks before embedding. +nil+ disables
#   chunking — units go to the provider whole (useful in tests).
# @param batch_size [Integer] Units per embedding batch (default: 32)
# @param checkpoint_interval [Integer] Save checkpoint every N batches (default: 10)
# @param metadata_store [#each_entry, #bulk_load, nil] Optional metadata store.
#   When present alongside an in-memory vector store, both are persisted
#   at the end of a successful {#index_all} run.
# @param resolved_config [Woods::ResolvedConfig, nil] Captured config for
#   +woods.json+ — written to +output_dir+ on {#index_all} completion.
# @param dump_retention_count [Integer] Number of completed dump directories
#   to keep under +output_dir/dumps/+. Older dumps are removed after a
#   successful {#index_all} run (default: 3).
def initialize(provider:, text_preparer:, vector_store:, output_dir:, # rubocop:disable Metrics/ParameterLists
               chunker: Chunking::SemanticChunker.new,
               batch_size: 32, checkpoint_interval: 10,
               metadata_store: nil,
               resolved_config: nil,
               dump_retention_count: 3)
  @provider = provider
  @text_preparer = text_preparer
  @vector_store = vector_store
  @output_dir = output_dir
  @chunker = chunker
  @batch_size = batch_size
  @checkpoint_interval = checkpoint_interval
  @metadata_store = metadata_store
  @resolved_config = resolved_config
  @dump_retention_count = dump_retention_count
end
21
51
 
22
52
  # Index all extracted units (full mode). Returns stats hash.
53
+ #
54
+ # When the vector store is an in-memory adapter, persists the embedded
55
+ # vectors (and metadata, if a metadata store was provided) to disk under
56
+ # +output_dir/dumps/<timestamp>/+ and atomically flips the +latest+
57
+ # pointer. Writes +woods.json+ when +resolved_config+ was supplied.
58
+ #
23
59
  # @return [Hash] Stats with :processed, :skipped, :errors counts
24
60
  def index_all
25
- process_units(load_units, incremental: false)
61
+ stats = process_units(load_units, incremental: false)
62
+ persist_snapshot if persistable?
63
+ stats
26
64
  end
27
65
 
28
66
  # Index only changed units (incremental mode). Returns stats hash.
@@ -37,7 +75,11 @@ module Woods
37
75
  Dir.glob(File.join(@output_dir, '**', '*.json')).filter_map do |path|
38
76
  next if File.basename(path) == 'checkpoint.json'
39
77
 
40
- JSON.parse(File.read(path))
78
+ data = JSON.parse(File.read(path))
79
+ # Extraction output also contains index listings (_index.json arrays) and
80
+ # summary files (manifest.json, dependency_graph.json, graph_analysis.json)
81
+ # that live alongside per-unit JSON. Filter to the unit shape.
82
+ data if data.is_a?(Hash) && data.key?('type') && data.key?('identifier')
41
83
  rescue JSON::ParserError
42
84
  nil
43
85
  end
@@ -62,6 +104,12 @@ module Woods
62
104
 
63
105
  def process_batch(batch, checkpoint, stats, incremental:)
64
106
  to_embed = batch.each_with_object([]) do |unit_data, items|
107
+ persist_unit_metadata(unit_data)
108
+ # Incremental skip uses `source_hash`, which the extractor derives
109
+ # from the unit's *source_code string only* (see ExtractedUnit#to_h
110
+ # and Extractor#dump_units). It is NOT a hash of the serialized
111
+ # unit_data JSON — so key ordering or whitespace in the _index.json
112
+ # does not invalidate checkpoints across Ruby-minor upgrades.
65
113
  if incremental && checkpoint[unit_data['identifier']] == unit_data['source_hash']
66
114
  stats[:skipped] += 1
67
115
  next
@@ -72,6 +120,20 @@ module Woods
72
120
  embed_and_store(to_embed, checkpoint, stats)
73
121
  end
74
122
 
123
# Persist a unit's metadata under its base identifier so retrieval can
# resolve vector-search hits back to their unit data. Without this,
# the metadata store is left empty at end of run — Snapshotter::Metadata
# dumps a header with record_count: 0 and every MCP +codebase_retrieve+
# call silently returns empty text, because ContextAssembler#find_batch
# misses every candidate identifier. No-op when metadata_store is nil
# (hosts that don't configure one). Stored under the base identifier,
# not the chunk-suffixed id — chunks are an embedding-side concern only.
#
# @param unit_data [Hash] Parsed unit JSON; read via its 'identifier' key
# @return [Object, nil] Whatever the store returns, or nil without a store
def persist_unit_metadata(unit_data)
  return unless @metadata_store

  @metadata_store.store(unit_data['identifier'], unit_data)
end
136
+
75
137
  def collect_embed_items(unit_data, items)
76
138
  texts = prepare_texts(unit_data)
77
139
  identifier = unit_data['identifier']
@@ -83,9 +145,71 @@ module Woods
83
145
  end
84
146
  end
85
147
 
86
# Turn one unit's JSON into the list of texts to embed.
#
# Chunks the unit first when it has no chunks and exceeds the provider
# budget, enforces the chunk size ceiling, then prepares either one text
# per chunk or a single whole-unit text.
#
# @param unit_data [Hash] Parsed unit JSON
# @return [Array<String>] Non-blank texts; empty when the unit has no content
def prepare_texts(unit_data) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  unit = build_unit(unit_data)
  # The previous implementation tolerated nil chunks (`unit.chunks&.any?`);
  # normalise here so the checks below keep that tolerance.
  # NOTE(review): build_unit is not visible in this diff — confirm whether
  # it can actually leave chunks nil.
  unit.chunks ||= []
  apply_chunking(unit) if @chunker && unit.chunks.empty? && needs_chunking?(unit)
  # Extraction may have emitted chunks larger than the provider's
  # budget (rails_source in particular). Enforce the ceiling on
  # whatever chunks we have before handing off to the provider.
  @chunker&.enforce_chunk_limits!(unit) if unit.chunks.any?
  texts = unit.chunks.any? ? @text_preparer.prepare_chunks(unit) : [@text_preparer.prepare(unit)]
  # Drop empty/whitespace-only texts — embedding providers reject
  # them with 400 and retrying never succeeds. Unit is effectively
  # skipped when every text is empty (zero-source unit).
  texts.reject { |t| t.nil? || t.strip.empty? || content_portion_empty?(t, unit) }
end
161
+
162
# True when a prepared text is just the metadata prefix with no
# underlying source content (empty source_code + empty chunks).
# Avoids embedding prefix-only stubs that have no semantic value
# and would poison the vector space with identical headers.
#
# @param text [String, nil] Prepared text for the unit
# @param unit [Object] Unit responding to +chunks+ and +source_code+
# @return [Boolean]
def content_portion_empty?(text, unit)
  return false unless unit.chunks.empty?
  # nil.to_s is "", so this covers both a missing and a blank source.
  return false unless unit.source_code.to_s.strip.empty?

  !text.nil?
end
172
+
173
# Does this unit exceed the embedding provider's single-input
# budget? Returns false when the provider reports no budget, when
# the TextPreparer has no calibrated chars-per-token ratio, or when
# the unit's source fits.
#
# When the configured chunker carries a real tokenizer
# (Embedding::TokenCounter) we also consult it — dense Ruby source
# tokenizes hotter than chars/token averages suggest, and Ollama
# rejects over-budget input outright (see ollama/ollama#14186).
#
# @param unit [Object] Unit responding to +source_code+
# @return [Boolean]
def needs_chunking?(unit)
  budget_tokens = @provider.respond_to?(:max_input_tokens) ? @provider.max_input_tokens : nil
  return false if budget_tokens.nil?
  return false unless @text_preparer.respond_to?(:chars_per_token)

  source = unit.source_code || ''
  return true if chunker_token_oversize?(source)

  # A preparer may expose the reader without a calibrated value; treat a
  # nil ratio as "no estimate" instead of raising on the multiplication.
  ratio = @text_preparer.chars_per_token
  return false if ratio.nil?

  # Subtract a small prefix allowance — the TextPreparer adds a few
  # hundred characters of context header ([type] identifier / file /
  # dependencies) that count toward the budget too.
  char_budget = (budget_tokens * ratio).floor - PREFIX_CHAR_ALLOWANCE
  char_budget.positive? && source.length > char_budget
end
196
+
197
+ # Ask the chunker's real tokenizer whether +source+ already exceeds
198
+ # the token budget. Returns false when the chunker wasn't built with
199
+ # one (e.g., OpenAI path), leaving the char-based check in charge.
200
+ def chunker_token_oversize?(source)
201
+ return false unless @chunker&.token_counter && @chunker.max_tokens
202
+
203
+ @chunker.token_counter.count(source) > @chunker.max_tokens
204
+ end
205
+
206
# Populate unit.chunks from the configured chunker.
#
# Each chunk object is reduced to a plain hash holding only the two
# fields read here (+content+ and +chunk_type+).
# NOTE(review): the original comment says the chunker's own +max_chars+
# safety net guarantees each chunk fits; no budget is passed in this
# call, so that guarantee must come from how the chunker was built.
#
# @param unit [Object] Unit with a writable +chunks+ attribute
# @return [Array<Hash>] The assigned chunk hashes
def apply_chunking(unit)
  unit.chunks = @chunker.chunk(unit).map do |piece|
    { content: piece.content, chunk_type: piece.chunk_type }
  end
end
90
214
 
91
215
  def build_unit(data)
@@ -98,6 +222,12 @@ module Woods
98
222
  unit
99
223
  end
100
224
 
225
# Character budget reserved for the TextPreparer context prefix
# ("[type] id / namespace / file / dependencies: …"). Typical
# prefixes run ~200–400 chars; 512 gives room to spare.
PREFIX_CHAR_ALLOWANCE = 512
private_constant :PREFIX_CHAR_ALLOWANCE
230
+
101
231
  def embed_and_store(items, checkpoint, stats)
102
232
  return if items.empty?
103
233
 
@@ -135,6 +265,60 @@ module Woods
135
265
  def save_checkpoint(checkpoint)
136
266
  File.write(File.join(@output_dir, 'checkpoint.json'), JSON.generate(checkpoint))
137
267
  end
268
+
269
# Returns true when the vector store is an in-memory adapter that supports
# the persistence seam (+#each_entry+ / +#bulk_load+) and output_dir is set.
# Persistent backends (pgvector, Qdrant) never respond to +#each_entry+.
#
# @return [Boolean] Always a strict true/false, never nil
def persistable?
  return false unless @output_dir

  @vector_store.respond_to?(:each_entry) && @vector_store.respond_to?(:bulk_load)
end
277
+
278
# Persist stores to a timestamped dump directory, write +woods.json+,
# flip the +latest+ pointer, then prune old dumps.
#
# The order matters: the dump is promoted only after the vector (and
# optional metadata) snapshots and the config have been written, and
# pruning runs last so it sees the new dump.
#
# @return [void]
def persist_snapshot
  # Loaded lazily so runs against persistent backends never pull in the
  # snapshot code.
  require_relative '../index_artifact'
  require_relative '../storage/snapshotter'

  artifact = IndexArtifact.new(@output_dir)
  dump_dir = artifact.new_dump_dir

  Storage::Snapshotter::Vector.dump(@vector_store, artifact, dump_dir)

  # nil.respond_to?(:each_entry) is false, so an absent metadata store
  # simply skips this step.
  if @metadata_store.respond_to?(:each_entry) && @metadata_store.respond_to?(:bulk_load)
    Storage::Snapshotter::Metadata.dump(@metadata_store, artifact, dump_dir)
  end

  artifact.write_config(@resolved_config) if @resolved_config

  artifact.promote(dump_dir)

  prune_old_dumps(artifact)
end
299
+
300
# Remove old dump directories beyond the retention window.
#
# Keeps the +@dump_retention_count+ most-recently-created directories
# (sorted by name, which is a UTC timestamp so lexicographic order equals
# chronological order). Pruning is skipped entirely when the retention
# count is nil or not positive.
#
# NOTE(review): nothing here looks up the +latest+ pointer; the promoted
# dump survives only because it is presumed to sort last by name.
#
# @param artifact [#dumps_root] Index artifact; +dumps_root+ is used as a Pathname
# @return [void]
def prune_old_dumps(artifact)
  return if @dump_retention_count.nil? || @dump_retention_count <= 0

  dumps_root = artifact.dumps_root
  return unless dumps_root.exist?

  dirs = sorted_dump_dirs(dumps_root)
  excess = dirs.length - @dump_retention_count
  return unless excess.positive?

  # Oldest first, so the leading +excess+ entries are the ones to drop.
  dirs.first(excess).each { |dir| FileUtils.rm_rf(dir) }
end
315
+
316
# List the real dump directories under +dumps_root+, oldest first.
#
# Symlinks are excluded: Pathname#directory? follows links, so a
# +latest+ symlink pointing at a dump would otherwise be counted as one
# more dump and shrink the effective retention window.
# NOTE(review): assumes the +latest+ pointer may be a symlink inside
# dumps_root — confirm against IndexArtifact#promote.
#
# @param dumps_root [Pathname] Directory containing the dump directories
# @return [Array<String>] Directory paths sorted by basename
def sorted_dump_dirs(dumps_root)
  dumps_root.children
            .select { |entry| entry.directory? && !entry.symlink? }
            .sort_by(&:basename)
            .map(&:to_s)
end
138
322
  end
139
323
  end
140
324
  end
@@ -24,6 +24,12 @@ module Woods
24
24
  'text-embedding-3-small' => 1536,
25
25
  'text-embedding-3-large' => 3072
26
26
  }.freeze
27
# OpenAI embedding models share an 8191-token input cap across
# text-embedding-3-small / -3-large / ada-002. The chunker uses
# this as a hard ceiling — the actual chunk size lands well
# below it once chars-per-token estimation and the prefix
# allowance are factored in (see Builder#build_chunker).
MAX_INPUT_TOKENS = 8191
27
33
 
28
34
  # @param api_key [String] OpenAI API key
29
35
  # @param model [String] OpenAI embedding model name (default: text-embedding-3-small)
@@ -37,7 +43,10 @@ module Woods
37
43
  # @param text [String] the text to embed
38
44
  # @return [Array<Float>] the embedding vector
39
45
  # @raise [Woods::Error] if the API returns an error
46
+ # @raise [ArgumentError] if the text is nil or empty (OpenAI rejects these with 400)
40
47
  def embed(text)
48
+ raise ArgumentError, 'embed(text) requires a non-empty string' if text.nil? || text.to_s.strip.empty?
49
+
41
50
  response = post_request({ model: @model, input: text })
42
51
  response['data'].first['embedding']
43
52
  end
@@ -49,7 +58,13 @@ module Woods
49
58
  # @param texts [Array<String>] the texts to embed
50
59
  # @return [Array<Array<Float>>] array of embedding vectors
51
60
  # @raise [Woods::Error] if the API returns an error
52
- def embed_batch(texts)
61
+ # @raise [ArgumentError] if the array is empty or any element is nil/empty
62
+ def embed_batch(texts) # rubocop:disable Metrics/CyclomaticComplexity
63
+ raise ArgumentError, 'embed_batch(texts) requires a non-empty array' if texts.nil? || texts.empty?
64
+ if texts.any? { |t| t.nil? || t.to_s.strip.empty? }
65
+ raise ArgumentError, 'embed_batch(texts) rejects nil/empty entries (OpenAI returns 400)'
66
+ end
67
+
53
68
  response = post_request({ model: @model, input: texts })
54
69
  response['data']
55
70
  .sort_by { |item| item['index'] }
@@ -73,14 +88,35 @@ module Woods
73
88
  @model
74
89
  end
75
90
 
91
# Maximum input length OpenAI will accept for a single embedding
# text, in tokens. Returns the class-level MAX_INPUT_TOKENS cap
# regardless of the configured model.
#
# @return [Integer]
def max_input_tokens
  MAX_INPUT_TOKENS
end
98
+
76
99
  private
77
100
 
101
# Cap response bodies before they are interpolated into error messages,
# so an oversized API error body (which occasionally echoes request
# metadata, including headers) cannot flood logs or re-raised messages.
#
# @param body [String, nil] Raw response body
# @return [String] The body unchanged when it is 500 characters or fewer;
#   otherwise its first 500 characters followed by "... [truncated]".
#   An empty string for nil.
def truncate_response_body(body)
  return '' if body.nil?

  text = body.to_s
  return text if text.length <= 500

  "#{text[0, 500]}... [truncated]"
end
113
+
78
114
  # Send a POST request to the OpenAI embeddings API.
79
115
  #
80
116
  # @param body [Hash] request body
81
117
  # @return [Hash] parsed JSON response
82
118
  # @raise [Woods::Error] if the API returns a non-success status
83
- def post_request(body)
119
+ def post_request(body) # rubocop:disable Metrics/AbcSize
84
120
  request = Net::HTTP::Post.new(ENDPOINT.path)
85
121
  request['Content-Type'] = 'application/json'
86
122
  request['Authorization'] = "Bearer #{@api_key}"
@@ -89,7 +125,7 @@ module Woods
89
125
  response = http_client.request(request)
90
126
 
91
127
  unless response.is_a?(Net::HTTPSuccess)
92
- raise Woods::Error, "OpenAI API error: #{response.code} #{response.body}"
128
+ raise Woods::Error, "OpenAI API error: #{response.code} #{truncate_response_body(response.body)}"
93
129
  end
94
130
 
95
131
  JSON.parse(response.body)
@@ -98,7 +134,7 @@ module Woods
98
134
  @http_client = nil
99
135
  response = http_client.request(request)
100
136
  unless response.is_a?(Net::HTTPSuccess)
101
- raise Woods::Error, "OpenAI API error: #{response.code} #{response.body}"
137
+ raise Woods::Error, "OpenAI API error: #{response.code} #{truncate_response_body(response.body)}"
102
138
  end
103
139
 
104
140
  JSON.parse(response.body)
@@ -49,6 +49,16 @@ module Woods
49
49
  def model_name
50
50
  raise NotImplementedError
51
51
  end
52
+
53
# Return the maximum input length the provider will accept for a
# single text, in tokens. Used by the indexer to decide when a unit
# must be chunked before embedding.
#
# The default is nil ("no hard cap") rather than a raise: the indexer
# probes with +respond_to?(:max_input_tokens)+, which is true for every
# provider that includes this interface, so raising here would abort
# indexing for providers written before this method existed.
# Providers with a real limit override this.
#
# @return [Integer, nil] token budget, or nil if the provider has no hard cap
def max_input_tokens
  nil
end
52
62
  end
53
63
 
54
64
  # Ollama adapter for local embeddings via the Ollama HTTP API.
@@ -66,11 +76,56 @@ module Woods
66
76
  DEFAULT_MODEL = 'nomic-embed-text'
67
77
  DEFAULT_HOST = 'http://localhost:11434'
68
78
 
69
- # @param model [String] Ollama model name (default: nomic-embed-text)
79
# Ollama enforces the model's native context length on `/api/embed`
# regardless of the `num_ctx` override — we've validated this
# against 0.15.x for nomic-embed-text (rejects >2048) and bge-m3
# (accepts up to 8192, silently truncates above). Advertise the
# native ceiling so the chunker can size inputs correctly. Models
# outside this registry fall back to Ollama's conservative 2048
# default.
#
# See `docs/EMBEDDING_MODELS.md` for the tradeoff matrix and
# instructions for adding a new model here.
MODEL_CONTEXT_LENGTHS = {
  'nomic-embed-text' => 2048,
  'bge-m3' => 8192,
  'mxbai-embed-large' => 512,
  'snowflake-arctic-embed' => 512,
  'snowflake-arctic-embed2' => 8192,
  # all-minilm: 512 is the model's context length, NOT the 384
  # embedding dimension and NOT the 256 some sources confuse with
  # the dimension. With a 256-token budget the chunker formula
  # produces a negative max_chars and silently drops every chunk.
  'all-minilm' => 512
}.freeze

# Fallback when the configured model isn't in the registry.
FALLBACK_NUM_CTX = 2048

# Default read timeout for /api/embed, in seconds. The previous 30s
# default was too short for batched embed calls on cold models — Ollama
# has to load the model on first call, and an N-item batch can
# easily exceed 30s on a CPU-only host. 120s leaves headroom
# without wedging the whole pipeline on a genuinely dead server.
DEFAULT_READ_TIMEOUT = 120
111
+
112
# @param model [String] Ollama model name (default: nomic-embed-text).
#   Set to `"bge-m3"` or `"snowflake-arctic-embed2"` for an 8192-token
#   context and skip most chunking for dense Rails units.
# @param host [String] Ollama server URL (default: http://localhost:11434)
# @param num_ctx [Integer, nil] Ollama context window in tokens. When
#   `nil` (the default), the provider picks the model's native
#   context from `MODEL_CONTEXT_LENGTHS`, falling back to 2048 for
#   unknown models. Set explicitly only if running a model with a
#   known-larger native context that isn't in the registry yet.
# @param read_timeout [Integer] HTTP read timeout in seconds.
#   Bump this for slow / cold-start hosts or very large batches.
def initialize(model: DEFAULT_MODEL, host: DEFAULT_HOST, num_ctx: nil,
               read_timeout: DEFAULT_READ_TIMEOUT)
  @model = model
  @host = host
  # Ollama model names may carry a tag ("bge-m3:latest",
  # "mxbai-embed-large:335m"). Try the exact name first, then the name
  # without its tag, so a tagged 512-token model is not given the larger
  # 2048 fallback.
  @num_ctx = num_ctx || MODEL_CONTEXT_LENGTHS.fetch(model) do
    MODEL_CONTEXT_LENGTHS.fetch(model.to_s.split(':', 2).first, FALLBACK_NUM_CTX)
  end
  @read_timeout = read_timeout
  @uri = URI("#{host}/api/embed")
end
76
131
 
@@ -79,8 +134,11 @@ module Woods
79
134
  # @param text [String] the text to embed
80
135
  # @return [Array<Float>] the embedding vector
81
136
  # @raise [Woods::Error] if the API returns an error
137
+ # @raise [ArgumentError] if the text is nil or empty (avoids provider 400)
82
138
  def embed(text)
83
- response = post_request({ model: @model, input: text })
139
+ raise ArgumentError, 'embed(text) requires a non-empty string' if text.nil? || text.to_s.strip.empty?
140
+
141
+ response = post_request(build_body(text))
84
142
  response['embeddings'].first
85
143
  end
86
144
 
@@ -89,8 +147,14 @@ module Woods
89
147
  # @param texts [Array<String>] the texts to embed
90
148
  # @return [Array<Array<Float>>] array of embedding vectors
91
149
  # @raise [Woods::Error] if the API returns an error
150
+ # @raise [ArgumentError] if the array is empty or any element is nil/empty
92
151
  def embed_batch(texts)
93
- response = post_request({ model: @model, input: texts })
152
+ raise ArgumentError, 'embed_batch(texts) requires a non-empty array' if texts.nil? || texts.empty?
153
+ if texts.any? { |t| t.nil? || t.to_s.strip.empty? }
154
+ raise ArgumentError, 'embed_batch(texts) rejects nil/empty entries'
155
+ end
156
+
157
+ response = post_request(build_body(texts))
94
158
  response['embeddings']
95
159
  end
96
160
 
@@ -110,20 +174,52 @@ module Woods
110
174
  @model
111
175
  end
112
176
 
177
# Maximum input length Ollama will accept — tracks the configured
# context window. Always populated: the constructor resolves
# `num_ctx` to the model's registry entry or {FALLBACK_NUM_CTX},
# so this method never returns nil for an Ollama provider.
#
# @return [Integer]
def max_input_tokens
  @num_ctx
end
186
+
113
187
  private
114
188
 
189
# Cap response bodies before they are interpolated into error messages,
# so an oversized Ollama response (e.g. a proxied HTML error page)
# cannot flood logs or re-raised error messages.
#
# @param body [String, nil] Raw response body
# @return [String] The body unchanged when it is 500 characters or fewer;
#   otherwise its first 500 characters followed by "... [truncated]".
#   An empty string for nil.
def truncate_response_body(body)
  return '' if body.nil?

  text = body.to_s
  return text if text.length <= 500

  "#{text[0, 500]}... [truncated]"
end
201
+
202
# Build the request body for an `/api/embed` call.
#
# Adds `options.num_ctx` whenever a context window is configured, so the
# request states the window explicitly instead of relying on the server
# default.
# NOTE(review): the registry comment on MODEL_CONTEXT_LENGTHS says Ollama
# enforces the model's native context regardless of this override —
# confirm whether sending it still changes server behaviour.
#
# @param input [String, Array<String>] Text or texts to embed
# @return [Hash] Body with :model, :input and, when configured, :options
def build_body(input)
  payload = { model: @model, input: input }
  payload[:options] = { num_ctx: @num_ctx } if @num_ctx
  payload
end
210
+
115
211
  # Send a POST request to the Ollama API.
116
212
  #
117
213
  # @param body [Hash] request body
118
214
  # @return [Hash] parsed JSON response
119
215
  # @raise [Woods::Error] if the API returns a non-success status
120
- def post_request(body)
216
+ def post_request(body) # rubocop:disable Metrics/AbcSize
121
217
  request = Net::HTTP::Post.new(@uri.path, 'Content-Type' => 'application/json')
122
218
  request.body = body.to_json
123
219
  response = http_client.request(request)
124
220
 
125
221
  unless response.is_a?(Net::HTTPSuccess)
126
- raise Woods::Error, "Ollama API error: #{response.code} #{response.body}"
222
+ raise Woods::Error, "Ollama API error: #{response.code} #{truncate_response_body(response.body)}"
127
223
  end
128
224
 
129
225
  JSON.parse(response.body)
@@ -136,7 +232,7 @@ module Woods
136
232
  raise Woods::Error, "Ollama API error (retry failed): #{retry_error.message}"
137
233
  end
138
234
  unless response.is_a?(Net::HTTPSuccess)
139
- raise Woods::Error, "Ollama API error: #{response.code} #{response.body}"
235
+ raise Woods::Error, "Ollama API error: #{response.code} #{truncate_response_body(response.body)}"
140
236
  end
141
237
 
142
238
  JSON.parse(response.body)
@@ -151,7 +247,7 @@ module Woods
151
247
  http = Net::HTTP.new(@uri.host, @uri.port)
152
248
  http.use_ssl = @uri.scheme == 'https'
153
249
  http.open_timeout = 10
154
- http.read_timeout = 30
250
+ http.read_timeout = @read_timeout
155
251
  http.keep_alive_timeout = 30
156
252
  http.start
157
253
  @http_client = http