woods 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +169 -0
- data/README.md +20 -8
- data/exe/woods-console +51 -6
- data/exe/woods-console-mcp +24 -4
- data/exe/woods-mcp +30 -7
- data/exe/woods-mcp-http +47 -6
- data/lib/generators/woods/install_generator.rb +13 -4
- data/lib/generators/woods/templates/woods.rb.tt +155 -0
- data/lib/tasks/woods.rake +15 -50
- data/lib/woods/builder.rb +174 -9
- data/lib/woods/cache/cache_middleware.rb +360 -31
- data/lib/woods/chunking/semantic_chunker.rb +334 -7
- data/lib/woods/console/adapters/job_adapter.rb +10 -4
- data/lib/woods/console/audit_logger.rb +76 -4
- data/lib/woods/console/bridge.rb +48 -15
- data/lib/woods/console/bridge_protocol.rb +44 -0
- data/lib/woods/console/confirmation.rb +3 -4
- data/lib/woods/console/console_response_renderer.rb +56 -18
- data/lib/woods/console/credential_index.rb +201 -0
- data/lib/woods/console/credential_scanner.rb +302 -0
- data/lib/woods/console/dispatch_pipeline.rb +138 -0
- data/lib/woods/console/embedded_executor.rb +682 -35
- data/lib/woods/console/eval_guard.rb +319 -0
- data/lib/woods/console/model_validator.rb +1 -3
- data/lib/woods/console/rack_middleware.rb +185 -29
- data/lib/woods/console/redactor.rb +161 -0
- data/lib/woods/console/response_context.rb +127 -0
- data/lib/woods/console/safe_context.rb +220 -23
- data/lib/woods/console/scope_predicate_parser.rb +131 -0
- data/lib/woods/console/server.rb +417 -486
- data/lib/woods/console/sql_noise_stripper.rb +87 -0
- data/lib/woods/console/sql_table_scanner.rb +213 -0
- data/lib/woods/console/sql_validator.rb +81 -31
- data/lib/woods/console/table_gate.rb +93 -0
- data/lib/woods/console/tool_specs.rb +552 -0
- data/lib/woods/console/tools/tier1.rb +3 -3
- data/lib/woods/console/tools/tier4.rb +7 -1
- data/lib/woods/dependency_graph.rb +66 -7
- data/lib/woods/embedding/indexer.rb +190 -6
- data/lib/woods/embedding/openai.rb +40 -4
- data/lib/woods/embedding/provider.rb +104 -8
- data/lib/woods/embedding/text_preparer.rb +23 -3
- data/lib/woods/embedding/token_counter.rb +133 -0
- data/lib/woods/evaluation/baseline_runner.rb +20 -2
- data/lib/woods/evaluation/metrics.rb +4 -1
- data/lib/woods/extracted_unit.rb +1 -0
- data/lib/woods/extractor.rb +7 -1
- data/lib/woods/extractors/controller_extractor.rb +6 -0
- data/lib/woods/extractors/mailer_extractor.rb +16 -2
- data/lib/woods/extractors/model_extractor.rb +6 -1
- data/lib/woods/extractors/phlex_extractor.rb +13 -4
- data/lib/woods/extractors/rails_source_extractor.rb +2 -0
- data/lib/woods/extractors/route_helper_resolver.rb +130 -0
- data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
- data/lib/woods/extractors/view_component_extractor.rb +12 -1
- data/lib/woods/extractors/view_engines/base.rb +141 -0
- data/lib/woods/extractors/view_engines/erb.rb +145 -0
- data/lib/woods/extractors/view_template_extractor.rb +92 -133
- data/lib/woods/flow_assembler.rb +23 -15
- data/lib/woods/flow_precomputer.rb +21 -2
- data/lib/woods/graph_analyzer.rb +3 -4
- data/lib/woods/index_artifact.rb +173 -0
- data/lib/woods/mcp/bearer_auth.rb +45 -0
- data/lib/woods/mcp/bootstrap_state.rb +94 -0
- data/lib/woods/mcp/bootstrapper.rb +337 -16
- data/lib/woods/mcp/config_resolver.rb +288 -0
- data/lib/woods/mcp/errors.rb +134 -0
- data/lib/woods/mcp/index_reader.rb +265 -30
- data/lib/woods/mcp/origin_guard.rb +132 -0
- data/lib/woods/mcp/provider_probe.rb +166 -0
- data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
- data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
- data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
- data/lib/woods/mcp/server.rb +737 -137
- data/lib/woods/model_name_cache.rb +78 -2
- data/lib/woods/notion/client.rb +25 -2
- data/lib/woods/notion/mappers/model_mapper.rb +36 -2
- data/lib/woods/railtie.rb +55 -15
- data/lib/woods/resilience/circuit_breaker.rb +9 -2
- data/lib/woods/resilience/retryable_provider.rb +40 -3
- data/lib/woods/resolved_config.rb +299 -0
- data/lib/woods/retrieval/context_assembler.rb +112 -5
- data/lib/woods/retrieval/query_classifier.rb +1 -1
- data/lib/woods/retrieval/ranker.rb +55 -6
- data/lib/woods/retrieval/search_executor.rb +42 -13
- data/lib/woods/retriever.rb +330 -24
- data/lib/woods/session_tracer/middleware.rb +35 -1
- data/lib/woods/storage/graph_store.rb +39 -0
- data/lib/woods/storage/inapplicable_backend.rb +14 -0
- data/lib/woods/storage/metadata_store.rb +129 -1
- data/lib/woods/storage/pgvector.rb +70 -8
- data/lib/woods/storage/qdrant.rb +196 -5
- data/lib/woods/storage/snapshotter/metadata.rb +172 -0
- data/lib/woods/storage/snapshotter/vector.rb +238 -0
- data/lib/woods/storage/snapshotter.rb +24 -0
- data/lib/woods/storage/vector_store.rb +184 -35
- data/lib/woods/tasks.rb +85 -0
- data/lib/woods/temporal/snapshot_store.rb +49 -1
- data/lib/woods/token_utils.rb +44 -5
- data/lib/woods/unblocked/client.rb +1 -1
- data/lib/woods/unblocked/document_builder.rb +35 -10
- data/lib/woods/unblocked/exporter.rb +1 -1
- data/lib/woods/util/host_guard.rb +61 -0
- data/lib/woods/version.rb +1 -1
- data/lib/woods.rb +126 -6
- metadata +69 -4
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'time'
|
|
4
|
+
|
|
5
|
+
module Woods
|
|
6
|
+
module MCP
|
|
7
|
+
# Tracks the lifecycle state of an MCP server bootstrap sequence.
|
|
8
|
+
#
|
|
9
|
+
# Status transitions flow forward: +initializing+ → +hydrating+ →
|
|
10
|
+
# +hydrated+ (success path), or +initializing+/+hydrating+ → +degraded+
|
|
11
|
+
# (provider unreachable) or +failed+ (config-invalid). States are mutated
|
|
12
|
+
# via {#mark} so the +woods_status+ MCP tool always reads consistent values.
|
|
13
|
+
#
|
|
14
|
+
# @example Bootstrapper usage
|
|
15
|
+
# state = Woods::MCP::BootstrapState.new
|
|
16
|
+
# state.mark(:hydrating)
|
|
17
|
+
# vector_store = Snapshotter::Vector.load_or_empty(artifact)
|
|
18
|
+
# state.mark(:hydrated)
|
|
19
|
+
# # or, on provider failure:
|
|
20
|
+
# state.mark(:degraded, reason: ProviderUnreachable.new("..."))
|
|
21
|
+
#
|
|
22
|
+
class BootstrapState
  # Every status the bootstrap lifecycle may report.
  VALID_STATUSES = %i[initializing hydrating hydrated degraded failed].freeze

  # @return [Symbol] current lifecycle status; starts as +:initializing+ and
  #   afterwards is always the last value accepted by {#mark}
  attr_reader :status

  # @return [Exception, nil] whatever was passed as +reason:+ to the most
  #   recent {#mark} call (nil when none was given)
  attr_reader :reason

  # @return [Time, nil] timestamp of the most recent transition to +:hydrated+
  attr_reader :hydrated_at

  # @return [Time, nil] timestamp of the most recent transition to +:degraded+
  attr_reader :degraded_since

  # @return [Woods::ResolvedConfig, nil] the resolved configuration attached
  #   by the bootstrapper; nil until assigned. Intended for +woods_status+ so
  #   it can report the provider/model actually in use rather than the
  #   defaults on {Woods.configuration}.
  attr_accessor :resolved_config

  # Start in +:initializing+ with no reason, timestamps, or resolved config.
  def initialize
    @status = :initializing
    @reason = @hydrated_at = @degraded_since = @resolved_config = nil
  end

  # Move to +new_status+.
  #
  # The stored reason is replaced on every call (so it resets to nil when
  # +reason:+ is omitted). +hydrated_at+ is stamped only when entering
  # +:hydrated+ and +degraded_since+ only when entering +:degraded+; any
  # other transition leaves both timestamps as they were.
  #
  # @param new_status [Symbol] target status (must be in {VALID_STATUSES})
  # @param reason [Exception, nil] causal exception, typically for
  #   +:degraded+ / +:failed+
  # @param now [Time] timestamp to record (default: current UTC time)
  # @return [self]
  # @raise [ArgumentError] when +new_status+ is not a recognised status
  def mark(new_status, reason: nil, now: Time.now.utc)
    unless VALID_STATUSES.include?(new_status)
      allowed = VALID_STATUSES.map(&:inspect).join(', ')
      raise ArgumentError, "Unknown status #{new_status.inspect}. Must be one of: #{allowed}"
    end

    @status = new_status
    @reason = reason

    if new_status == :hydrated
      @hydrated_at = now
    elsif new_status == :degraded
      @degraded_since = now
    end

    self
  end

  # Serialise the state for a +woods_status+ MCP response.
  #
  # +:status+ is always present. +:reason+ ("Class: message") and the two
  # ISO 8601 timestamps appear only when the corresponding value is set.
  #
  # @return [Hash]
  def to_h
    summary = { status: @status }
    summary[:reason] = "#{@reason.class}: #{@reason.message}" if @reason

    { hydrated_at: @hydrated_at, degraded_since: @degraded_since }.each do |key, stamp|
      summary[key] = stamp.iso8601 if stamp
    end

    summary
  end
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
@@ -1,5 +1,17 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
require_relative 'errors'
|
|
6
|
+
require_relative 'bootstrap_state'
|
|
7
|
+
require_relative 'config_resolver'
|
|
8
|
+
require_relative 'provider_probe'
|
|
9
|
+
require_relative '../index_artifact'
|
|
10
|
+
require_relative '../builder'
|
|
11
|
+
require_relative '../resolved_config'
|
|
12
|
+
require_relative '../storage/snapshotter'
|
|
13
|
+
require_relative '../storage/inapplicable_backend'
|
|
14
|
+
|
|
3
15
|
module Woods
|
|
4
16
|
module MCP
|
|
5
17
|
# Shared setup logic for MCP server executables.
|
|
@@ -68,29 +80,338 @@ module Woods
|
|
|
68
80
|
end
|
|
69
81
|
end
|
|
70
82
|
|
|
71
|
-
#
|
|
83
|
+
# Assemble the retriever that backs MCP semantic search, together with the
# {BootstrapState} describing how the bootstrap went.
#
# Steps performed here:
#   1. Create the state and mark it +:hydrating+.
#   2. Wrap the index directory in an IndexArtifact ({.build_artifact}).
#   3. Resolve the effective configuration through {ConfigResolver.resolve},
#      handing it {.ollama_reachable?} as the Ollama probe. Artifact and
#      autodetect rules live in ConfigResolver, not in this method.
#   4. When the resolved config names no embedding provider, stop and return
#      +[nil, state]+ (the state is left at +:hydrating+).
#   5. Build the ResolvedConfig, record it on the state, and build the
#      retriever from stores hydrated off disk. The host's
#      +Woods.configuration+ is only read, never assigned to.
#   6. Probe the provider; the state ends +:hydrated+ or +:degraded+. The
#      retriever is returned either way.
#
# Errors raised by the helpers (typed BootstrapError subclasses for invalid
# configuration) are not rescued here; callers are expected to handle them.
#
# @param index_dir [String, nil] Path to the extraction output directory.
#   When nil, uses Woods.configuration.output_dir.
# @return [Array(Woods::Retriever, Woods::MCP::BootstrapState)] the retriever
#   slot is nil when no embedding provider is configured
# @raise [Woods::MCP::BootstrapError] on invalid configuration
def self.build_retriever(index_dir: nil)
  state = BootstrapState.new.tap { |fresh_state| fresh_state.mark(:hydrating) }

  artifact = build_artifact(index_dir)
  config, = ConfigResolver.resolve(
    Woods.configuration,
    artifact: artifact,
    ollama_probe: method(:ollama_reachable?)
  )
  return [nil, state] unless config.embedding_provider

  # The ResolvedConfig is built before the stores so that store hydration can
  # validate the dump against it; it tolerates an unreachable provider so the
  # server can still come up degraded.
  resolved = build_resolved_config(config)
  state.resolved_config = resolved

  build_retriever_from_config(config, resolved, artifact).then do |retriever|
    probe_and_mark_state(config, state)
    warn "[woods-mcp] semantic search: #{state.status} (#{config.embedding_provider})"
    [retriever, state]
  end
end
|
|
136
|
+
|
|
137
|
+
# Convenience wrapper for callers that only need the retriever: runs
# {.build_retriever} and discards the BootstrapState. Nothing is rescued
# here, so typed BootstrapError failures propagate to the caller.
#
# @param index_dir [String, nil] forwarded unchanged to {.build_retriever}
# @return [Woods::Retriever, nil]
def self.build_retriever_compat(index_dir: nil)
  build_retriever(index_dir: index_dir).then { |retriever, _state| retriever }
end
|
|
144
|
+
|
|
145
|
+
# Refresh a live retriever's in-memory stores from the dumps currently on
# disk, so a fresh embed run can be picked up without restarting the
# process. The retriever object itself is kept (anything holding a
# reference keeps working) — only the stores it exposes are mutated.
#
# After the vector and metadata stores are refilled, the vector store's
# per-entry metadata is back-filled from the metadata store, the same way
# {.build_retriever_from_config} does at boot. The vector dump carries no
# per-vector metadata, so without this step a filtered search would come
# back empty after a reload.
#
# Returns all-zero stats without touching anything when +retriever+ is nil.
# Stores lacking the refill hooks (+clear!+/+bulk_load+ for vectors and
# metadata, +replace_graph+ for the graph) are left alone and report 0.
#
# @param retriever [Woods::Retriever, nil]
# @param index_dir [String, Pathname, nil] falls back to
#   +Woods.configuration.output_dir+ when nil
# @return [Hash] +{ vectors:, metadata:, graph: }+ — +vectors+ and
#   +metadata+ are the store counts after the refill; +graph+ is 1 when the
#   dependency graph was swapped in and 0 otherwise
# @raise [Woods::MCP::BootstrapError] surfaced from ConfigResolver / Snapshotter
def self.reload_stores!(retriever, index_dir:)
  return { vectors: 0, metadata: 0, graph: 0 } unless retriever

  artifact = build_artifact(index_dir)
  config, _source = ConfigResolver.resolve(Woods.configuration,
                                           artifact: artifact,
                                           ollama_probe: method(:ollama_reachable?))
  resolved = build_resolved_config(config)

  vectors_count = refill_in_memory_vector_store(retriever, config, resolved, artifact)
  metadata_count = refill_in_memory_metadata_store(retriever, config, resolved, artifact)
  graph_count = refill_in_memory_graph_store(retriever, config, artifact)

  # Mirror the boot path: the entries just bulk-loaded came from a dump
  # without per-vector metadata, so re-attach it from the (now refreshed)
  # metadata store. Gated on the same preconditions the hydration helpers
  # use, so durable backends are never touched.
  if artifact && resolved && config.vector_store == :in_memory && config.metadata_store == :in_memory
    live_vectors = retriever.respond_to?(:vector_store) ? retriever.vector_store : nil
    live_metadata = retriever.respond_to?(:metadata_store) ? retriever.metadata_store : nil
    populate_vector_metadata(live_vectors, live_metadata) if live_vectors && live_metadata
  end

  # Context-cache entries from the previous embed run no longer agree with
  # the refreshed stores, so drop them when the retriever supports it. Only
  # the context cache is invalidated here.
  retriever.invalidate_context_cache! if retriever.respond_to?(:invalidate_context_cache!)

  { vectors: vectors_count, metadata: metadata_count, graph: graph_count }
end
|
|
181
|
+
|
|
182
|
+
# Replace the contents of the retriever's live vector store with a freshly
# hydrated copy from disk.
#
# Only public readers/hooks are used: the store is fetched through
# +retriever.vector_store+ (when the retriever offers it) and is refilled
# only if it implements both +clear!+ and +bulk_load+. Stores without that
# pair — per the original notes, the durable backends — are skipped and
# report 0, as does a hydration that yields nothing.
#
# @return [Integer] the store's +count+ after the refill, or 0 when nothing
#   was refilled or the store has no +count+
def self.refill_in_memory_vector_store(retriever, config, resolved, artifact)
  live = retriever.vector_store if retriever.respond_to?(:vector_store)
  return 0 unless %i[clear! bulk_load].all? { |hook| live.respond_to?(hook) }

  fresh = hydrated_vector_store(config, resolved, artifact)
  return 0 unless fresh

  live.clear!
  rows = fresh.each_entry.map { |id, vec, meta| { id: id, vector: vec, metadata: meta } }
  live.bulk_load(rows)

  live.respond_to?(:count) ? live.count : 0
end
|
|
197
|
+
private_class_method :refill_in_memory_vector_store
|
|
198
|
+
|
|
199
|
+
# Replace the contents of the retriever's live metadata store with a
# freshly hydrated copy from disk.
#
# The store is fetched through +retriever.metadata_store+ (when offered)
# and refilled only if it implements both +clear!+ and +bulk_load+;
# otherwise, or when hydration yields nothing, the store is left untouched.
#
# @return [Integer] the store's +count+ after the refill, or 0 when nothing
#   was refilled or the store has no +count+
def self.refill_in_memory_metadata_store(retriever, config, resolved, artifact)
  live = retriever.metadata_store if retriever.respond_to?(:metadata_store)
  return 0 unless %i[clear! bulk_load].all? { |hook| live.respond_to?(hook) }

  fresh = hydrated_metadata_store(config, resolved, artifact)
  return 0 unless fresh

  live.clear!
  # The fresh store's entry enumerator is handed over as-is.
  live.bulk_load(fresh.each_entry)

  live.respond_to?(:count) ? live.count : 0
end
|
|
210
|
+
private_class_method :refill_in_memory_metadata_store
|
|
211
|
+
|
|
212
|
+
# Swap the dependency graph held by the retriever's graph store for one
# freshly loaded from disk.
#
# The graph store wrapper itself is kept and only its inner graph is
# replaced via +replace_graph+, so anything holding a reference to the
# wrapper sees the new graph. Stores without +replace_graph+ are skipped.
#
# @return [Integer] 1 when the graph was replaced, 0 when it was not (no
#   graph store, no +replace_graph+ hook, or nothing to hydrate). This is a
#   flag, not a node count.
def self.refill_in_memory_graph_store(retriever, config, artifact)
  wrapper = retriever.graph_store if retriever.respond_to?(:graph_store)
  return 0 unless wrapper.respond_to?(:replace_graph)

  replacement = hydrated_graph_store(config, artifact)
  if replacement.nil?
    0
  else
    wrapper.replace_graph(replacement.graph)
    1
  end
end
|
|
227
|
+
private_class_method :refill_in_memory_graph_store
|
|
228
|
+
|
|
229
|
+
# Report whether Ollama can be reached.
#
# This is a thin pass-through to {Woods::MCP::ConfigResolver}'s own
# +ollama_reachable?+, invoked dynamically so the call works regardless of
# that method's visibility. It exists as a separate method because
# {.build_retriever} and {.reload_stores!} hand it to ConfigResolver as the
# +ollama_probe:+ callable — stubbing this method therefore intercepts the
# Ollama check on that path.
#
# The original notes recommend {Woods::MCP::ProviderProbe.reachable!} for
# new code.
#
# @return [Boolean]
def self.ollama_reachable?
  ConfigResolver.__send__(:ollama_reachable?)
end
|
|
243
|
+
|
|
244
|
+
# Produce the ResolvedConfig for this boot or reload.
#
# First attempt: build the embedding provider and pass it to
# +ResolvedConfig.from_configuration+ (per the original notes, so the
# provider can be asked for its dimension). If anything in that attempt
# raises a StandardError — for example the provider cannot be reached —
# fall back to building the ResolvedConfig from the configuration alone so
# the MCP server can still come up degraded.
#
# @param config [Woods::Configuration]
# @return [Woods::ResolvedConfig]
def self.build_resolved_config(config)
  Woods::Builder.new(config).build_embedding_provider.then do |embedder|
    ResolvedConfig.from_configuration(config, provider: embedder)
  end
rescue StandardError
  # Deliberately best-effort: retry with the declared-only configuration.
  ResolvedConfig.from_configuration(config)
end
|
|
254
|
+
private_class_method :build_resolved_config
|
|
255
|
+
|
|
256
|
+
# Wrap the index directory in an IndexArtifact.
#
# @param index_dir [String, Pathname, nil] explicit directory; when nil the
#   host's +Woods.configuration.output_dir+ is used instead
# @return [Woods::IndexArtifact, nil] nil when neither source yields a directory
def self.build_artifact(index_dir)
  location = index_dir || Woods.configuration.output_dir
  return nil unless location

  IndexArtifact.new(location)
end
|
|
261
|
+
private_class_method :build_artifact
|
|
262
|
+
|
|
263
|
+
# Hydrate the three stores from disk and hand them to the Builder to
# construct the retriever. A store that could not be hydrated is passed as
# nil.
#
# @param config [Woods::Configuration]
# @param resolved [Woods::ResolvedConfig]
# @param artifact [Woods::IndexArtifact, nil]
# @return [Woods::Retriever]
def self.build_retriever_from_config(config, resolved, artifact)
  stores = {
    vector_store: hydrated_vector_store(config, resolved, artifact),
    metadata_store: hydrated_metadata_store(config, resolved, artifact),
    graph_store: hydrated_graph_store(config, artifact)
  }

  # The WVF1 vector dump holds only ids and float blobs — no per-vector
  # metadata hash — while the metadata itself lives in metadata.msgpack.
  # The in-memory vector store filters on its per-entry metadata, so copy it
  # across from the metadata store; otherwise a type-filtered search returns
  # zero results after a dump/reload.
  vectors, metadata = stores.values_at(:vector_store, :metadata_store)
  populate_vector_metadata(vectors, metadata) if vectors && metadata

  Woods::Builder.new(config).build_retriever(**stores)
end
|
|
282
|
+
private_class_method :build_retriever_from_config
|
|
283
|
+
|
|
284
|
+
# Hydrate an in-memory graph store from +dependency_graph.json+ on disk.
|
|
285
|
+
#
|
|
286
|
+
# The Indexer doesn't populate the graph store — it accepts the kwarg,
|
|
287
|
+
# dumps it empty at end of run, and moves on. But +dependency_graph.json+
|
|
288
|
+
# is already written at extraction time with the full graph. Loading it
|
|
289
|
+
# here via {DependencyGraph.from_h} turns the retriever's +:hybrid+
|
|
290
|
+
# strategy from a silent no-op (empty graph → no graph expansion) into
|
|
291
|
+
# a working graph-expansion source.
|
|
292
|
+
#
|
|
293
|
+
# Contract: this path assumes an ephemeral store. A durable backend
|
|
294
|
+
# (adapter reports +durable? => true+) owns its own persistence and
|
|
295
|
+
# must be populated via the extraction/embed write path — rebuilding
|
|
296
|
+
# it from +dependency_graph.json+ on every boot would stomp state the
|
|
297
|
+
# durable store is supposed to preserve. When a future adapter declares
|
|
298
|
+
# itself durable, this method raises {Woods::Storage::InapplicableBackend}
|
|
299
|
+
# so the contributor adding the adapter sees the contract violation at
|
|
300
|
+
# boot rather than shipping a store that silently disagrees with
|
|
301
|
+
# +dependency_graph.json+. Mirrors the pattern +Snapshotter::Vector.dump+
|
|
302
|
+
# uses for pgvector / Qdrant.
|
|
72
303
|
#
|
|
73
|
-
#
|
|
74
|
-
#
|
|
304
|
+
# Returns nil when the artifact or dependency graph file is absent so
|
|
305
|
+
# Builder falls back to a fresh empty store — the pre-fix behaviour for
|
|
306
|
+
# hosts that haven't run an extraction yet.
|
|
75
307
|
#
|
|
76
|
-
# @
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
308
|
+
# @param config [Woods::Configuration]
|
|
309
|
+
# @param artifact [Woods::IndexArtifact, nil]
|
|
310
|
+
# @return [Woods::Storage::GraphStore::Memory, nil]
|
|
311
|
+
# @raise [Woods::Storage::InapplicableBackend] if the configured
|
|
312
|
+
# graph_store reports +durable? => true+
|
|
313
|
+
def self.hydrated_graph_store(config, artifact)
  return nil unless artifact

  graph_file = artifact.output_dir.join('dependency_graph.json')
  return nil unless graph_file.exist?

  # Loaded lazily: only needed once a graph file is actually present.
  require_relative '../dependency_graph'
  require_relative '../storage/graph_store'

  # A store is built here purely to ask the configured backend whether it is
  # durable; a durable backend must not be rebuilt from the JSON file.
  if Woods::Builder.new(config).build_graph_store.durable?
    raise Woods::Storage::InapplicableBackend,
          "graph_store=#{config.graph_store.inspect} reports durable? => true; " \
          'boot rehydration from dependency_graph.json is only valid for ephemeral ' \
          'stores. Populate the durable backend via the extraction write path instead.'
  end

  parsed = JSON.parse(graph_file.read)
  Woods::Storage::GraphStore::Memory.new(Woods::DependencyGraph.from_h(parsed))
rescue Woods::Storage::InapplicableBackend
  # Contract violations must reach the caller rather than be downgraded to
  # the warning below.
  raise
rescue StandardError => e
  # Anything else (unreadable file, malformed JSON, ...) is non-fatal.
  warn "[woods-mcp] graph hydration failed (#{e.class}: #{e.message}); starting with empty store"
  nil
end
|
|
338
|
+
private_class_method :hydrated_graph_store
|
|
339
|
+
|
|
340
|
+
# Suffix the Indexer appends when a single unit is split into
|
|
341
|
+
# multiple embedding vectors — see
|
|
342
|
+
# {Embedding::Indexer#collect_embed_items}. Vector rows are keyed
|
|
343
|
+
# per-chunk (+Foo#chunk_0+) but metadata is keyed by the base
|
|
344
|
+
# identifier (+Foo+), so hydration strips the suffix before the
|
|
345
|
+
# lookup. Mirrors the pattern in {Retriever} and
|
|
346
|
+
# {Retrieval::ContextAssembler}.
|
|
347
|
+
CHUNK_SUFFIX_PATTERN = /#chunk_\d+\z/
|
|
348
|
+
private_constant :CHUNK_SUFFIX_PATTERN
|
|
349
|
+
|
|
350
|
+
# Copy per-unit metadata from the metadata store onto every entry of the
# vector store.
#
# Does nothing unless the vector store supports +each_entry+ and +store+
# and the metadata store supports +find+. Each vector id has its chunk
# suffix (see CHUNK_SUFFIX_PATTERN) stripped before the lookup, because
# vectors are keyed per chunk while metadata is keyed by the base
# identifier. Entries whose lookup yields nil or an empty value are left
# as they are.
#
# @param vector_store [#each_entry, #store]
# @param metadata_store [#find]
# @return [Array, nil] the (id, vector) pairs visited, or nil when skipped
def self.populate_vector_metadata(vector_store, metadata_store)
  writable = vector_store.respond_to?(:each_entry) && vector_store.respond_to?(:store)
  return unless writable
  return unless metadata_store.respond_to?(:find)

  # Snapshot the (id, vector) pairs before writing, so the store is not
  # iterated while it is being written to. Re-storing under the same id
  # overwrites the entry in place with the looked-up metadata.
  snapshot = vector_store.each_entry.map { |id, vec, _meta| [id, vec] }
  snapshot.each do |entry_id, embedding|
    base_id = entry_id.to_s.sub(CHUNK_SUFFIX_PATTERN, '')
    unit_meta = metadata_store.find(base_id)
    blank = unit_meta.nil? || (unit_meta.respond_to?(:empty?) && unit_meta.empty?)

    vector_store.store(entry_id, embedding, unit_meta) unless blank
  end
end
|
|
370
|
+
private_class_method :populate_vector_metadata
|
|
88
371
|
|
|
89
|
-
|
|
372
|
+
# Load the vector store from its on-disk dump when that applies.
#
# Hydration is attempted only when an artifact and a resolved config are
# both present and the configured vector store is +:in_memory+; in every
# other case the result is nil, which leaves the Builder to construct its
# own store.
#
# @param config [Woods::Configuration]
# @param resolved [Woods::ResolvedConfig, nil]
# @param artifact [Woods::IndexArtifact, nil]
# @return [Object, nil] the store returned by +Snapshotter::Vector.load_or_empty+,
#   or nil when hydration does not apply or failed non-fatally
# @raise [Woods::MCP::BootstrapError, ArgumentError] re-raised untouched
def self.hydrated_vector_store(config, resolved, artifact)
  return nil unless artifact && resolved && config.vector_store == :in_memory

  Woods::Storage::Snapshotter::Vector.load_or_empty(artifact, resolved_config: resolved)
rescue Woods::MCP::BootstrapError, ArgumentError
  # Treated as invalid configuration (or a programming error) rather than a
  # transient I/O problem, so these propagate to the operator.
  raise
rescue StandardError => e
  # Any other failure is non-fatal: warn and fall back to no hydrated store.
  warn "[woods-mcp] vector hydration failed (#{e.class}: #{e.message}); starting with empty store"
  nil
end
|
|
390
|
+
private_class_method :hydrated_vector_store
|
|
391
|
+
|
|
392
|
+
# Load the metadata store from its on-disk dump when that applies.
#
# Hydration is attempted only when an artifact and a resolved config are
# both present and the configured metadata store is +:in_memory+;
# otherwise the result is nil.
#
# @param config [Woods::Configuration]
# @param resolved [Woods::ResolvedConfig, nil]
# @param artifact [Woods::IndexArtifact, nil]
# @return [Object, nil] the store returned by
#   +Snapshotter::Metadata.load_or_empty+, or nil when hydration does not
#   apply or failed non-fatally
# @raise [Woods::MCP::BootstrapError, ArgumentError] re-raised untouched
def self.hydrated_metadata_store(config, resolved, artifact)
  return nil unless artifact && resolved && config.metadata_store == :in_memory

  Woods::Storage::Snapshotter::Metadata.load_or_empty(artifact, resolved_config: resolved)
rescue Woods::MCP::BootstrapError, ArgumentError
  # These propagate instead of being downgraded to the warning below.
  raise
rescue StandardError => e
  # Any other failure is non-fatal: warn and fall back to no hydrated store.
  warn "[woods-mcp] metadata hydration failed (#{e.class}: #{e.message}); starting with empty store"
  nil
end
|
|
403
|
+
private_class_method :hydrated_metadata_store
|
|
404
|
+
|
|
405
|
+
# Probe the embedding provider once and record the outcome on +state+.
#
# A successful probe marks the state +:hydrated+. A ProviderUnreachable
# failure marks it +:degraded+ (keeping the exception as the reason) and
# emits a warning; any other exception propagates to the caller.
#
# @param config [Woods::Configuration]
# @param state [Woods::MCP::BootstrapState]
# @return [Woods::MCP::BootstrapState, nil] the state on success; nil after
#   the degraded warning
def self.probe_and_mark_state(config, state)
  ProviderProbe.reachable!(Woods::Builder.new(config).build_embedding_provider)
  state.mark(:hydrated)
rescue ProviderUnreachable => e
  state.mark(:degraded, reason: e)
  warn "[woods-mcp] provider unreachable at boot: #{e.url} (#{e.reason}); " \
       'starting degraded — will retry on first query'
end
|
|
414
|
+
private_class_method :probe_and_mark_state
|
|
94
415
|
end
|
|
95
416
|
end
|
|
96
417
|
end
|