woods 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +169 -0
  3. data/README.md +20 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +15 -50
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +6 -0
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +3 -4
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +737 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +1 -1
  102. data/lib/woods/unblocked/document_builder.rb +35 -10
  103. data/lib/woods/unblocked/exporter.rb +1 -1
  104. data/lib/woods/util/host_guard.rb +61 -0
  105. data/lib/woods/version.rb +1 -1
  106. data/lib/woods.rb +126 -6
  107. metadata +69 -4
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'time'
4
+
5
+ module Woods
6
+ module MCP
7
+ # Tracks the lifecycle state of an MCP server bootstrap sequence.
8
+ #
9
+ # Status transitions flow forward: +initializing+ → +hydrating+ →
10
+ # +hydrated+ (success path), or +initializing+/+hydrating+ → +degraded+
11
+ # (provider unreachable) or +failed+ (config-invalid). States are mutated
12
+ # via {#mark} so the +woods_status+ MCP tool always reads consistent values.
13
+ #
14
+ # @example Bootstrapper usage
15
+ # state = Woods::MCP::BootstrapState.new
16
+ # state.mark(:hydrating)
17
+ # vector_store = Snapshotter::Vector.load_or_empty(artifact)
18
+ # state.mark(:hydrated)
19
+ # # or, on provider failure:
20
+ # state.mark(:degraded, reason: ProviderUnreachable.new("..."))
21
+ #
22
+ class BootstrapState
23
+ VALID_STATUSES = %i[initializing hydrating hydrated degraded failed].freeze
24
+
25
+ # @return [Symbol] one of +:initializing+, +:hydrating+, +:hydrated+,
26
+ # +:degraded+, +:failed+
27
+ attr_reader :status
28
+
29
+ # @return [Exception, nil] the exception that caused degradation or failure
30
+ attr_reader :reason
31
+
32
+ # @return [Time, nil] set when status transitions to +:hydrated+
33
+ attr_reader :hydrated_at
34
+
35
+ # @return [Time, nil] set when status transitions to +:degraded+
36
+ attr_reader :degraded_since
37
+
38
+ # @return [Woods::ResolvedConfig, nil] captured at embed time and read
39
+ # back from +woods.json+ during boot. Used by +woods_status+ to report
40
+ # the provider/model actually in play instead of the stale defaults on
41
+ # {Woods.configuration}.
42
+ attr_accessor :resolved_config
43
+
44
+ def initialize
45
+ @status = :initializing
46
+ @reason = nil
47
+ @hydrated_at = nil
48
+ @degraded_since = nil
49
+ @resolved_config = nil
50
+ end
51
+
52
+ # Transition to a new status.
53
+ #
54
+ # +hydrated_at+ is recorded on +:hydrated+; +degraded_since+ is recorded
55
+ # on +:degraded+. +reason+ is overwritten on every call (nil unless +reason:+ is given).
56
+ #
57
+ # @param new_status [Symbol] target status (must be in {VALID_STATUSES})
58
+ # @param reason [Exception, nil] causal exception for degraded/failed states
59
+ # @param now [Time] timestamp for the transition (default: UTC now)
60
+ # @return [self]
61
+ # @raise [ArgumentError] when +new_status+ is not a recognised status
62
+ def mark(new_status, reason: nil, now: Time.now.utc)
63
+ unless VALID_STATUSES.include?(new_status)
64
+ raise ArgumentError,
65
+ "Unknown status #{new_status.inspect}. " \
66
+ "Must be one of: #{VALID_STATUSES.map(&:inspect).join(', ')}"
67
+ end
68
+
69
+ @status = new_status
70
+ @reason = reason
71
+
72
+ case new_status
73
+ when :hydrated
74
+ @hydrated_at = now
75
+ when :degraded
76
+ @degraded_since = now
77
+ end
78
+
79
+ self
80
+ end
81
+
82
+ # Returns a hash suitable for embedding in a +woods_status+ MCP response.
83
+ #
84
+ # @return [Hash]
85
+ def to_h
86
+ h = { status: @status }
87
+ h[:reason] = "#{@reason.class}: #{@reason.message}" if @reason
88
+ h[:hydrated_at] = @hydrated_at.iso8601 if @hydrated_at
89
+ h[:degraded_since] = @degraded_since.iso8601 if @degraded_since
90
+ h
91
+ end
92
+ end
93
+ end
94
+ end
@@ -1,5 +1,17 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'json'
4
+
5
+ require_relative 'errors'
6
+ require_relative 'bootstrap_state'
7
+ require_relative 'config_resolver'
8
+ require_relative 'provider_probe'
9
+ require_relative '../index_artifact'
10
+ require_relative '../builder'
11
+ require_relative '../resolved_config'
12
+ require_relative '../storage/snapshotter'
13
+ require_relative '../storage/inapplicable_backend'
14
+
3
15
  module Woods
4
16
  module MCP
5
17
  # Shared setup logic for MCP server executables.
@@ -68,29 +80,338 @@ module Woods
68
80
  end
69
81
  end
70
82
 
71
- # Attempt to build a retriever for semantic search.
83
+ # Build a retriever for MCP semantic search.
84
+ #
85
+ # Flow:
86
+ # 1. Wrap output_dir in an IndexArtifact (owns path semantics).
87
+ # 2. If woods.json is present, resolve config from it; otherwise
88
+ # either raise MissingArtifact or, if WOODS_ALLOW_AUTODETECT=1,
89
+ # fall back to env-var auto-detect (deprecated path).
90
+ # 3. Build provider + stores from config (no mutation of
91
+ # Woods.configuration — the host's initializer stays intact).
92
+ # 4. Hydrate in-memory stores from dumps (stubs in PR 2; real in PR 3).
93
+ # 5. Probe the provider. If reachable, state :hydrated. If unreachable,
94
+ # state :degraded — retriever is still returned, queries will
95
+ # retry on first use.
96
+ #
97
+ # Config-invalid failures raise typed BootstrapError subclasses;
98
+ # exe/woods-mcp's top-level catches them and prints a one-line
99
+ # operator message. Dependency-unreachable failures start degraded
100
+ # and surface via woods_status.
101
+ #
102
+ # @param index_dir [String, nil] Path to the extraction output directory.
103
+ # When nil, uses Woods.configuration.output_dir.
104
+ # @return [Array(Woods::Retriever, Woods::MCP::BootstrapState)] the retriever is nil when the resolved config has no embedding provider
105
+ # @raise [Woods::MCP::BootstrapError] on config-invalid (missing
106
+ # credentials, dimension mismatch, unsupported artifact, missing
107
+ # artifact with autodetect off).
108
+ def self.build_retriever(index_dir: nil)
109
+ state = BootstrapState.new
110
+ state.mark(:hydrating)
111
+
112
+ artifact = build_artifact(index_dir)
113
+ config, _source = ConfigResolver.resolve(Woods.configuration,
114
+ artifact: artifact,
115
+ ollama_probe: method(:ollama_reachable?))
116
+ return [nil, state] unless config.embedding_provider
117
+
118
+ # Build the provider once so {ResolvedConfig.from_configuration} can
119
+ # probe +provider.dimensions+ — without this, Ollama's runtime-only
120
+ # dimension never makes it into +resolved+ and the downstream
121
+ # Snapshotter.load_or_empty validation compares stored-vs-0.
122
+ #
123
+ # The probe is tolerant: if the provider is unreachable we still
124
+ # need a non-nil +resolved+ so the MCP server can start degraded
125
+ # (see the "provider unreachable" branch below). Snapshotter then
126
+ # surfaces a DimensionMismatch only if there's actually a stored
127
+ # artifact to validate against.
128
+ resolved = build_resolved_config(config)
129
+ state.resolved_config = resolved
130
+ retriever = build_retriever_from_config(config, resolved, artifact)
131
+ probe_and_mark_state(config, state)
132
+ warn "[woods-mcp] semantic search: #{state.status} (#{config.embedding_provider})"
133
+
134
+ [retriever, state]
135
+ end
136
+
137
+ # Backwards-compatible wrapper — existing callers (exe/woods-mcp and
138
+ # exe/woods-mcp-http) just want the retriever. They rescue typed
139
+ # BootstrapError at their own top level; we do not catch here.
140
+ def self.build_retriever_compat(index_dir: nil)
141
+ retriever, _state = build_retriever(index_dir: index_dir)
142
+ retriever
143
+ end
144
+
145
+ # Refresh a live retriever's in-memory stores from the latest dumps on
146
+ # disk. Used by the MCP +reload+ tool so agents can pick up a fresh embed
147
+ # run without restarting the process. The retriever instance is preserved
148
+ # (tool closures kept their reference) — only the stores are mutated.
149
+ #
150
+ # No-op when:
151
+ # - +retriever+ is nil (no embedding provider configured)
152
+ # - stores are durable (pgvector / Qdrant auto-refresh externally)
153
+ # - +woods.json+ is absent (Shape-1 deployments don't use Snapshotter)
154
+ #
155
+ # @param retriever [Woods::Retriever, nil]
156
+ # @param index_dir [String, Pathname]
157
+ # @return [Hash] Stats — +{ vectors:, metadata:, graph: }+; vectors/metadata are record counts, graph is 1 when the graph was replaced (otherwise 0)
158
+ # @raise [Woods::MCP::BootstrapError] surfaced from ConfigResolver / Snapshotter
159
+ def self.reload_stores!(retriever, index_dir:)
160
+ return { vectors: 0, metadata: 0, graph: 0 } unless retriever
161
+
162
+ artifact = build_artifact(index_dir)
163
+ config, _source = ConfigResolver.resolve(Woods.configuration,
164
+ artifact: artifact,
165
+ ollama_probe: method(:ollama_reachable?))
166
+ resolved = build_resolved_config(config)
167
+
168
+ vectors_count = refill_in_memory_vector_store(retriever, config, resolved, artifact)
169
+ metadata_count = refill_in_memory_metadata_store(retriever, config, resolved, artifact)
170
+ graph_count = refill_in_memory_graph_store(retriever, config, artifact)
171
+
172
+ # Context-cache entries from the previous embed run no longer agree
173
+ # with the refreshed stores. Drop them so the next codebase_retrieve
174
+ # call goes through the full pipeline with the new data. Embedding
175
+ # caches (query → vector) survive — that mapping is deterministic
176
+ # for a given provider+model.
177
+ retriever.invalidate_context_cache! if retriever.respond_to?(:invalidate_context_cache!)
178
+
179
+ { vectors: vectors_count, metadata: metadata_count, graph: graph_count }
180
+ end
181
+
182
+ # Retriever (and CachedRetriever) expose public +vector_store+ /
183
+ # +metadata_store+ / +graph_store+ readers so this helper never pokes
184
+ # private state. Durable backends don't implement +clear!+/+bulk_load+
185
+ # — they return 0 silently because they're already refreshed externally.
186
+ def self.refill_in_memory_vector_store(retriever, config, resolved, artifact)
187
+ vs = retriever.respond_to?(:vector_store) ? retriever.vector_store : nil
188
+ return 0 unless vs.respond_to?(:clear!) && vs.respond_to?(:bulk_load)
189
+
190
+ fresh = hydrated_vector_store(config, resolved, artifact)
191
+ return 0 unless fresh
192
+
193
+ vs.clear!
194
+ vs.bulk_load(fresh.each_entry.map { |id, vec, meta| { id: id, vector: vec, metadata: meta } })
195
+ vs.respond_to?(:count) ? vs.count : 0
196
+ end
197
+ private_class_method :refill_in_memory_vector_store
198
+
199
+ def self.refill_in_memory_metadata_store(retriever, config, resolved, artifact)
200
+ ms = retriever.respond_to?(:metadata_store) ? retriever.metadata_store : nil
201
+ return 0 unless ms.respond_to?(:clear!) && ms.respond_to?(:bulk_load)
202
+
203
+ fresh = hydrated_metadata_store(config, resolved, artifact)
204
+ return 0 unless fresh
205
+
206
+ ms.clear!
207
+ ms.bulk_load(fresh.each_entry)
208
+ ms.respond_to?(:count) ? ms.count : 0
209
+ end
210
+ private_class_method :refill_in_memory_metadata_store
211
+
212
+ # GraphStore::Memory doesn't expose a +clear!+/+bulk_load+ pair today
213
+ # — a fresh run hands it an entirely new DependencyGraph from disk.
214
+ # Swap the inner graph via +replace_graph+ so SearchExecutor / Ranker /
215
+ # MCP tools keep their references to the same wrapper and see the new
216
+ # graph (no closure references break).
217
+ def self.refill_in_memory_graph_store(retriever, config, artifact)
218
+ gs = retriever.respond_to?(:graph_store) ? retriever.graph_store : nil
219
+ return 0 unless gs.respond_to?(:replace_graph)
220
+
221
+ fresh = hydrated_graph_store(config, artifact)
222
+ return 0 if fresh.nil?
223
+
224
+ gs.replace_graph(fresh.graph)
225
+ 1
226
+ end
227
+ private_class_method :refill_in_memory_graph_store
228
+
229
+ # Check whether Ollama is reachable at the configured base URL.
230
+ #
231
+ # Kept for backwards compatibility with existing specs. Delegates to
232
+ # {Woods::MCP::ConfigResolver} and is passed as the +ollama_probe:+
233
+ # callable in {.build_retriever} so that specs stubbing this method
234
+ # continue to intercept Ollama checks in the autodetect path.
235
+ #
236
+ # New code should use {Woods::MCP::ProviderProbe.reachable!} via the
237
+ # ResolvedConfig flow.
238
+ #
239
+ # @return [Boolean]
240
+ def self.ollama_reachable?
241
+ ConfigResolver.send(:ollama_reachable?)
242
+ end
243
+
244
+ # Build a ResolvedConfig from the live host config, probing the
245
+ # provider for its dimension when possible. A provider that can't
246
+ # be reached (Ollama down) falls back to the declared-only path so
247
+ # the MCP server can still come up degraded.
248
+ def self.build_resolved_config(config)
249
+ provider = Woods::Builder.new(config).build_embedding_provider
250
+ ResolvedConfig.from_configuration(config, provider: provider)
251
+ rescue StandardError
252
+ ResolvedConfig.from_configuration(config)
253
+ end
254
+ private_class_method :build_resolved_config
255
+
256
+ # Resolve an IndexArtifact from the passed dir or Woods.configuration.
257
+ def self.build_artifact(index_dir)
258
+ dir = index_dir || Woods.configuration.output_dir
259
+ IndexArtifact.new(dir) if dir
260
+ end
261
+ private_class_method :build_artifact
262
+
263
+ def self.build_retriever_from_config(config, resolved, artifact)
264
+ vector_store = hydrated_vector_store(config, resolved, artifact)
265
+ metadata_store = hydrated_metadata_store(config, resolved, artifact)
266
+ graph_store = hydrated_graph_store(config, artifact)
267
+
268
+ # Cross-populate the vector store's per-entry metadata cache from
269
+ # the metadata store. The WVF1 binary format stores only id + float
270
+ # blob (no per-vector hash) — it's numeric-only to keep dumps
271
+ # mmap-friendly. The metadata lives in metadata.msgpack, and
272
+ # InMemory::VectorStore#search uses its per-entry metadata for
273
+ # filter predicates. Without this back-fill, a type-filtered
274
+ # search returns zero results after a dump/reload.
275
+ populate_vector_metadata(vector_store, metadata_store) if vector_store && metadata_store
276
+
277
+ Woods::Builder.new(config).build_retriever(
278
+ vector_store: vector_store, metadata_store: metadata_store,
279
+ graph_store: graph_store
280
+ )
281
+ end
282
+ private_class_method :build_retriever_from_config
283
+
284
+ # Hydrate an in-memory graph store from +dependency_graph.json+ on disk.
285
+ #
286
+ # The Indexer doesn't populate the graph store — it accepts the kwarg,
287
+ # dumps it empty at end of run, and moves on. But +dependency_graph.json+
288
+ # is already written at extraction time with the full graph. Loading it
289
+ # here via {DependencyGraph.from_h} turns the retriever's +:hybrid+
290
+ # strategy from a silent no-op (empty graph → no graph expansion) into
291
+ # a working graph-expansion source.
292
+ #
293
+ # Contract: this path assumes an ephemeral store. A durable backend
294
+ # (adapter reports +durable? => true+) owns its own persistence and
295
+ # must be populated via the extraction/embed write path — rebuilding
296
+ # it from +dependency_graph.json+ on every boot would stomp state the
297
+ # durable store is supposed to preserve. When a future adapter declares
298
+ # itself durable, this method raises {Woods::Storage::InapplicableBackend}
299
+ # so the contributor adding the adapter sees the contract violation at
300
+ # boot rather than shipping a store that silently disagrees with
301
+ # +dependency_graph.json+. Mirrors the pattern +Snapshotter::Vector.dump+
302
+ # uses for pgvector / Qdrant.
72
303
  #
73
- # Auto-configures from environment variables when no explicit configuration
74
- # exists. Returns nil if embedding is unavailable or setup fails.
304
+ # Returns nil when the artifact or dependency graph file is absent so
305
+ # Builder falls back to a fresh empty store — the pre-fix behaviour for
306
+ # hosts that haven't run an extraction yet.
75
307
  #
76
- # @return [Woods::Retriever, nil]
77
- def self.build_retriever
78
- config = Woods.configuration
79
-
80
- openai_key = ENV.fetch('OPENAI_API_KEY', nil)
81
- if !config.embedding_provider && openai_key
82
- config.vector_store = :in_memory
83
- config.metadata_store = :in_memory
84
- config.graph_store = :in_memory
85
- config.embedding_provider = :openai
86
- config.embedding_options = { api_key: openai_key }
308
+ # @param config [Woods::Configuration]
309
+ # @param artifact [Woods::IndexArtifact, nil]
310
+ # @return [Woods::Storage::GraphStore::Memory, nil]
311
+ # @raise [Woods::Storage::InapplicableBackend] if the configured
312
+ # graph_store reports +durable? => true+
313
+ def self.hydrated_graph_store(config, artifact)
314
+ return nil unless artifact
315
+
316
+ graph_json = artifact.output_dir.join('dependency_graph.json')
317
+ return nil unless graph_json.exist?
318
+
319
+ require_relative '../dependency_graph'
320
+ require_relative '../storage/graph_store'
321
+
322
+ probe = Woods::Builder.new(config).build_graph_store
323
+ if probe.durable?
324
+ raise Woods::Storage::InapplicableBackend,
325
+ "graph_store=#{config.graph_store.inspect} reports durable? => true; " \
326
+ 'boot rehydration from dependency_graph.json is only valid for ephemeral ' \
327
+ 'stores. Populate the durable backend via the extraction write path instead.'
328
+ end
329
+
330
+ graph = Woods::DependencyGraph.from_h(JSON.parse(graph_json.read))
331
+ Woods::Storage::GraphStore::Memory.new(graph)
332
+ rescue Woods::Storage::InapplicableBackend
333
+ raise
334
+ rescue StandardError => e
335
+ warn "[woods-mcp] graph hydration failed (#{e.class}: #{e.message}); starting with empty store"
336
+ nil
337
+ end
338
+ private_class_method :hydrated_graph_store
339
+
340
+ # Suffix the Indexer appends when a single unit is split into
341
+ # multiple embedding vectors — see
342
+ # {Embedding::Indexer#collect_embed_items}. Vector rows are keyed
343
+ # per-chunk (+Foo#chunk_0+) but metadata is keyed by the base
344
+ # identifier (+Foo+), so hydration strips the suffix before the
345
+ # lookup. Mirrors the pattern in {Retriever} and
346
+ # {Retrieval::ContextAssembler}.
347
+ CHUNK_SUFFIX_PATTERN = /#chunk_\d+\z/
348
+ private_constant :CHUNK_SUFFIX_PATTERN
349
+
350
+ # Back-fill the vector store's per-entry metadata hashes from the
351
+ # metadata store. Only makes sense when both are in-memory — durable
352
+ # backends return nil from the hydration helpers and never reach
353
+ # this path.
354
+ def self.populate_vector_metadata(vector_store, metadata_store)
355
+ return unless vector_store.respond_to?(:each_entry) && vector_store.respond_to?(:store)
356
+ return unless metadata_store.respond_to?(:find)
357
+
358
+ # Collect (id, vector) pairs in one pass; overwriting via #store
359
+ # re-triggers the metadata update path without changing the
360
+ # underlying flat buffer (store semantics: same id → overwrite
361
+ # vector + metadata in place).
362
+ entries = vector_store.each_entry.map { |id, vec, _meta| [id, vec] }
363
+ entries.each do |id, vec|
364
+ meta = metadata_store.find(id.to_s.sub(CHUNK_SUFFIX_PATTERN, ''))
365
+ next if meta.nil? || (meta.respond_to?(:empty?) && meta.empty?)
366
+
367
+ vector_store.store(id, vec, meta)
87
368
  end
369
+ end
370
+ private_class_method :populate_vector_metadata
88
371
 
89
- Woods::Builder.new(config).build_retriever if config.embedding_provider
372
+ # Return a hydrated InMemory vector store when Shape 2 applies
373
+ # (in-memory configured + artifact on disk + resolved config) —
374
+ # otherwise nil, which tells Builder to construct a fresh one.
375
+ # Durable backends (pgvector, Qdrant) never match this path.
376
+ def self.hydrated_vector_store(config, resolved, artifact)
377
+ return nil unless artifact && resolved
378
+ return nil unless config.vector_store == :in_memory
379
+
380
+ Woods::Storage::Snapshotter::Vector.load_or_empty(artifact, resolved_config: resolved)
381
+ rescue Woods::MCP::BootstrapError, ArgumentError
382
+ # Config-invalid failures — ArgumentError signals a misconfigured
383
+ # output_dir (dump_dir outside dumps_root) or a programming bug,
384
+ # not a transient I/O issue. Operators must see these.
385
+ raise
386
+ rescue StandardError => e
387
+ warn "[woods-mcp] vector hydration failed (#{e.class}: #{e.message}); starting with empty store"
388
+ nil
389
+ end
390
+ private_class_method :hydrated_vector_store
391
+
392
+ def self.hydrated_metadata_store(config, resolved, artifact)
393
+ return nil unless artifact && resolved
394
+ return nil unless config.metadata_store == :in_memory
395
+
396
+ Woods::Storage::Snapshotter::Metadata.load_or_empty(artifact, resolved_config: resolved)
397
+ rescue Woods::MCP::BootstrapError, ArgumentError
398
+ raise
90
399
  rescue StandardError => e
91
- warn "Note: Semantic search unavailable (#{e.message}). Using pattern-based search only."
400
+ warn "[woods-mcp] metadata hydration failed (#{e.class}: #{e.message}); starting with empty store"
92
401
  nil
93
402
  end
403
+ private_class_method :hydrated_metadata_store
404
+
405
+ def self.probe_and_mark_state(config, state)
406
+ provider = Woods::Builder.new(config).build_embedding_provider
407
+ ProviderProbe.reachable!(provider)
408
+ state.mark(:hydrated)
409
+ rescue ProviderUnreachable => e
410
+ state.mark(:degraded, reason: e)
411
+ warn "[woods-mcp] provider unreachable at boot: #{e.url} (#{e.reason}); " \
412
+ 'starting degraded — will retry on first query'
413
+ end
414
+ private_class_method :probe_and_mark_state
94
415
  end
95
416
  end
96
417
  end