woods 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +169 -0
- data/README.md +20 -8
- data/exe/woods-console +51 -6
- data/exe/woods-console-mcp +24 -4
- data/exe/woods-mcp +30 -7
- data/exe/woods-mcp-http +47 -6
- data/lib/generators/woods/install_generator.rb +13 -4
- data/lib/generators/woods/templates/woods.rb.tt +155 -0
- data/lib/tasks/woods.rake +15 -50
- data/lib/woods/builder.rb +174 -9
- data/lib/woods/cache/cache_middleware.rb +360 -31
- data/lib/woods/chunking/semantic_chunker.rb +334 -7
- data/lib/woods/console/adapters/job_adapter.rb +10 -4
- data/lib/woods/console/audit_logger.rb +76 -4
- data/lib/woods/console/bridge.rb +48 -15
- data/lib/woods/console/bridge_protocol.rb +44 -0
- data/lib/woods/console/confirmation.rb +3 -4
- data/lib/woods/console/console_response_renderer.rb +56 -18
- data/lib/woods/console/credential_index.rb +201 -0
- data/lib/woods/console/credential_scanner.rb +302 -0
- data/lib/woods/console/dispatch_pipeline.rb +138 -0
- data/lib/woods/console/embedded_executor.rb +682 -35
- data/lib/woods/console/eval_guard.rb +319 -0
- data/lib/woods/console/model_validator.rb +1 -3
- data/lib/woods/console/rack_middleware.rb +185 -29
- data/lib/woods/console/redactor.rb +161 -0
- data/lib/woods/console/response_context.rb +127 -0
- data/lib/woods/console/safe_context.rb +220 -23
- data/lib/woods/console/scope_predicate_parser.rb +131 -0
- data/lib/woods/console/server.rb +417 -486
- data/lib/woods/console/sql_noise_stripper.rb +87 -0
- data/lib/woods/console/sql_table_scanner.rb +213 -0
- data/lib/woods/console/sql_validator.rb +81 -31
- data/lib/woods/console/table_gate.rb +93 -0
- data/lib/woods/console/tool_specs.rb +552 -0
- data/lib/woods/console/tools/tier1.rb +3 -3
- data/lib/woods/console/tools/tier4.rb +7 -1
- data/lib/woods/dependency_graph.rb +66 -7
- data/lib/woods/embedding/indexer.rb +190 -6
- data/lib/woods/embedding/openai.rb +40 -4
- data/lib/woods/embedding/provider.rb +104 -8
- data/lib/woods/embedding/text_preparer.rb +23 -3
- data/lib/woods/embedding/token_counter.rb +133 -0
- data/lib/woods/evaluation/baseline_runner.rb +20 -2
- data/lib/woods/evaluation/metrics.rb +4 -1
- data/lib/woods/extracted_unit.rb +1 -0
- data/lib/woods/extractor.rb +7 -1
- data/lib/woods/extractors/controller_extractor.rb +6 -0
- data/lib/woods/extractors/mailer_extractor.rb +16 -2
- data/lib/woods/extractors/model_extractor.rb +6 -1
- data/lib/woods/extractors/phlex_extractor.rb +13 -4
- data/lib/woods/extractors/rails_source_extractor.rb +2 -0
- data/lib/woods/extractors/route_helper_resolver.rb +130 -0
- data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
- data/lib/woods/extractors/view_component_extractor.rb +12 -1
- data/lib/woods/extractors/view_engines/base.rb +141 -0
- data/lib/woods/extractors/view_engines/erb.rb +145 -0
- data/lib/woods/extractors/view_template_extractor.rb +92 -133
- data/lib/woods/flow_assembler.rb +23 -15
- data/lib/woods/flow_precomputer.rb +21 -2
- data/lib/woods/graph_analyzer.rb +3 -4
- data/lib/woods/index_artifact.rb +173 -0
- data/lib/woods/mcp/bearer_auth.rb +45 -0
- data/lib/woods/mcp/bootstrap_state.rb +94 -0
- data/lib/woods/mcp/bootstrapper.rb +337 -16
- data/lib/woods/mcp/config_resolver.rb +288 -0
- data/lib/woods/mcp/errors.rb +134 -0
- data/lib/woods/mcp/index_reader.rb +265 -30
- data/lib/woods/mcp/origin_guard.rb +132 -0
- data/lib/woods/mcp/provider_probe.rb +166 -0
- data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
- data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
- data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
- data/lib/woods/mcp/server.rb +737 -137
- data/lib/woods/model_name_cache.rb +78 -2
- data/lib/woods/notion/client.rb +25 -2
- data/lib/woods/notion/mappers/model_mapper.rb +36 -2
- data/lib/woods/railtie.rb +55 -15
- data/lib/woods/resilience/circuit_breaker.rb +9 -2
- data/lib/woods/resilience/retryable_provider.rb +40 -3
- data/lib/woods/resolved_config.rb +299 -0
- data/lib/woods/retrieval/context_assembler.rb +112 -5
- data/lib/woods/retrieval/query_classifier.rb +1 -1
- data/lib/woods/retrieval/ranker.rb +55 -6
- data/lib/woods/retrieval/search_executor.rb +42 -13
- data/lib/woods/retriever.rb +330 -24
- data/lib/woods/session_tracer/middleware.rb +35 -1
- data/lib/woods/storage/graph_store.rb +39 -0
- data/lib/woods/storage/inapplicable_backend.rb +14 -0
- data/lib/woods/storage/metadata_store.rb +129 -1
- data/lib/woods/storage/pgvector.rb +70 -8
- data/lib/woods/storage/qdrant.rb +196 -5
- data/lib/woods/storage/snapshotter/metadata.rb +172 -0
- data/lib/woods/storage/snapshotter/vector.rb +238 -0
- data/lib/woods/storage/snapshotter.rb +24 -0
- data/lib/woods/storage/vector_store.rb +184 -35
- data/lib/woods/tasks.rb +85 -0
- data/lib/woods/temporal/snapshot_store.rb +49 -1
- data/lib/woods/token_utils.rb +44 -5
- data/lib/woods/unblocked/client.rb +1 -1
- data/lib/woods/unblocked/document_builder.rb +35 -10
- data/lib/woods/unblocked/exporter.rb +1 -1
- data/lib/woods/util/host_guard.rb +61 -0
- data/lib/woods/version.rb +1 -1
- data/lib/woods.rb +126 -6
- metadata +69 -4
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pathname'
|
|
4
|
+
require 'tempfile'
|
|
5
|
+
require 'time'
|
|
6
|
+
require 'msgpack'
|
|
7
|
+
require 'woods/version'
|
|
8
|
+
require 'woods/storage/metadata_store'
|
|
9
|
+
require 'woods/mcp/errors'
|
|
10
|
+
|
|
11
|
+
module Woods
  module Storage
    module Snapshotter
      # Reads and writes the metadata store snapshot (+metadata.msgpack+).
      #
      # MessagePack is chosen over +pack("e*")+ because metadata is heterogeneous
      # hash-shaped data — type tags matter here. The vector format uses packed
      # float32 for dense numeric data; metadata uses MessagePack for everything else.
      #
      # == On-disk format
      #
      # A stream of MessagePack-packed objects in a single file:
      #
      # 1. Header hash (one MessagePack object):
      #    { "magic" => "WMD1", "schema_version" => 1, "record_count" => N,
      #      "gem_version" => "1.2.0", "created_at" => "2026-04-23T03:42:17Z" }
      #
      # 2. One hash per record, streamed directly after the header:
      #    { "id" => "PostsController", "metadata" => { ... } }
      #
      # Stream-written via +MessagePack::Packer+ to avoid loading all records into
      # memory at once. Stream-read via +MessagePack::Unpacker+ on load. Written
      # atomically via +Tempfile+ + +File.rename+.
      #
      # @see Snapshotter::Vector companion class for vector stores
      module Metadata
        # Magic string identifying a valid Woods Metadata Dump file.
        MAGIC = 'WMD1'

        # Current schema version written by this implementation.
        SCHEMA_VERSION = 1

        # Maximum schema version this code can read. A dump with a higher version
        # raises {Woods::MCP::UnsupportedArtifact} rather than silently misreading data.
        MAX_SUPPORTED_SCHEMA_VERSION = 1

        # Filename written inside the dump directory.
        FILENAME = 'metadata.msgpack'

        # Load a metadata store from the latest dump in +artifact+, or return an
        # empty store if no dump exists yet.
        #
        # Never raises for a missing dump — callers that need an empty store on
        # first run get one without special-casing.
        #
        # @param artifact [Woods::IndexArtifact] the artifact layout object
        # @param resolved_config [Object, nil] reserved for future validation
        # @return [Woods::Storage::MetadataStore::InMemory]
        # @raise [Woods::MCP::UnsupportedArtifact] if magic is wrong, the header is
        #   malformed (non-integer +schema_version+ / +record_count+), or
        #   schema_version exceeds {MAX_SUPPORTED_SCHEMA_VERSION}
        def self.load_or_empty(artifact, resolved_config: nil) # rubocop:disable Lint/UnusedMethodArgument
          dump_path = dump_file_path(artifact)
          return MetadataStore::InMemory.new unless dump_path&.exist?

          store = MetadataStore::InMemory.new
          File.open(dump_path.to_s, 'rb') do |io|
            unpacker = MessagePack::Unpacker.new(io)
            header = unpacker.read
            validate_header!(header, dump_path)
            header['record_count'].times do
              record = unpacker.read
              store.store(record['id'], record['metadata'])
            end
          end
          store
        end

        # Write the metadata store to +dump_dir/metadata.msgpack+ atomically.
        #
        # Streams header then one packed hash per record — no full in-memory copy
        # of the record set. Uses +Tempfile+ + +File.rename+ for atomicity.
        #
        # @param store [#each_entry, #bulk_load] an in-memory MetadataStore adapter
        # @param artifact [Woods::IndexArtifact] the artifact layout object
        # @param dump_dir [Pathname, String] target directory; must be under +artifact.dumps_root+
        # @param resolved_config [Object, nil] reserved for future use
        # @return [void]
        # @raise [Woods::Storage::InapplicableBackend] if +store+ lacks +#each_entry+ or +#bulk_load+
        # @raise [ArgumentError] if +dump_dir+ is not under +artifact.dumps_root+
        def self.dump(store, artifact, dump_dir, resolved_config: nil) # rubocop:disable Lint/UnusedMethodArgument
          validate_store!(store)
          validate_dump_dir!(artifact, dump_dir)
          target = Pathname.new(dump_dir.to_s).join(FILENAME)
          target.dirname.mkpath
          write_atomic(target, store)
          nil
        end

        class << self
          private

          # Path of the metadata dump inside the latest dump directory, or nil
          # when the artifact has no dumps yet.
          def dump_file_path(artifact)
            latest = artifact.latest_dump_path
            return nil unless latest

            latest.join(FILENAME)
          end

          # Validate the dump header read by {.load_or_empty}.
          #
          # @raise [Woods::MCP::UnsupportedArtifact] on bad magic, malformed
          #   header fields, or an unsupported schema version
          def validate_header!(header, path)
            unless header.is_a?(Hash) && header['magic'] == MAGIC
              raise Woods::MCP::UnsupportedArtifact,
                    "metadata.msgpack at #{path} has invalid magic " \
                    "(got #{header.is_a?(Hash) ? header['magic'].inspect : 'non-hash'}; " \
                    "expected #{MAGIC.inspect}). The file may be corrupt or from an incompatible tool."
            end

            version = header['schema_version']
            count = header['record_count']
            # A header missing these keys (or carrying non-integer values) would
            # otherwise surface as a NoMethodError on nil here or in the caller's
            # +record_count.times+ loop; report it as a corrupt artifact instead.
            unless version.is_a?(Integer) && count.is_a?(Integer) && count >= 0
              raise Woods::MCP::UnsupportedArtifact,
                    "metadata.msgpack at #{path} has a malformed header " \
                    "(schema_version=#{version.inspect}, record_count=#{count.inspect}); " \
                    'the file may be corrupt.'
            end
            return if version <= MAX_SUPPORTED_SCHEMA_VERSION

            raise Woods::MCP::UnsupportedArtifact,
                  "metadata.msgpack at #{path} has schema_version #{version}; " \
                  "this gem supports up to #{MAX_SUPPORTED_SCHEMA_VERSION}. " \
                  'Upgrade the woods gem or re-run woods:embed to regenerate.'
          end

          # Write +store+ contents to +target+ atomically via a sibling Tempfile + rename.
          # Streams header then one record hash per entry.
          #
          # @param target [Pathname] final destination
          # @param store [#count, #each_entry] populated metadata store
          def write_atomic(target, store)
            tmp = Tempfile.new([FILENAME, '.tmp'], target.dirname.to_s)
            begin
              tmp.binmode
              packer = MessagePack::Packer.new(tmp)
              packer.write('magic' => MAGIC, 'schema_version' => SCHEMA_VERSION,
                           'record_count' => store.count, 'gem_version' => Woods::VERSION,
                           'created_at' => Time.now.utc.iso8601)
              store.each_entry { |id, metadata| packer.write('id' => id, 'metadata' => metadata) }
              packer.flush
              tmp.flush
              tmp.fsync
              tmp.close
              File.rename(tmp.path, target.to_s)
            rescue StandardError
              tmp.close
              tmp.unlink
              raise
            end
          end

          # Only in-memory adapters (which expose #each_entry/#bulk_load) are
          # snapshot-able; durable backends must never reach this code path.
          def validate_store!(store)
            return if store.respond_to?(:each_entry) && store.respond_to?(:bulk_load)

            raise InapplicableBackend,
                  "backend #{store.class} is already durable — Snapshotter should not have been invoked"
          end

          # Containment check: refuse to write outside the artifact's dumps root.
          def validate_dump_dir!(artifact, dump_dir)
            dump_path = Pathname.new(dump_dir.to_s).expand_path
            root = artifact.dumps_root.expand_path

            return if dump_path.to_s.start_with?("#{root}/") || dump_path == root

            raise ArgumentError,
                  "dump_dir #{dump_path} is not under artifact.dumps_root #{root}"
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pathname'
|
|
4
|
+
require 'tempfile'
|
|
5
|
+
require 'woods/storage/vector_store'
|
|
6
|
+
require 'woods/mcp/errors'
|
|
7
|
+
require 'woods/version'
|
|
8
|
+
|
|
9
|
+
module Woods
  module Storage
    module Snapshotter
      # Reads and writes the +vectors.bin+ / +vectors.idx+ on-disk format.
      #
      # Binary layout of +vectors.bin+ (all integers little-endian):
      #
      #   offset  length   field
      #   0       4 bytes  magic "WVF1"
      #   4       4 bytes  schema_version (u32 LE)
      #   8       4 bytes  dimension (u32 LE)
      #   12      8 bytes  vector_count (u64 LE)
      #   20      4 bytes  gem_version_length (u32 LE)
      #   24      N bytes  gem_version (UTF-8)
      #   24+N    4 bytes  model_name_length (u32 LE)
      #   28+N    M bytes  model_name (UTF-8)
      #   ...     —        packed float32 data (vector_count × dimension × 4 bytes)
      #
      # +vectors.idx+ (one record per vector):
      #   4 bytes id_length (u32 LE) + N bytes id (UTF-8) + 8 bytes offset (u64 LE)
      #
      # Atomic writes use +Tempfile+ + +File.rename+ for crash safety.
      #
      # @see Snapshotter::Metadata companion for metadata stores
      module Vector # rubocop:disable Metrics/ModuleLength
        MAGIC = 'WVF1'
        SCHEMA_VERSION_SUPPORTED = 1

        # Returns a populated in-memory vector store loaded from the latest dump,
        # or an empty store when no dump exists yet.
        #
        # @param artifact [Woods::IndexArtifact] artifact layout object
        # @param resolved_config [#dimension, nil] used for dimension validation
        # @return [Woods::Storage::VectorStore::InMemory]
        # @raise [Woods::MCP::UnsupportedArtifact] if magic or schema_version is
        #   invalid, or either file is truncated
        # @raise [Woods::MCP::DimensionMismatch] if stored dimension ≠ +resolved_config.dimension+
        def self.load_or_empty(artifact, resolved_config: nil)
          dump_dir = artifact.latest_dump_path
          return VectorStore::InMemory.new if dump_dir.nil?

          bin_path = dump_dir.join('vectors.bin')
          idx_path = dump_dir.join('vectors.idx')
          return VectorStore::InMemory.new unless bin_path.exist? && idx_path.exist?

          load_from(bin_path, idx_path, resolved_config)
        end

        # Writes +vectors.bin+ and +vectors.idx+ into +dump_dir+ atomically.
        #
        # @param store [#each_entry, #bulk_load] in-memory vector store adapter
        # @param artifact [Woods::IndexArtifact] artifact layout object
        # @param dump_dir [Pathname, String] target directory; must be under +artifact.dumps_root+
        # @param resolved_config [#model_name, nil] model name written to header
        # @return [void]
        # @raise [Woods::Storage::InapplicableBackend] if +store+ lacks +#each_entry+ / +#bulk_load+
        # @raise [ArgumentError] if +dump_dir+ is not under +artifact.dumps_root+
        def self.dump(store, artifact, dump_dir, resolved_config: nil)
          validate_store!(store)
          validate_dump_dir!(artifact, Pathname.new(dump_dir.to_s))
          model_name = resolved_config.respond_to?(:model_name) ? resolved_config.model_name.to_s : ''
          entries = store.each_entry.to_a
          write_bin_and_idx(Pathname.new(dump_dir.to_s), entries, Woods::VERSION, model_name)
          nil
        end

        class << self # rubocop:disable Metrics/ClassLength
          private

          # Read, validate, and hydrate a store from the two dump files.
          def load_from(bin_path, idx_path, resolved_config)
            bin_data = File.binread(bin_path.to_s)
            header, data_offset = parse_header(bin_data, bin_path)
            validate_magic!(header[:magic], bin_path)
            validate_schema_version!(header[:schema_version], bin_path)
            dim = resolved_config.respond_to?(:dimension) ? resolved_config.dimension : nil
            validate_dimension!(header[:dimension], dim, bin_path) if dim
            float_count = header[:vector_count] * header[:dimension]
            needed = data_offset + (float_count * 4)
            # A truncated float payload would otherwise unpack with trailing nils
            # and hydrate corrupt vectors silently; fail loudly instead.
            raise_truncated(bin_path, bin_data.bytesize, needed) if bin_data.bytesize < needed
            floats = bin_data.byteslice(data_offset, float_count * 4).unpack("e#{float_count}")
            hydrate_store(parse_idx(idx_path), floats, header[:dimension])
          end

          # Parse the fixed + variable-length header of +vectors.bin+.
          #
          # @return [Array(Hash, Integer)] header fields and the byte offset where
          #   the packed float data begins
          def parse_header(bin_data, bin_path) # rubocop:disable Metrics/AbcSize
            # Minimum header is 28 bytes (magic + schema_version + dimension
            # + vector_count + gem_version_length + model_name_length) plus
            # the variable-length gem_version and model_name strings. A
            # truncated header past the u32 guard below would produce a
            # confusing NoMethodError on nil.unpack; raise a typed error
            # with the file path instead.
            raise_truncated(bin_path, bin_data.bytesize, 28) if bin_data.bytesize < 28

            magic = bin_data.byteslice(0, 4)
            schema_version, dimension = bin_data.byteslice(4, 8).unpack('L<L<')
            vector_count = bin_data.byteslice(12, 8).unpack1('Q<')
            gv_len = bin_data.byteslice(20, 4).unpack1('L<')
            raise_truncated(bin_path, bin_data.bytesize, 24 + gv_len + 4) if bin_data.bytesize < 24 + gv_len + 4

            off = 24 + gv_len
            mn_len = bin_data.byteslice(off, 4).unpack1('L<')
            raise_truncated(bin_path, bin_data.bytesize, off + 4 + mn_len) if bin_data.bytesize < off + 4 + mn_len

            off += 4 + mn_len
            [{ magic: magic, schema_version: schema_version,
               dimension: dimension, vector_count: vector_count }, off]
          end

          # Raise a typed, actionable truncation error (dump interrupted mid-write).
          def raise_truncated(path, actual, expected)
            raise Woods::MCP::UnsupportedArtifact.new(
              "#{path}: file truncated (got #{actual} bytes, need at least #{expected}) — " \
              'dump may have been interrupted mid-write; re-run woods:embed',
              details: { path: path.to_s, actual_bytes: actual, needed_bytes: expected }
            )
          end

          # Parse +vectors.idx+ into the ordered list of vector ids.
          # The per-record u64 offset is skipped — load order alone pairs ids
          # with the float data.
          def parse_idx(idx_path)
            idx_data = File.binread(idx_path.to_s)
            ids = []
            pos = 0
            while pos < idx_data.bytesize
              # Guard every record: a partially written idx file would otherwise
              # produce a nil length (NoMethodError) or a TypeError from byteslice.
              raise_truncated(idx_path, idx_data.bytesize, pos + 4) if idx_data.bytesize < pos + 4
              id_len = idx_data.byteslice(pos, 4).unpack1('L<')
              pos += 4
              raise_truncated(idx_path, idx_data.bytesize, pos + id_len + 8) if idx_data.bytesize < pos + id_len + 8
              ids << idx_data.byteslice(pos, id_len)
              pos += id_len + 8 # skip the u64 offset (not needed for load)
            end
            ids
          end

          # Build an InMemory store from parallel ids + flat float array.
          def hydrate_store(ids, floats, dim)
            store = VectorStore::InMemory.new
            entries = ids.each_with_index.map do |id, idx|
              { id: id, vector: floats[(idx * dim), dim], metadata: {} }
            end
            store.bulk_load(entries)
            store
          end

          def validate_magic!(found, path)
            return if found == MAGIC

            raise Woods::MCP::UnsupportedArtifact.new(
              "#{path}: invalid magic bytes (expected #{MAGIC.inspect}, found #{found.inspect})",
              details: { path: path.to_s, expected: MAGIC, found: found }
            )
          end

          def validate_schema_version!(version, path)
            return if version <= SCHEMA_VERSION_SUPPORTED

            raise Woods::MCP::UnsupportedArtifact.new(
              "#{path}: schema_version #{version} > supported max #{SCHEMA_VERSION_SUPPORTED}; " \
              'upgrade the woods gem to read this artifact',
              details: { path: path.to_s, artifact_version: version, max_supported: SCHEMA_VERSION_SUPPORTED }
            )
          end

          def validate_dimension!(stored, expected, path)
            return if stored == expected

            raise Woods::MCP::DimensionMismatch.new(
              "#{path}: stored dimension #{stored} ≠ provider dimension #{expected}",
              details: { path: path.to_s, stored_dimension: stored, provider_dimension: expected }
            )
          end

          # Serialize entries and write both dump files atomically.
          def write_bin_and_idx(dump_dir, entries, gem_version, model_name)
            header = build_header(entries, gem_version, model_name)
            float_blob = entries.flat_map { |(_id, vector, _meta)| vector }.pack('e*')
            idx_data = build_idx(entries, header.bytesize)
            atomic_write(dump_dir.join('vectors.bin'), header + float_blob, binary: true)
            atomic_write(dump_dir.join('vectors.idx'), idx_data, binary: true)
          end

          # Build the vectors.bin header (see module docs for the layout).
          def build_header(entries, gem_version, model_name)
            dim = entries.empty? ? 0 : entries.first[1].size
            gv = gem_version.encode('UTF-8').b
            mn = model_name.encode('UTF-8').b
            buf = String.new(encoding: 'BINARY')
            buf << MAGIC << [SCHEMA_VERSION_SUPPORTED, dim].pack('L<L<')
            buf << [entries.size].pack('Q<')
            buf << [gv.bytesize].pack('L<') << gv
            buf << [mn.bytesize].pack('L<') << mn
            buf
          end

          # Build vectors.idx: per entry, length-prefixed id + absolute byte
          # offset of that entry's floats within vectors.bin.
          def build_idx(entries, header_size)
            buf = String.new(encoding: 'BINARY')
            float_offset = header_size
            entries.each do |id, vector, _meta|
              id_bytes = id.encode('UTF-8').b
              buf << [id_bytes.bytesize].pack('L<') << id_bytes
              buf << [float_offset].pack('Q<')
              float_offset += vector.size * 4
            end
            buf
          end

          # Write +content+ to +path+ atomically via sibling Tempfile + rename.
          def atomic_write(path, content, binary: false)
            # Pathname#mkpath rather than FileUtils.mkdir_p: this file never
            # requires 'fileutils', so FileUtils may not be loaded here, whereas
            # Pathname#mkpath loads it on demand (and mkpath is idempotent).
            path.dirname.mkpath
            tmp = Tempfile.new('.woods-vec-', path.dirname.to_s)
            tmp.binmode if binary
            tmp.write(content)
            tmp.flush
            tmp.fsync
            tmp.close
            File.rename(tmp.path, path.to_s)
          rescue StandardError
            tmp&.close
            tmp&.unlink
            raise
          end

          # Only in-memory adapters (which expose #each_entry/#bulk_load) are
          # snapshot-able; durable backends must never reach this code path.
          def validate_store!(store)
            return if store.respond_to?(:each_entry) && store.respond_to?(:bulk_load)

            raise InapplicableBackend,
                  "backend #{store.class} is already durable — Snapshotter should not have been invoked"
          end

          # Containment check: refuse to write outside the artifact's dumps root.
          def validate_dump_dir!(artifact, dump_path)
            expanded = dump_path.expand_path
            root = artifact.dumps_root.expand_path
            return if expanded.to_s.start_with?("#{root}/") || expanded == root

            raise ArgumentError,
                  "dump_dir #{expanded} is not under artifact.dumps_root #{root}"
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'woods/storage/snapshotter/vector'
|
|
4
|
+
require 'woods/storage/snapshotter/metadata'
|
|
5
|
+
|
|
6
|
+
module Woods
  module Storage
    # Namespace for the Snapshotter pair that persists and hydrates in-memory
    # storage adapters to/from disk.
    #
    # Two adapters live here:
    # - {Snapshotter::Vector} — handles {VectorStore::InMemory} round-trips via +pack("e*")+.
    # - {Snapshotter::Metadata} — handles {MetadataStore::InMemory} round-trips via MessagePack.
    #
    # Persistent backends (pgvector, Qdrant, SQLite) never touch the Snapshotter.
    # Passing one to {Snapshotter::Vector.dump} or {Snapshotter::Metadata.dump} raises
    # {InapplicableBackend} immediately.
    #
    # Each adapter exposes +load_or_empty+ (hydrate a store from the latest dump,
    # or return an empty store when no dump exists yet) and +dump+ (validate the
    # store and target directory, then write the dump files atomically via
    # Tempfile + rename).
    module Snapshotter
    end
  end
end
|