woods 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +169 -0
- data/README.md +20 -8
- data/exe/woods-console +51 -6
- data/exe/woods-console-mcp +24 -4
- data/exe/woods-mcp +30 -7
- data/exe/woods-mcp-http +47 -6
- data/lib/generators/woods/install_generator.rb +13 -4
- data/lib/generators/woods/templates/woods.rb.tt +155 -0
- data/lib/tasks/woods.rake +15 -50
- data/lib/woods/builder.rb +174 -9
- data/lib/woods/cache/cache_middleware.rb +360 -31
- data/lib/woods/chunking/semantic_chunker.rb +334 -7
- data/lib/woods/console/adapters/job_adapter.rb +10 -4
- data/lib/woods/console/audit_logger.rb +76 -4
- data/lib/woods/console/bridge.rb +48 -15
- data/lib/woods/console/bridge_protocol.rb +44 -0
- data/lib/woods/console/confirmation.rb +3 -4
- data/lib/woods/console/console_response_renderer.rb +56 -18
- data/lib/woods/console/credential_index.rb +201 -0
- data/lib/woods/console/credential_scanner.rb +302 -0
- data/lib/woods/console/dispatch_pipeline.rb +138 -0
- data/lib/woods/console/embedded_executor.rb +682 -35
- data/lib/woods/console/eval_guard.rb +319 -0
- data/lib/woods/console/model_validator.rb +1 -3
- data/lib/woods/console/rack_middleware.rb +185 -29
- data/lib/woods/console/redactor.rb +161 -0
- data/lib/woods/console/response_context.rb +127 -0
- data/lib/woods/console/safe_context.rb +220 -23
- data/lib/woods/console/scope_predicate_parser.rb +131 -0
- data/lib/woods/console/server.rb +417 -486
- data/lib/woods/console/sql_noise_stripper.rb +87 -0
- data/lib/woods/console/sql_table_scanner.rb +213 -0
- data/lib/woods/console/sql_validator.rb +81 -31
- data/lib/woods/console/table_gate.rb +93 -0
- data/lib/woods/console/tool_specs.rb +552 -0
- data/lib/woods/console/tools/tier1.rb +3 -3
- data/lib/woods/console/tools/tier4.rb +7 -1
- data/lib/woods/dependency_graph.rb +66 -7
- data/lib/woods/embedding/indexer.rb +190 -6
- data/lib/woods/embedding/openai.rb +40 -4
- data/lib/woods/embedding/provider.rb +104 -8
- data/lib/woods/embedding/text_preparer.rb +23 -3
- data/lib/woods/embedding/token_counter.rb +133 -0
- data/lib/woods/evaluation/baseline_runner.rb +20 -2
- data/lib/woods/evaluation/metrics.rb +4 -1
- data/lib/woods/extracted_unit.rb +1 -0
- data/lib/woods/extractor.rb +7 -1
- data/lib/woods/extractors/controller_extractor.rb +6 -0
- data/lib/woods/extractors/mailer_extractor.rb +16 -2
- data/lib/woods/extractors/model_extractor.rb +6 -1
- data/lib/woods/extractors/phlex_extractor.rb +13 -4
- data/lib/woods/extractors/rails_source_extractor.rb +2 -0
- data/lib/woods/extractors/route_helper_resolver.rb +130 -0
- data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
- data/lib/woods/extractors/view_component_extractor.rb +12 -1
- data/lib/woods/extractors/view_engines/base.rb +141 -0
- data/lib/woods/extractors/view_engines/erb.rb +145 -0
- data/lib/woods/extractors/view_template_extractor.rb +92 -133
- data/lib/woods/flow_assembler.rb +23 -15
- data/lib/woods/flow_precomputer.rb +21 -2
- data/lib/woods/graph_analyzer.rb +3 -4
- data/lib/woods/index_artifact.rb +173 -0
- data/lib/woods/mcp/bearer_auth.rb +45 -0
- data/lib/woods/mcp/bootstrap_state.rb +94 -0
- data/lib/woods/mcp/bootstrapper.rb +337 -16
- data/lib/woods/mcp/config_resolver.rb +288 -0
- data/lib/woods/mcp/errors.rb +134 -0
- data/lib/woods/mcp/index_reader.rb +265 -30
- data/lib/woods/mcp/origin_guard.rb +132 -0
- data/lib/woods/mcp/provider_probe.rb +166 -0
- data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
- data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
- data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
- data/lib/woods/mcp/server.rb +737 -137
- data/lib/woods/model_name_cache.rb +78 -2
- data/lib/woods/notion/client.rb +25 -2
- data/lib/woods/notion/mappers/model_mapper.rb +36 -2
- data/lib/woods/railtie.rb +55 -15
- data/lib/woods/resilience/circuit_breaker.rb +9 -2
- data/lib/woods/resilience/retryable_provider.rb +40 -3
- data/lib/woods/resolved_config.rb +299 -0
- data/lib/woods/retrieval/context_assembler.rb +112 -5
- data/lib/woods/retrieval/query_classifier.rb +1 -1
- data/lib/woods/retrieval/ranker.rb +55 -6
- data/lib/woods/retrieval/search_executor.rb +42 -13
- data/lib/woods/retriever.rb +330 -24
- data/lib/woods/session_tracer/middleware.rb +35 -1
- data/lib/woods/storage/graph_store.rb +39 -0
- data/lib/woods/storage/inapplicable_backend.rb +14 -0
- data/lib/woods/storage/metadata_store.rb +129 -1
- data/lib/woods/storage/pgvector.rb +70 -8
- data/lib/woods/storage/qdrant.rb +196 -5
- data/lib/woods/storage/snapshotter/metadata.rb +172 -0
- data/lib/woods/storage/snapshotter/vector.rb +238 -0
- data/lib/woods/storage/snapshotter.rb +24 -0
- data/lib/woods/storage/vector_store.rb +184 -35
- data/lib/woods/tasks.rb +85 -0
- data/lib/woods/temporal/snapshot_store.rb +49 -1
- data/lib/woods/token_utils.rb +44 -5
- data/lib/woods/unblocked/client.rb +1 -1
- data/lib/woods/unblocked/document_builder.rb +35 -10
- data/lib/woods/unblocked/exporter.rb +1 -1
- data/lib/woods/util/host_guard.rb +61 -0
- data/lib/woods/version.rb +1 -1
- data/lib/woods.rb +126 -6
- metadata +69 -4
|
@@ -1,8 +1,16 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'json'
|
|
4
|
+
require 'time'
|
|
4
5
|
|
|
5
6
|
module Woods
|
|
7
|
+
# Same conditional-define pattern used elsewhere in the gem so this
|
|
8
|
+
# file can be required in isolation (e.g. by specs that bypass the
|
|
9
|
+
# full lib/woods.rb load) without tripping NameError on the friendly
|
|
10
|
+
# missing-sqlite3 raise below.
|
|
11
|
+
class Error < StandardError; end unless defined?(Woods::Error)
|
|
12
|
+
class ConfigurationError < Error; end unless defined?(Woods::ConfigurationError)
|
|
13
|
+
|
|
6
14
|
module Storage
|
|
7
15
|
# MetadataStore provides an interface for storing and querying unit metadata.
|
|
8
16
|
#
|
|
@@ -85,6 +93,119 @@ module Woods
|
|
|
85
93
|
end
|
|
86
94
|
end
|
|
87
95
|
|
|
96
|
+
# Pure-Ruby metadata store backed by a hash. No external dependencies,
|
|
97
|
+
# no persistence — vectors and metadata both live in the building
|
|
98
|
+
# process and die with it. Suitable for hosts that don't bundle the
|
|
99
|
+
# `sqlite3` gem (e.g., MySQL- or Postgres-only Rails apps), and for
|
|
100
|
+
# short-lived processes that rebuild the index per run.
|
|
101
|
+
#
|
|
102
|
+
# @example
|
|
103
|
+
# store = InMemory.new
|
|
104
|
+
# store.store("User", { type: "model", namespace: "Admin" })
|
|
105
|
+
# store.find("User") # => { "type" => "model", "namespace" => "Admin" }
|
|
106
|
+
#
|
|
107
|
+
class InMemory
  include Interface

  # Internal bookkeeping key stamped on every record by {#store}. It travels
  # through {#each_entry} / {#bulk_load} but is never part of the metadata
  # exposed by {#find}, {#find_batch}, {#find_by_type}, or {#search}.
  TIMESTAMP_KEY = 'updated_at'

  def initialize
    @data = {}
  end

  # Store (or overwrite) the metadata for a unit.
  #
  # @param id [String] unit identifier
  # @param metadata [Hash] unit metadata; top-level keys are stringified
  # @return [Hash] the stored record, including the +updated_at+ stamp
  # @see Interface#store
  def store(id, metadata)
    @data[id] = stringify_keys(metadata).merge(TIMESTAMP_KEY => Time.now.iso8601)
  end

  # @param id [String] unit identifier
  # @return [Hash, nil] stored metadata without +updated_at+, or nil when absent
  # @see Interface#find
  def find(id)
    record = @data[id]
    record && public_view(record)
  end

  # @param ids [Array<String>] unit identifiers
  # @return [Hash{String => Hash}] found entries keyed by id; unknown ids are omitted
  # @see Interface#find_batch
  def find_batch(ids)
    ids.each_with_object({}) do |id, result|
      data = find(id)
      result[id] = data if data
    end
  end

  # @param type [String, Symbol] value compared (as a string) against each record's "type"
  # @return [Array<Hash>] matching metadata hashes, each with an added "id" key
  # @see Interface#find_by_type
  def find_by_type(type)
    target = type.to_s
    @data.each_with_object([]) do |(id, record), out|
      next unless record['type'].to_s == target

      out << public_view(record).merge('id' => id)
    end
  end

  # Case-sensitive substring search.
  #
  # The match runs against the same view of the record that the read API
  # returns, so the internal +updated_at+ stamp can no longer make a query
  # such as "updated_at" (or the current year) match every stored record.
  #
  # @param query [#to_s] substring to look for
  # @param fields [Array<String, Symbol>, nil] restrict matching to these
  #   top-level keys; nil matches against the JSON serialization of the metadata
  # @return [Array<Hash>] matching metadata hashes, each with an added "id" key
  # @see Interface#search
  def search(query, fields: nil)
    needle = query.to_s
    @data.each_with_object([]) do |(id, record), out|
      visible = public_view(record)
      haystacks = fields ? fields.map { |f| visible[f.to_s] } : [JSON.generate(visible)]
      next unless haystacks.compact.any? { |h| h.to_s.include?(needle) }

      out << visible.merge('id' => id)
    end
  end

  # @param id [String] unit identifier
  # @return [Hash, nil] the removed raw record, or nil when the id was unknown
  # @see Interface#delete
  def delete(id)
    @data.delete(id)
  end

  # @return [Integer] number of stored entries
  # @see Interface#count
  def count
    @data.size
  end

  # Iterate over every stored entry, yielding +(id, metadata)+ pairs.
  #
  # Persistence seam for {Snapshotter::Metadata}. Yields the raw internal
  # hash (including +updated_at+) so the Snapshotter can reconstruct state
  # faithfully on load.
  #
  # @yield [id, metadata] the key and raw record exactly as stored
  # @return [Enumerator] when no block given
  def each_entry(&block)
    return enum_for(:each_entry) unless block

    @data.each(&block)
  end

  # Hydrate the store from a pre-serialized dump.
  #
  # Dual of {#each_entry}. Entries are inserted as given (no key
  # stringification, no new timestamp) and overwrite existing ids.
  #
  # @param entries [Enumerable<Array(String, Hash)>] Pairs of +[id, metadata]+
  # @return [void]
  def bulk_load(entries)
    entries.each { |id, meta| @data[id] = meta }
  end

  # Drop every stored entry. Used by the MCP +reload+ tool to pick up a
  # fresh embed run without restarting the process. Safe on an empty store.
  def clear!
    @data = {}
  end

  private

  # Copy of a raw record without the internal timestamp.
  def public_view(record)
    record.reject { |key, _| key == TIMESTAMP_KEY }
  end

  # Match the SQLite adapter's string-key contract regardless of how
  # the caller serialises the input hash. Only top-level keys are
  # converted; nested hashes are stored as given.
  def stringify_keys(hash)
    hash.each_with_object({}) { |(k, v), out| out[k.to_s] = v }
  end
end
|
|
208
|
+
|
|
88
209
|
# SQLite-backed metadata store using the JSON1 extension.
|
|
89
210
|
#
|
|
90
211
|
# Stores unit metadata as JSON in a single table with type indexing
|
|
@@ -100,7 +221,14 @@ module Woods
|
|
|
100
221
|
|
|
101
222
|
# @param db_path [String] Path to the SQLite database file, or ":memory:" for in-memory
|
|
102
223
|
def initialize(db_path = ':memory:')
|
|
103
|
-
|
|
224
|
+
begin
|
|
225
|
+
require 'sqlite3'
|
|
226
|
+
rescue LoadError
|
|
227
|
+
raise Woods::ConfigurationError,
|
|
228
|
+
'metadata_store: :sqlite requires the sqlite3 gem in your Gemfile. ' \
|
|
229
|
+
"Add `gem 'sqlite3'` and re-bundle, or set " \
|
|
230
|
+
"`config.metadata_store = :in_memory` if you don't need cross-process persistence."
|
|
231
|
+
end
|
|
104
232
|
@db = ::SQLite3::Database.new(db_path)
|
|
105
233
|
@db.results_as_hash = true
|
|
106
234
|
create_table
|
|
@@ -56,6 +56,7 @@ module Woods
|
|
|
56
56
|
# @see Interface#store
|
|
57
57
|
def store(id, vector, metadata = {})
|
|
58
58
|
validate_vector!(vector)
|
|
59
|
+
validate_dimensions!(vector) if @dimensions
|
|
59
60
|
entry = format_entry(id, vector, metadata)
|
|
60
61
|
|
|
61
62
|
@connection.execute(<<~SQL)
|
|
@@ -71,14 +72,21 @@ module Woods
|
|
|
71
72
|
# Store multiple vectors in a single multi-row INSERT.
|
|
72
73
|
#
|
|
73
74
|
# @param entries [Array<Hash>] Each entry has :id, :vector, :metadata keys
|
|
75
|
+
# @raise [ArgumentError] if any entry's vector has a non-numeric or non-finite element.
# @raise [Woods::Error] if any entry's vector length differs from the configured dimension.
|
|
76
|
+
# Validation runs BEFORE any INSERT so partial-batch writes can't occur.
|
|
74
77
|
def store_batch(entries)
|
|
75
78
|
return if entries.empty?
|
|
76
79
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
+
# Pre-validate every vector before any SQL — prevents partial-batch
|
|
81
|
+
# state when a later entry's dimension doesn't match.
|
|
82
|
+
entries.each_with_index do |entry, idx|
|
|
83
|
+
vector = entry[:vector]
|
|
84
|
+
validate_vector!(vector)
|
|
85
|
+
validate_dimensions!(vector, index: idx) if @dimensions
|
|
80
86
|
end
|
|
81
87
|
|
|
88
|
+
values = entries.map { |entry| format_entry(entry[:id], entry[:vector], entry[:metadata] || {}) }
|
|
89
|
+
|
|
82
90
|
@connection.execute(<<~SQL)
|
|
83
91
|
INSERT INTO #{TABLE} (id, embedding, metadata, created_at)
|
|
84
92
|
VALUES #{values.join(",\n")}
|
|
@@ -98,7 +106,7 @@ module Woods
|
|
|
98
106
|
# @see Interface#search
|
|
99
107
|
def search(query_vector, limit: 10, filters: {})
|
|
100
108
|
validate_vector!(query_vector)
|
|
101
|
-
vector_literal =
|
|
109
|
+
vector_literal = build_vector_literal(query_vector)
|
|
102
110
|
where_clause = build_where(filters)
|
|
103
111
|
|
|
104
112
|
sql = <<~SQL
|
|
@@ -142,10 +150,29 @@ module Woods
|
|
|
142
150
|
def format_entry(id, vector, metadata)
  # Each column is rendered to its SQL fragment first, then the row tuple is
  # assembled in table order: id, embedding, metadata, created_at.
  id_sql = @connection.quote(id)
  metadata_sql = "#{@connection.quote(JSON.generate(metadata))}::jsonb"
  embedding_sql = "'#{build_vector_literal(vector)}'"

  "(#{[id_sql, embedding_sql, metadata_sql, 'CURRENT_TIMESTAMP'].join(', ')})"
end
|
|
148
156
|
|
|
157
|
+
# Build the `[x,y,z]` pgvector literal from a numeric vector.
#
# Every element is coerced through `Float()` and must come out finite.
# For a finite Float, `Float#to_s` only emits digits, `.`, `-`, `+`, and
# `e` (e.g. `1.0e+20`), so the result is safe to embed in the SQL text.
# The finiteness check happens AFTER coercion because a value can be
# finite as a Numeric yet overflow on conversion (`Float(10**400)` is
# `Infinity`), which would otherwise be rendered as the word `Infinity`.
#
# `Float()` raises `RangeError` on `Complex` values with an imaginary
# part, `TypeError` on nil, and `ArgumentError` on unparseable strings;
# all are surfaced as `ArgumentError` so callers see the same error
# shape as the other vector-validation paths.
#
# @param vector [Array<Numeric>] elements to render
# @return [String] literal such as `[1.0,2.5]`
# @raise [ArgumentError] if an element cannot be coerced to a finite Float
def build_vector_literal(vector)
  coerced = vector.each_with_index.map do |element, i|
    float =
      begin
        Float(element)
      rescue RangeError, TypeError, ArgumentError => e
        raise ArgumentError,
              "Vector element at index #{i} cannot be coerced to Float: #{element.inspect} (#{e.class})"
      end

    unless float.finite?
      raise ArgumentError, "Vector element at index #{i} is not finite: #{element.inspect}"
    end

    float.to_s
  end
  "[#{coerced.join(',')}]"
end
|
|
175
|
+
|
|
149
176
|
# Convert a database row to a SearchResult.
|
|
150
177
|
#
|
|
151
178
|
# @param row [Hash] Database row with id, distance, metadata
|
|
@@ -173,22 +200,57 @@ module Woods
|
|
|
173
200
|
raise ArgumentError, "Invalid filter key: #{key_s.inspect}"
|
|
174
201
|
end
|
|
175
202
|
|
|
176
|
-
|
|
203
|
+
# Belt-and-suspenders: regex above already rejects any value that
|
|
204
|
+
# could alter the SQL shape, but we still pass the key through
|
|
205
|
+
# `@connection.quote` so the quoting story is uniform across the
|
|
206
|
+
# key and value positions and a future regex-relaxation does not
|
|
207
|
+
# silently unlock injection.
|
|
208
|
+
if value.is_a?(Array)
|
|
209
|
+
# Membership filter. An empty Array would produce `IN ()`
|
|
210
|
+
# which is a syntax error; emit an always-false predicate
|
|
211
|
+
# so the query still parses and returns no rows.
|
|
212
|
+
next 'FALSE' if value.empty?
|
|
213
|
+
|
|
214
|
+
quoted = value.map { |v| @connection.quote(v.to_s) }.join(', ')
|
|
215
|
+
"metadata->>#{@connection.quote(key_s)} IN (#{quoted})"
|
|
216
|
+
else
|
|
217
|
+
"metadata->>#{@connection.quote(key_s)} = #{@connection.quote(value.to_s)}"
|
|
218
|
+
end
|
|
177
219
|
end
|
|
178
220
|
"WHERE #{conditions.join(' AND ')}"
|
|
179
221
|
end
|
|
180
222
|
|
|
181
|
-
# Validate that all vector elements are numeric.
|
|
223
|
+
# Validate that all vector elements are numeric and finite.
|
|
224
|
+
# Rejecting NaN / Infinity also closes a defense-in-depth gap
|
|
225
|
+
# around the vector-literal SQL construction — `Float::NAN.to_s`
|
|
226
|
+
# yields `"NaN"` which pgvector rejects, but other float-like
|
|
227
|
+
# sentinels can leak through string construction unexpectedly.
|
|
182
228
|
#
|
|
183
229
|
# @param vector [Array] The vector to validate
|
|
184
|
-
# @raise [ArgumentError] if any element is not numeric
|
|
230
|
+
# @raise [ArgumentError] if any element is not numeric or is non-finite
|
|
185
231
|
def validate_vector!(vector)
  vector.each_with_index do |element, i|
    unless element.is_a?(Numeric)
      raise ArgumentError, "Vector element at index #{i} is not numeric: #{element.inspect}"
    end

    # Numeric#finite? is available on every Numeric, so the check is not
    # limited to Float: a Complex with an infinite part is rejected too,
    # instead of slipping through to SQL literal construction.
    next if element.finite?

    raise ArgumentError, "Vector element at index #{i} is not finite: #{element.inspect}"
  end
end
|
|
241
|
+
|
|
242
|
+
# Assert the provided vector matches the store's configured dimension.
|
|
243
|
+
#
|
|
244
|
+
# @param vector [Array<Numeric>]
|
|
245
|
+
# @param index [Integer, nil] position in the batch, used in the error message
|
|
246
|
+
# @raise [Woods::Error] on dimension mismatch
|
|
247
|
+
def validate_dimensions!(vector, index: nil)
  actual = vector.length
  unless actual == @dimensions
    # Batch callers pass their position so the message pinpoints the bad entry.
    location = index ? " (entry #{index})" : ''
    raise Woods::Error,
          "Vector dimension mismatch#{location}: got #{actual}, expected #{@dimensions}"
  end
end
|
|
192
254
|
end
|
|
193
255
|
end
|
|
194
256
|
end
|
data/lib/woods/storage/qdrant.rb
CHANGED
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'ipaddr'
|
|
3
4
|
require 'net/http'
|
|
4
5
|
require 'json'
|
|
6
|
+
require 'socket'
|
|
5
7
|
require 'uri'
|
|
6
8
|
require_relative 'vector_store'
|
|
9
|
+
require_relative '../util/host_guard'
|
|
7
10
|
|
|
8
11
|
module Woods
|
|
9
12
|
module Storage
|
|
@@ -22,20 +25,167 @@ module Woods
|
|
|
22
25
|
class Qdrant # rubocop:disable Metrics/ClassLength
|
|
23
26
|
include Interface
|
|
24
27
|
|
|
28
|
+
# URL schemes allowed for the Qdrant endpoint. `file://`, `gopher://`,
|
|
29
|
+
# and anything else would let a misconfigured or attacker-controlled
|
|
30
|
+
# config value turn the adapter into an SSRF vector against the host
|
|
31
|
+
# running extraction.
|
|
32
|
+
ALLOWED_SCHEMES = %w[http https].freeze
|
|
33
|
+
|
|
34
|
+
# IP ranges that always resolve to loopback, link-local, private, or
|
|
35
|
+
# CGNAT space and should never be contacted as a vector store unless
|
|
36
|
+
# the operator explicitly opts in via `allow_private_hosts: true`.
|
|
37
|
+
#
|
|
38
|
+
# Covers:
|
|
39
|
+
# - IPv4 "this network" / wildcard (0.0.0.0/8)
|
|
40
|
+
# - IPv4 loopback, RFC1918 (10/8, 172.16/12, 192.168/16)
|
|
41
|
+
# - IPv4 link-local 169.254/16 (AWS / Azure / GCP IMDS)
|
|
42
|
+
# - IPv4 CGNAT 100.64/10 (common in managed clouds behind NAT)
|
|
43
|
+
# - IPv6 loopback (::1) and unspecified (::)
|
|
44
|
+
# - IPv6 ULA fc00::/7 (private IPv6 equivalent of RFC1918)
|
|
45
|
+
# - IPv6 link-local fe80::/10
|
|
46
|
+
#
|
|
47
|
+
# NOTE: IPv4-mapped IPv6 (`::ffff:169.254.169.254`) is handled
|
|
48
|
+
# separately in {.private_host?} by detecting the `::ffff:` prefix
|
|
49
|
+
# and extracting the embedded IPv4 portion before range comparison.
|
|
50
|
+
# A blanket `::ffff:0:0/96` range here would (on some Ruby versions,
|
|
51
|
+
# including 3.0) match every IPv4 address due to IPAddr's
|
|
52
|
+
# cross-family auto-mapping in `#include?`.
|
|
53
|
+
PRIVATE_IP_RANGES = [
|
|
54
|
+
'0.0.0.0/8',
|
|
55
|
+
'10.0.0.0/8',
|
|
56
|
+
'127.0.0.0/8',
|
|
57
|
+
'169.254.0.0/16',
|
|
58
|
+
'172.16.0.0/12',
|
|
59
|
+
'192.168.0.0/16',
|
|
60
|
+
'100.64.0.0/10',
|
|
61
|
+
'::/128',
|
|
62
|
+
'::1/128',
|
|
63
|
+
'fc00::/7',
|
|
64
|
+
'fe80::/10'
|
|
65
|
+
].map { |cidr| IPAddr.new(cidr) }.freeze
|
|
66
|
+
|
|
67
|
+
# Hostnames that always map to loopback regardless of DNS.
|
|
68
|
+
PRIVATE_HOSTNAMES = %w[localhost localhost. ip6-localhost ip6-loopback].freeze
|
|
69
|
+
|
|
25
70
|
# @param url [String] Qdrant server URL
|
|
26
71
|
# @param collection [String] Collection name
|
|
27
72
|
# @param api_key [String, nil] Optional API key for authentication
|
|
28
|
-
|
|
73
|
+
# @param dimensions [Integer, nil] Expected vector dimension. When set,
|
|
74
|
+
# {#store_batch}/{#store} pre-validate every vector's length before
|
|
75
|
+
# sending the HTTP request — Qdrant returns a 400 on mismatch, but
|
|
76
|
+
# detecting it client-side avoids wasted network round-trips and
|
|
77
|
+
# keeps error shape consistent with the pgvector adapter.
|
|
78
|
+
# @param allow_private_hosts [Boolean] Explicitly permit a URL whose
|
|
79
|
+
# host resolves inside loopback, link-local, or RFC1918 space. Off
|
|
80
|
+
# by default to block the common SSRF footgun. Set to true when the
|
|
81
|
+
# operator intentionally runs Qdrant on `localhost:6333` or inside
|
|
82
|
+
# a private network.
|
|
83
|
+
def initialize(url:, collection:, api_key: nil, dimensions: nil, allow_private_hosts: false)
  # Validation runs first, so a rejected URL raises before any state is set.
  @uri = self.class.validate_url!(url, allow_private_hosts: allow_private_hosts)

  # Plain configuration; none of these assignments depends on another.
  @dimensions = dimensions
  @api_key = api_key
  @collection = collection
  @url = url
end
|
|
90
|
+
|
|
91
|
+
# Validate a Qdrant endpoint URL — scheme in {ALLOWED_SCHEMES} and,
|
|
92
|
+
# unless opted out, host outside loopback / link-local / RFC1918.
|
|
93
|
+
# Public so callers can pre-check configuration before constructing.
|
|
94
|
+
def self.validate_url!(url, allow_private_hosts: false)
  # Parse once, run the three checks against the parsed form, and hand the
  # parsed URI back to the caller when all of them pass.
  URI(url).tap do |parsed|
    validate_scheme!(parsed)
    validate_host_present!(parsed, url)
    validate_host_visibility!(parsed.host.to_s, allow_private_hosts: allow_private_hosts)
  end
rescue URI::InvalidURIError => e
  raise ArgumentError, "Qdrant URL is not a valid URI: #{e.message}"
end
|
|
103
|
+
|
|
104
|
+
def self.validate_scheme!(uri)
  scheme = uri.scheme
  unless ALLOWED_SCHEMES.include?(scheme)
    raise ArgumentError,
          "Qdrant URL scheme must be one of #{ALLOWED_SCHEMES.join(', ')}; got #{scheme.inspect}"
  end
end
|
|
110
|
+
|
|
111
|
+
def self.validate_host_present!(uri, url)
  host = uri.host
  # The original string (not the parsed URI) is echoed so the operator sees
  # exactly what was configured.
  raise ArgumentError, "Qdrant URL must include a host: #{url.inspect}" if host.nil? || host.empty?
end
|
|
116
|
+
|
|
117
|
+
def self.validate_host_visibility!(host, allow_private_hosts:)
  unless allow_private_hosts
    # Both checks run on the form produced by the shared HostGuard helper;
    # the error messages quote the host exactly as it was passed in.
    normalized = Util::HostGuard.canonicalize(host)

    # Numeric-looking hosts that are not in standard notation are refused
    # outright rather than being interpreted.
    if Util::HostGuard.suspicious_numeric_host?(normalized)
      raise ArgumentError,
            "Qdrant URL uses a non-standard numeric host (#{host}). " \
            'Hex/octal/integer/short-form IPs are rejected because they ' \
            'can disguise loopback or private addresses. Pass the ' \
            'dotted-decimal form explicitly.'
    end

    if private_host?(normalized)
      raise ArgumentError,
            "Qdrant URL targets a private/loopback host (#{host}); " \
            'pass allow_private_hosts: true to permit. ' \
            'Note: validation is at config time; DNS resolution happens ' \
            'per request, so a public hostname that later resolves to a ' \
            'private IP is NOT caught here — deploy Qdrant on a trusted network.'
    end
  end
end
|
|
147
|
+
|
|
148
|
+
def self.private_host?(host)
  if PRIVATE_HOSTNAMES.include?(host)
    true
  else
    candidate = unmap_ipv4(IPAddr.new(host))

    # Compare only against ranges of the candidate's own address family, so
    # IPAddr's cross-family `include?` behaviour cannot produce a match
    # between an IPv4 address and an IPv6 range (or the reverse).
    same_family = PRIVATE_IP_RANGES.select { |range| range.family == candidate.family }
    same_family.any? { |range| range.include?(candidate) }
  end
rescue IPAddr::InvalidAddressError
  # Not an IP literal (i.e. an ordinary hostname): not classified as private.
  false
end
|
|
34
165
|
|
|
166
|
+
# IPv4-mapped IPv6 (`::ffff:169.254.169.254`): extract the
|
|
167
|
+
# embedded IPv4 (low 32 bits) before range comparison so the AWS
|
|
168
|
+
# IMDS address is caught by 169.254/16 even when disguised as
|
|
169
|
+
# IPv4-mapped IPv6. Returns the input unchanged for every other
|
|
170
|
+
# address.
|
|
171
|
+
def self.unmap_ipv4(ip)
  # Only IPv4-mapped IPv6 (`::ffff:a.b.c.d`) is rewritten; every other
  # address, IPv4 or IPv6, is returned untouched.
  return ip unless ip.ipv4_mapped?

  # Keep the low 32 bits as a native IPv4 address. There is deliberately no
  # "non-zero" guard: `::ffff:0.0.0.0` must become `0.0.0.0` so a same-family
  # comparison against `0.0.0.0/8` can match it. Leaving it as IPv6 would let
  # the wildcard address slip past an IPv4 range check.
  IPAddr.new(ip.to_i & 0xffff_ffff, Socket::AF_INET)
end
|
|
180
|
+
|
|
181
|
+
private_class_method :validate_scheme!, :validate_host_present!,
|
|
182
|
+
:validate_host_visibility!, :private_host?, :unmap_ipv4
|
|
183
|
+
|
|
35
184
|
# Create the collection if it doesn't exist.
|
|
36
185
|
#
|
|
37
186
|
# @param dimensions [Integer] Vector dimensionality
|
|
38
187
|
def ensure_collection!(dimensions:)
|
|
188
|
+
@dimensions ||= dimensions
|
|
39
189
|
body = {
|
|
40
190
|
vectors: {
|
|
41
191
|
size: dimensions,
|
|
@@ -52,6 +202,7 @@ module Woods
|
|
|
52
202
|
# @param metadata [Hash] Optional payload metadata
|
|
53
203
|
# @see Interface#store
|
|
54
204
|
def store(id, vector, metadata = {})
|
|
205
|
+
validate_dimensions!(vector) if @dimensions
|
|
55
206
|
body = {
|
|
56
207
|
points: [
|
|
57
208
|
{
|
|
@@ -72,9 +223,18 @@ module Woods
|
|
|
72
223
|
# the upstream chunk size.
|
|
73
224
|
#
|
|
74
225
|
# @param entries [Array<Hash>] Each entry has :id, :vector, :metadata keys
|
|
226
|
+
# @raise [Woods::Error] if any entry's vector doesn't match the configured
|
|
227
|
+
# dimension. Validation runs before the HTTP request so partial-batch
|
|
228
|
+
# state is impossible on dimension mismatch.
|
|
75
229
|
def store_batch(entries)
|
|
76
230
|
return if entries.empty?
|
|
77
231
|
|
|
232
|
+
if @dimensions
|
|
233
|
+
entries.each_with_index do |entry, idx|
|
|
234
|
+
validate_dimensions!(entry[:vector], index: idx)
|
|
235
|
+
end
|
|
236
|
+
end
|
|
237
|
+
|
|
78
238
|
body = {
|
|
79
239
|
points: entries.map do |entry|
|
|
80
240
|
{ id: entry[:id], vector: entry[:vector], payload: entry[:metadata] || {} }
|
|
@@ -130,13 +290,44 @@ module Woods
|
|
|
130
290
|
|
|
131
291
|
private
|
|
132
292
|
|
|
293
|
+
# Cap interpolated response bodies so misconfigured Qdrant responses
|
|
294
|
+
# (e.g. proxied HTML error pages) don't unbounded-leak into logs or
|
|
295
|
+
# re-raised error messages.
|
|
296
|
+
#
|
|
297
|
+
# @param body [String, nil]
|
|
298
|
+
# @return [String]
|
|
299
|
+
def truncate_response_body(body)
  text = body.nil? ? '' : body.to_s
  # Bodies of up to 500 characters are passed through whole.
  return text if text.length <= 500

  "#{text[0, 500]}... [truncated]"
end
|
|
305
|
+
|
|
306
|
+
# Ensure the provided vector matches the store's configured dimension.
|
|
307
|
+
#
|
|
308
|
+
# @param vector [Array<Numeric>]
|
|
309
|
+
# @param index [Integer, nil] position in the batch
|
|
310
|
+
# @raise [Woods::Error] on dimension mismatch
|
|
311
|
+
def validate_dimensions!(vector, index: nil)
  measurable = vector.respond_to?(:length)
  # For objects without #length the class is reported instead of a size.
  observed = measurable ? vector.length : vector.class
  return if measurable && observed == @dimensions

  location = index ? " (entry #{index})" : ''
  raise Woods::Error,
        "Vector dimension mismatch#{location}: got #{observed}, expected #{@dimensions}"
end
|
|
319
|
+
|
|
133
320
|
# Build a Qdrant filter from metadata key-value pairs.
|
|
134
321
|
#
|
|
135
322
|
# @param filters [Hash] Metadata filters
|
|
136
323
|
# @return [Hash] Qdrant-compatible filter with must conditions
|
|
137
324
|
def build_filter(filters)
  must = filters.map do |key, value|
    # Array values become an `any` (membership) match; everything else is
    # an exact `value` match.
    matcher = value.is_a?(Array) ? { any: value } : { value: value }
    { key: key.to_s, match: matcher }
  end

  { must: must }
end
|
|
@@ -153,7 +344,7 @@ module Woods
|
|
|
153
344
|
response = http_client.request(req)
|
|
154
345
|
|
|
155
346
|
unless response.is_a?(Net::HTTPSuccess)
|
|
156
|
-
raise Woods::Error, "Qdrant API error: #{response.code} #{response.body}"
|
|
347
|
+
raise Woods::Error, "Qdrant API error: #{response.code} #{truncate_response_body(response.body)}"
|
|
157
348
|
end
|
|
158
349
|
|
|
159
350
|
JSON.parse(response.body)
|
|
@@ -162,7 +353,7 @@ module Woods
|
|
|
162
353
|
@http_client = nil
|
|
163
354
|
response = http_client.request(req)
|
|
164
355
|
unless response.is_a?(Net::HTTPSuccess)
|
|
165
|
-
raise Woods::Error, "Qdrant API error: #{response.code} #{response.body}"
|
|
356
|
+
raise Woods::Error, "Qdrant API error: #{response.code} #{truncate_response_body(response.body)}"
|
|
166
357
|
end
|
|
167
358
|
|
|
168
359
|
JSON.parse(response.body)
|