woods 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +169 -0
  3. data/README.md +20 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +15 -50
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +6 -0
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +3 -4
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +737 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +1 -1
  102. data/lib/woods/unblocked/document_builder.rb +35 -10
  103. data/lib/woods/unblocked/exporter.rb +1 -1
  104. data/lib/woods/util/host_guard.rb +61 -0
  105. data/lib/woods/version.rb +1 -1
  106. data/lib/woods.rb +126 -6
  107. metadata +69 -4
@@ -1,8 +1,16 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'json'
4
+ require 'time'
4
5
 
5
6
  module Woods
7
+ # Same conditional-define pattern used elsewhere in the gem so this
8
+ # file can be required in isolation (e.g. by specs that bypass the
9
+ # full lib/woods.rb load) without tripping NameError on the friendly
10
+ # missing-sqlite3 raise below.
11
+ class Error < StandardError; end unless defined?(Woods::Error)
12
+ class ConfigurationError < Error; end unless defined?(Woods::ConfigurationError)
13
+
6
14
  module Storage
7
15
  # MetadataStore provides an interface for storing and querying unit metadata.
8
16
  #
@@ -85,6 +93,119 @@ module Woods
85
93
  end
86
94
  end
87
95
 
96
# Pure-Ruby metadata store backed by a Hash. No external dependencies and
# no persistence: all metadata lives in the current process and is lost
# when it exits. Suitable for hosts that don't bundle the `sqlite3` gem
# (e.g., MySQL- or Postgres-only Rails apps), and for short-lived
# processes that rebuild the index per run.
#
# Each record is kept with string keys plus an internal `updated_at`
# ISO-8601 timestamp. The timestamp is removed from the hashes returned by
# {#find}, {#find_batch}, {#find_by_type} and {#search}; it is exposed
# verbatim only through {#each_entry}.
#
# @example
#   store = InMemory.new
#   store.store("User", { type: "model", namespace: "Admin" })
#   store.find("User") # => { "type" => "model", "namespace" => "Admin" }
#
class InMemory
  include Interface

  # Bookkeeping key written by {#store}; hidden from read results.
  TIMESTAMP_KEY = 'updated_at'

  def initialize
    @data = {}
  end

  # @see Interface#store
  def store(id, metadata)
    @data[id] = stringify_keys(metadata).merge(TIMESTAMP_KEY => Time.now.iso8601)
  end

  # @see Interface#find
  def find(id)
    record = @data[id]
    return nil unless record

    record.except(TIMESTAMP_KEY)
  end

  # Ids with no stored record are omitted from the result.
  #
  # @see Interface#find_batch
  def find_batch(ids)
    ids.each_with_object({}) do |id, result|
      data = find(id)
      result[id] = data if data
    end
  end

  # Compares the stringified `type` value, so Symbol and String arguments
  # are interchangeable.
  #
  # @see Interface#find_by_type
  def find_by_type(type)
    target = type.to_s
    @data.each_with_object([]) do |(id, record), out|
      next unless record['type'].to_s == target

      out << record.except(TIMESTAMP_KEY).merge('id' => id)
    end
  end

  # Case-sensitive substring search.
  #
  # With +fields+, only the named fields are inspected. Without it, the
  # JSON serialisation of the caller-visible record is searched. The
  # internal timestamp is deliberately excluded from that serialisation:
  # it never appears in results, so letting it match would return records
  # for queries such as "updated_at" or a date fragment with no visible
  # reason.
  #
  # @see Interface#search
  def search(query, fields: nil)
    needle = query.to_s
    @data.each_with_object([]) do |(id, record), out|
      visible = record.except(TIMESTAMP_KEY)
      haystacks = fields ? fields.map { |f| record[f.to_s] } : [JSON.generate(visible)]
      next unless haystacks.compact.any? { |h| h.to_s.include?(needle) }

      out << visible.merge('id' => id)
    end
  end

  # @see Interface#delete
  def delete(id)
    @data.delete(id)
  end

  # @see Interface#count
  def count
    @data.size
  end

  # Iterate over every stored entry, yielding +(id, metadata)+ pairs.
  #
  # Persistence seam for {Snapshotter::Metadata}. Yields the raw internal
  # hash (including +updated_at+) so the Snapshotter can reconstruct state
  # faithfully on load.
  #
  # @yield [id, metadata] the stored id and its raw record Hash
  # @return [Enumerator] when no block given
  def each_entry(&block)
    return enum_for(:each_entry) unless block

    @data.each(&block)
  end

  # Hydrate the store from a pre-serialized dump.
  #
  # Dual of {#each_entry} — Snapshotter feeds the deserialized dump contents
  # through this method to restore the store in a new process. Entries are
  # stored as given: keys are not stringified and no timestamp is added.
  #
  # @param entries [Enumerable<Array(String, Hash)>] Pairs of +[id, metadata]+
  # @return [void]
  def bulk_load(entries)
    entries.each { |id, meta| @data[id] = meta }
  end

  # Drop every stored entry. Used by the MCP +reload+ tool to pick up a
  # fresh embed run without restarting the process. Safe on an empty store.
  def clear!
    @data = {}
  end

  private

  # Match the SQLite adapter's string-key contract regardless of how
  # the caller serialises the input hash. Without this, find/search
  # consumers that expect string keys (the SQLite path round-trips
  # through JSON, which always returns strings) would break under
  # symbol-keyed test fixtures. Only top-level keys are converted.
  def stringify_keys(hash)
    hash.each_with_object({}) { |(k, v), out| out[k.to_s] = v }
  end
end
208
+
88
209
  # SQLite-backed metadata store using the JSON1 extension.
89
210
  #
90
211
  # Stores unit metadata as JSON in a single table with type indexing
@@ -100,7 +221,14 @@ module Woods
100
221
 
101
222
  # @param db_path [String] Path to the SQLite database file, or ":memory:" for in-memory
102
223
  def initialize(db_path = ':memory:')
103
- require 'sqlite3'
224
+ begin
225
+ require 'sqlite3'
226
+ rescue LoadError
227
+ raise Woods::ConfigurationError,
228
+ 'metadata_store: :sqlite requires the sqlite3 gem in your Gemfile. ' \
229
+ "Add `gem 'sqlite3'` and re-bundle, or set " \
230
+ "`config.metadata_store = :in_memory` if you don't need cross-process persistence."
231
+ end
104
232
  @db = ::SQLite3::Database.new(db_path)
105
233
  @db.results_as_hash = true
106
234
  create_table
@@ -56,6 +56,7 @@ module Woods
56
56
  # @see Interface#store
57
57
  def store(id, vector, metadata = {})
58
58
  validate_vector!(vector)
59
+ validate_dimensions!(vector) if @dimensions
59
60
  entry = format_entry(id, vector, metadata)
60
61
 
61
62
  @connection.execute(<<~SQL)
@@ -71,14 +72,21 @@ module Woods
71
72
  # Store multiple vectors in a single multi-row INSERT.
72
73
  #
73
74
  # @param entries [Array<Hash>] Each entry has :id, :vector, :metadata keys
75
+ # @raise [ArgumentError] if any entry has a non-numeric or wrong-dimension vector.
76
+ # Validation runs BEFORE any INSERT so partial-batch writes can't occur.
74
77
  def store_batch(entries)
75
78
  return if entries.empty?
76
79
 
77
- values = entries.map do |entry|
78
- validate_vector!(entry[:vector])
79
- format_entry(entry[:id], entry[:vector], entry[:metadata] || {})
80
+ # Pre-validate every vector before any SQL — prevents partial-batch
81
+ # state when a later entry's dimension doesn't match.
82
+ entries.each_with_index do |entry, idx|
83
+ vector = entry[:vector]
84
+ validate_vector!(vector)
85
+ validate_dimensions!(vector, index: idx) if @dimensions
80
86
  end
81
87
 
88
+ values = entries.map { |entry| format_entry(entry[:id], entry[:vector], entry[:metadata] || {}) }
89
+
82
90
  @connection.execute(<<~SQL)
83
91
  INSERT INTO #{TABLE} (id, embedding, metadata, created_at)
84
92
  VALUES #{values.join(",\n")}
@@ -98,7 +106,7 @@ module Woods
98
106
  # @see Interface#search
99
107
  def search(query_vector, limit: 10, filters: {})
100
108
  validate_vector!(query_vector)
101
- vector_literal = "[#{query_vector.join(',')}]"
109
+ vector_literal = build_vector_literal(query_vector)
102
110
  where_clause = build_where(filters)
103
111
 
104
112
  sql = <<~SQL
@@ -142,10 +150,29 @@ module Woods
142
150
  def format_entry(id, vector, metadata)
143
151
  quoted_id = @connection.quote(id)
144
152
  quoted_metadata = @connection.quote(JSON.generate(metadata))
145
- vector_literal = "[#{vector.join(',')}]"
153
+ vector_literal = build_vector_literal(vector)
146
154
  "(#{quoted_id}, '#{vector_literal}', #{quoted_metadata}::jsonb, CURRENT_TIMESTAMP)"
147
155
  end
148
156
 
157
# Build the `[x,y,z]` pgvector literal from a numeric vector.
#
# Every element is coerced through `Float()` and must be finite after
# coercion. For a finite Float, `Float#to_s` emits only digits, `.`, `-`,
# `+` and `e` (e.g. `"1.0e+20"`), so the literal cannot carry anything
# produced by a custom `Numeric#to_s`. The finiteness check is done on the
# coerced value because coercion itself can produce a non-finite Float
# from a finite-looking input (for example an Integer too large for a
# Float becomes `Infinity`), which would otherwise be written into the SQL
# literal as the text `Infinity`.
#
# @param vector [Array<Numeric>] elements to serialise
# @return [String] pgvector literal such as `"[1.0,2.5]"`
# @raise [ArgumentError] if an element cannot be coerced to Float
#   (`Float()` raised RangeError, TypeError or ArgumentError) or coerces to
#   NaN / Infinity
def build_vector_literal(vector)
  coerced = vector.each_with_index.map do |element, i|
    value =
      begin
        Float(element)
      rescue RangeError, TypeError, ArgumentError => e
        raise ArgumentError,
              "Vector element at index #{i} cannot be coerced to Float: #{element.inspect} (#{e.class})"
      end

    unless value.finite?
      raise ArgumentError, "Vector element at index #{i} is not finite: #{element.inspect}"
    end

    value.to_s
  end
  "[#{coerced.join(',')}]"
end
175
+
149
176
  # Convert a database row to a SearchResult.
150
177
  #
151
178
  # @param row [Hash] Database row with id, distance, metadata
@@ -173,22 +200,57 @@ module Woods
173
200
  raise ArgumentError, "Invalid filter key: #{key_s.inspect}"
174
201
  end
175
202
 
176
- "metadata->>'#{key_s}' = #{@connection.quote(value.to_s)}"
203
+ # Belt-and-suspenders: regex above already rejects any value that
204
+ # could alter the SQL shape, but we still pass the key through
205
+ # `@connection.quote` so the quoting story is uniform across the
206
+ # key and value positions and a future regex-relaxation does not
207
+ # silently unlock injection.
208
+ if value.is_a?(Array)
209
+ # Membership filter. An empty Array would produce `IN ()`
210
+ # which is a syntax error; emit an always-false predicate
211
+ # so the query still parses and returns no rows.
212
+ next 'FALSE' if value.empty?
213
+
214
+ quoted = value.map { |v| @connection.quote(v.to_s) }.join(', ')
215
+ "metadata->>#{@connection.quote(key_s)} IN (#{quoted})"
216
+ else
217
+ "metadata->>#{@connection.quote(key_s)} = #{@connection.quote(value.to_s)}"
218
+ end
177
219
  end
178
220
  "WHERE #{conditions.join(' AND ')}"
179
221
  end
180
222
 
181
# Validate that every vector element is a finite number.
#
# The finiteness check uses `Numeric#finite?`, so it applies to every
# Numeric type rather than only Float. That keeps NaN / Infinity out of
# the vector-literal SQL construction regardless of which numeric class
# the caller used to represent them.
#
# @param vector [Array] The vector to validate
# @return [Array] the vector that was passed in
# @raise [ArgumentError] if any element is not numeric or is non-finite
def validate_vector!(vector)
  vector.each_with_index do |element, i|
    unless element.is_a?(Numeric)
      raise ArgumentError, "Vector element at index #{i} is not numeric: #{element.inspect}"
    end
    next if element.finite?

    raise ArgumentError, "Vector element at index #{i} is not finite: #{element.inspect}"
  end
end
241
+
242
# Check that a vector has exactly as many elements as the store's
# configured dimension count.
#
# @param vector [Array<Numeric>]
# @param index [Integer, nil] position in the batch; included in the error
#   message when given
# @raise [Woods::Error] when the lengths differ
def validate_dimensions!(vector, index: nil)
  actual = vector.length
  return if actual == @dimensions

  location = index ? " (entry #{index})" : ''
  raise Woods::Error, "Vector dimension mismatch#{location}: got #{actual}, expected #{@dimensions}"
end
192
254
  end
193
255
  end
194
256
  end
@@ -1,9 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'ipaddr'
3
4
  require 'net/http'
4
5
  require 'json'
6
+ require 'socket'
5
7
  require 'uri'
6
8
  require_relative 'vector_store'
9
+ require_relative '../util/host_guard'
7
10
 
8
11
  module Woods
9
12
  module Storage
@@ -22,20 +25,167 @@ module Woods
22
25
  class Qdrant # rubocop:disable Metrics/ClassLength
23
26
  include Interface
24
27
 
28
+ # URL schemes allowed for the Qdrant endpoint. `file://`, `gopher://`,
29
+ # and anything else would let a misconfigured or attacker-controlled
30
+ # config value turn the adapter into an SSRF vector against the host
31
+ # running extraction.
32
+ ALLOWED_SCHEMES = %w[http https].freeze
33
+
34
+ # IP ranges that always resolve to loopback, link-local, private, or
35
+ # CGNAT space and should never be contacted as a vector store unless
36
+ # the operator explicitly opts in via `allow_private_hosts: true`.
37
+ #
38
+ # Covers:
39
+ # - IPv4 "this network" / wildcard (0.0.0.0/8)
40
+ # - IPv4 loopback, RFC1918 (10/8, 172.16/12, 192.168/16)
41
+ # - IPv4 link-local 169.254/16 (AWS / Azure / GCP IMDS)
42
+ # - IPv4 CGNAT 100.64/10 (common in managed clouds behind NAT)
43
+ # - IPv6 loopback (::1) and unspecified (::)
44
+ # - IPv6 ULA fc00::/7 (private IPv6 equivalent of RFC1918)
45
+ # - IPv6 link-local fe80::/10
46
+ #
47
+ # NOTE: IPv4-mapped IPv6 (`::ffff:169.254.169.254`) is handled
48
+ # separately in {.private_host?} by detecting the `::ffff:` prefix
49
+ # and extracting the embedded IPv4 portion before range comparison.
50
+ # A blanket `::ffff:0:0/96` range here would (on some Ruby versions,
51
+ # including 3.0) match every IPv4 address due to IPAddr's
52
+ # cross-family auto-mapping in `#include?`.
53
+ PRIVATE_IP_RANGES = [
54
+ '0.0.0.0/8',
55
+ '10.0.0.0/8',
56
+ '127.0.0.0/8',
57
+ '169.254.0.0/16',
58
+ '172.16.0.0/12',
59
+ '192.168.0.0/16',
60
+ '100.64.0.0/10',
61
+ '::/128',
62
+ '::1/128',
63
+ 'fc00::/7',
64
+ 'fe80::/10'
65
+ ].map { |cidr| IPAddr.new(cidr) }.freeze
66
+
67
+ # Hostnames that always map to loopback regardless of DNS.
68
+ PRIVATE_HOSTNAMES = %w[localhost localhost. ip6-localhost ip6-loopback].freeze
69
+
25
70
  # @param url [String] Qdrant server URL
26
71
  # @param collection [String] Collection name
27
72
  # @param api_key [String, nil] Optional API key for authentication
28
- def initialize(url:, collection:, api_key: nil)
73
+ # @param dimensions [Integer, nil] Expected vector dimension. When set,
74
+ # {#store_batch}/{#store} pre-validate every vector's length before
75
+ # sending the HTTP request — Qdrant returns a 400 on mismatch, but
76
+ # detecting it client-side avoids wasted network round-trips and
77
+ # keeps error shape consistent with the pgvector adapter.
78
+ # @param allow_private_hosts [Boolean] Explicitly permit a URL whose
79
+ # host resolves inside loopback, link-local, or RFC1918 space. Off
80
+ # by default to block the common SSRF footgun. Set to true when the
81
+ # operator intentionally runs Qdrant on `localhost:6333` or inside
82
+ # a private network.
83
def initialize(url:, collection:, api_key: nil, dimensions: nil, allow_private_hosts: false)
  # Validation comes first: a rejected URL raises before any state is
  # assigned, and the parsed URI it returns is what gets stored.
  @uri = self.class.validate_url!(url, allow_private_hosts: allow_private_hosts)

  # The original URL string is kept alongside the parsed form.
  @url = url
  @api_key = api_key
  @collection = collection

  # nil leaves client-side dimension checks switched off.
  @dimensions = dimensions
end
90
+
91
# Parse and vet a Qdrant endpoint URL. Checks run in order: scheme must be
# in {ALLOWED_SCHEMES}, a host must be present, and — unless
# +allow_private_hosts+ is true — the host must pass the visibility check.
# Public so callers can pre-check configuration before constructing.
#
# @param url [String] candidate endpoint URL
# @param allow_private_hosts [Boolean] skip the private-host rejection
# @return [URI] the parsed URL
# @raise [ArgumentError] when the URL is unparseable or fails a check
def self.validate_url!(url, allow_private_hosts: false)
  URI(url).tap do |parsed|
    validate_scheme!(parsed)
    validate_host_present!(parsed, url)
    validate_host_visibility!(parsed.host.to_s, allow_private_hosts: allow_private_hosts)
  end
rescue URI::InvalidURIError => e
  raise ArgumentError, "Qdrant URL is not a valid URI: #{e.message}"
end
103
+
104
# Reject any URI whose scheme is not listed in ALLOWED_SCHEMES.
#
# @param uri [URI] parsed endpoint
# @raise [ArgumentError] naming the permitted schemes and the one received
def self.validate_scheme!(uri)
  scheme = uri.scheme
  unless ALLOWED_SCHEMES.include?(scheme)
    raise ArgumentError,
          "Qdrant URL scheme must be one of #{ALLOWED_SCHEMES.join(', ')}; got #{scheme.inspect}"
  end
end
110
+
111
# Require the parsed URI to carry a non-empty host.
#
# @param uri [URI] parsed endpoint
# @param url [String] original input, echoed in the error message
# @raise [ArgumentError] when the host is nil or empty
def self.validate_host_present!(uri, url)
  host = uri.host
  return if host && !host.empty?

  raise ArgumentError, "Qdrant URL must include a host: #{url.inspect}"
end
116
+
117
# Refuse hosts that are private/loopback or written in a non-standard
# numeric form, unless the caller opted in with +allow_private_hosts+.
#
# @param host [String] host component of the endpoint URL
# @param allow_private_hosts [Boolean] when true, every check is skipped
# @raise [ArgumentError] for a suspicious numeric host or a private host
def self.validate_host_visibility!(host, allow_private_hosts:)
  return if allow_private_hosts

  # The shared helper does the canonicalisation, so this adapter and the
  # other HostGuard callers agree on what a host looks like.
  canonical = Util::HostGuard.canonicalize(host)

  # Numeric-looking hosts that are not in standard form (hex, octal, bare
  # integer, short-form, mixed radix) are refused outright rather than
  # normalised: the IPAddr-based range check below cannot parse them and
  # would therefore let them through.
  if Util::HostGuard.suspicious_numeric_host?(canonical)
    raise ArgumentError,
          "Qdrant URL uses a non-standard numeric host (#{host}). " \
          'Hex/octal/integer/short-form IPs are rejected because they can disguise ' \
          'loopback or private addresses. ' \
          'Pass the dotted-decimal form explicitly.'
  end

  if private_host?(canonical)
    raise ArgumentError,
          "Qdrant URL targets a private/loopback host (#{host}); pass allow_private_hosts: true to permit. " \
          'Note: validation is at config time; DNS resolution happens per request, ' \
          'so a public hostname that later resolves to a private IP is NOT caught here ' \
          '— deploy Qdrant on a trusted network.'
  end
end
147
+
148
# Decide whether a host names a loopback/private destination.
#
# Hostnames are compared case-insensitively, and any name under the
# `.localhost` domain is treated as loopback (RFC 6761 reserves that whole
# domain for loopback). The downcase is applied here regardless of what
# the caller's canonicalisation already did, so the check does not depend
# on it.
#
# IP literals are matched against PRIVATE_IP_RANGES after IPv4-mapped IPv6
# addresses have been unwrapped. A host that is neither a known loopback
# name nor a parseable IP literal is reported as not private; it is not
# resolved through DNS.
#
# @param host [String] canonicalised host
# @return [Boolean]
def self.private_host?(host)
  name = host.to_s.downcase
  return true if PRIVATE_HOSTNAMES.include?(name)
  return true if name.end_with?('.localhost', '.localhost.')

  ip = unmap_ipv4(IPAddr.new(name))

  # Restrict range-check to the SAME address family so IPAddr's
  # cross-family `include?` can't silently match all IPv4 addresses into
  # an IPv6 range (or vice versa) — a quirk observed on Ruby 3.0's IPAddr.
  PRIVATE_IP_RANGES.any? do |range|
    range.family == ip.family && range.include?(ip)
  end
rescue IPAddr::InvalidAddressError
  false
end
34
165
 
166
# IPv4-mapped IPv6 (`::ffff:169.254.169.254`): extract the embedded IPv4
# (low 32 bits) before range comparison so the AWS IMDS address is caught
# by 169.254/16 even when disguised as IPv4-mapped IPv6.
#
# Every mapped address is unwrapped, including `::ffff:0.0.0.0`, whose
# embedded value is zero: left as IPv6 it would match none of the IPv6
# ranges and so bypass the `0.0.0.0/8` entry.
#
# @param ip [IPAddr]
# @return [IPAddr] the embedded IPv4 address for an IPv4-mapped input; the
#   input itself for every other address
def self.unmap_ipv4(ip)
  return ip unless ip.ipv4_mapped?

  IPAddr.new(ip.to_i & 0xffff_ffff, Socket::AF_INET)
end
180
+
181
+ private_class_method :validate_scheme!, :validate_host_present!,
182
+ :validate_host_visibility!, :private_host?, :unmap_ipv4
183
+
35
184
  # Create the collection if it doesn't exist.
36
185
  #
37
186
  # @param dimensions [Integer] Vector dimensionality
38
187
  def ensure_collection!(dimensions:)
188
+ @dimensions ||= dimensions
39
189
  body = {
40
190
  vectors: {
41
191
  size: dimensions,
@@ -52,6 +202,7 @@ module Woods
52
202
  # @param metadata [Hash] Optional payload metadata
53
203
  # @see Interface#store
54
204
  def store(id, vector, metadata = {})
205
+ validate_dimensions!(vector) if @dimensions
55
206
  body = {
56
207
  points: [
57
208
  {
@@ -72,9 +223,18 @@ module Woods
72
223
  # the upstream chunk size.
73
224
  #
74
225
  # @param entries [Array<Hash>] Each entry has :id, :vector, :metadata keys
226
+ # @raise [Woods::Error] if any entry's vector doesn't match the configured
227
+ # dimension. Validation runs before the HTTP request so partial-batch
228
+ # state is impossible on dimension mismatch.
75
229
  def store_batch(entries)
76
230
  return if entries.empty?
77
231
 
232
+ if @dimensions
233
+ entries.each_with_index do |entry, idx|
234
+ validate_dimensions!(entry[:vector], index: idx)
235
+ end
236
+ end
237
+
78
238
  body = {
79
239
  points: entries.map do |entry|
80
240
  { id: entry[:id], vector: entry[:vector], payload: entry[:metadata] || {} }
@@ -130,13 +290,44 @@ module Woods
130
290
 
131
291
  private
132
292
 
293
# Bound the size of a response body before it is interpolated into an
# error message, so an oversized reply (e.g. a proxied HTML error page)
# cannot flood logs or re-raised errors. Bodies longer than 500
# characters are cut there and marked as truncated.
#
# @param body [String, nil]
# @return [String] the body, its first 500 characters plus a marker, or
#   an empty string for nil
def truncate_response_body(body)
  text = body.to_s
  return text if text.length <= 500

  "#{text[0, 500]}... [truncated]"
end
305
+
306
# Check a vector against the store's configured dimension count. An
# object that does not respond to +length+ is treated as a mismatch and
# its class is reported in place of a length.
#
# @param vector [Array<Numeric>]
# @param index [Integer, nil] position in the batch
# @raise [Woods::Error] on dimension mismatch
def validate_dimensions!(vector, index: nil)
  measurable = vector.respond_to?(:length)
  return if measurable && vector.length == @dimensions

  location = index ? " (entry #{index})" : ''
  observed = measurable ? vector.length : vector.class
  raise Woods::Error, "Vector dimension mismatch#{location}: got #{observed}, expected #{@dimensions}"
end
319
+
133
320
  # Build a Qdrant filter from metadata key-value pairs.
134
321
  #
135
322
  # @param filters [Hash] Metadata filters
136
323
  # @return [Hash] Qdrant-compatible filter with must conditions
137
324
  def build_filter(filters)
138
325
  conditions = filters.map do |key, value|
139
- { key: key.to_s, match: { value: value } }
326
+ if value.is_a?(Array)
327
+ { key: key.to_s, match: { any: value } }
328
+ else
329
+ { key: key.to_s, match: { value: value } }
330
+ end
140
331
  end
141
332
  { must: conditions }
142
333
  end
@@ -153,7 +344,7 @@ module Woods
153
344
  response = http_client.request(req)
154
345
 
155
346
  unless response.is_a?(Net::HTTPSuccess)
156
- raise Woods::Error, "Qdrant API error: #{response.code} #{response.body}"
347
+ raise Woods::Error, "Qdrant API error: #{response.code} #{truncate_response_body(response.body)}"
157
348
  end
158
349
 
159
350
  JSON.parse(response.body)
@@ -162,7 +353,7 @@ module Woods
162
353
  @http_client = nil
163
354
  response = http_client.request(req)
164
355
  unless response.is_a?(Net::HTTPSuccess)
165
- raise Woods::Error, "Qdrant API error: #{response.code} #{response.body}"
356
+ raise Woods::Error, "Qdrant API error: #{response.code} #{truncate_response_body(response.body)}"
166
357
  end
167
358
 
168
359
  JSON.parse(response.body)