zvec-ruby 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -3
- data/Rakefile +3 -0
- data/lib/zvec/active_record.rb +51 -4
- data/lib/zvec/collection.rb +344 -33
- data/lib/zvec/data_types.rb +250 -0
- data/lib/zvec/doc.rb +119 -10
- data/lib/zvec/query.rb +110 -1
- data/lib/zvec/ruby_llm.rb +79 -7
- data/lib/zvec/schema.rb +145 -1
- data/lib/zvec/version.rb +1 -1
- data/lib/zvec.rb +13 -0
- data/test/test_edge_cases.rb +380 -0
- data/test/test_helper.rb +102 -3
- data/test/test_type_detection.rb +258 -0
- data/test/test_validation.rb +305 -0
- data/test/test_version.rb +1 -1
- metadata +4 -1
data/lib/zvec/data_types.rb
CHANGED
|
@@ -1,37 +1,143 @@
|
|
|
1
1
|
module Zvec
|
|
2
|
+
# Data type constants, coercion utilities, and dispatch tables for mapping
|
|
3
|
+
# between Ruby types and the underlying C++ zvec engine types.
|
|
4
|
+
#
|
|
5
|
+
# == Scalar Types
|
|
6
|
+
#
|
|
7
|
+
# * {BINARY} -- Raw binary data
|
|
8
|
+
# * {STRING} -- UTF-8 string
|
|
9
|
+
# * {BOOL} -- Boolean (true/false)
|
|
10
|
+
# * {INT32} -- 32-bit signed integer
|
|
11
|
+
# * {INT64} -- 64-bit signed integer
|
|
12
|
+
# * {UINT32} -- 32-bit unsigned integer
|
|
13
|
+
# * {UINT64} -- 64-bit unsigned integer
|
|
14
|
+
# * {FLOAT} -- 32-bit IEEE 754 float
|
|
15
|
+
# * {DOUBLE} -- 64-bit IEEE 754 double
|
|
16
|
+
#
|
|
17
|
+
# == Dense Vector Types
|
|
18
|
+
#
|
|
19
|
+
# Dense vectors store a fixed-length array of numeric values. Choose the
|
|
20
|
+
# precision that balances accuracy vs. memory:
|
|
21
|
+
#
|
|
22
|
+
# * {VECTOR_FP32} -- 32-bit float vector (default, best accuracy)
|
|
23
|
+
# * {VECTOR_FP64} -- 64-bit double vector (highest accuracy, 2x memory)
|
|
24
|
+
# * {VECTOR_FP16} -- 16-bit half-precision vector (half the memory of FP32)
|
|
25
|
+
# * {VECTOR_INT8} -- 8-bit integer vector (smallest, for quantized models)
|
|
26
|
+
#
|
|
27
|
+
# == Sparse Vector Types
|
|
28
|
+
#
|
|
29
|
+
# Sparse vectors store only non-zero elements, ideal for high-dimensional
|
|
30
|
+
# data where most values are zero (e.g., BM25 or TF-IDF features):
|
|
31
|
+
#
|
|
32
|
+
# * {SPARSE_VECTOR_FP32} -- Sparse vector with 32-bit float values
|
|
33
|
+
# * {SPARSE_VECTOR_FP16} -- Sparse vector with 16-bit float values
|
|
34
|
+
#
|
|
35
|
+
# == Binary Vectors
|
|
36
|
+
#
|
|
37
|
+
# Binary vectors use the {BINARY} type and store bit-packed data, useful for
|
|
38
|
+
# binary hash codes or Hamming distance searches.
|
|
39
|
+
#
|
|
40
|
+
# == Array Types
|
|
41
|
+
#
|
|
42
|
+
# * {ARRAY_STRING} -- Array of strings (e.g., tags)
|
|
43
|
+
# * {ARRAY_INT32} -- Array of 32-bit integers
|
|
44
|
+
# * {ARRAY_INT64} -- Array of 64-bit integers
|
|
45
|
+
# * {ARRAY_FLOAT} -- Array of 32-bit floats
|
|
46
|
+
# * {ARRAY_DOUBLE} -- Array of 64-bit doubles
|
|
47
|
+
# * {ARRAY_BOOL} -- Array of booleans
|
|
48
|
+
#
|
|
49
|
+
# == Quantization Types
|
|
50
|
+
#
|
|
51
|
+
# Quantization reduces memory usage and speeds up search at the cost of some
|
|
52
|
+
# accuracy. Specify a quantization type when creating an index:
|
|
53
|
+
#
|
|
54
|
+
# Ext::HnswIndexParams.new(metric, quantize_type: Ext::QuantizeType::INT8)
|
|
55
|
+
#
|
|
56
|
+
# Available quantization types (via +Ext::QuantizeType+):
|
|
57
|
+
#
|
|
58
|
+
# * +FP16+ -- Half-precision (16-bit) quantization. Good balance of speed
|
|
59
|
+
# and accuracy. Halves memory vs. FP32.
|
|
60
|
+
# * +INT8+ -- 8-bit integer quantization. ~4x memory reduction vs. FP32.
|
|
61
|
+
# Slight accuracy loss.
|
|
62
|
+
# * +INT4+ -- 4-bit integer quantization. ~8x memory reduction vs. FP32.
|
|
63
|
+
# Larger accuracy loss, best for large-scale approximate search.
|
|
64
|
+
#
|
|
65
|
+
# == Metric Types
|
|
66
|
+
#
|
|
67
|
+
# * {L2} -- Euclidean (L2) distance. Lower is more similar.
|
|
68
|
+
# * {IP} -- Inner product. Higher is more similar.
|
|
69
|
+
# * {COSINE} -- Cosine similarity. Higher is more similar. Vectors are
|
|
70
|
+
# normalized internally.
|
|
71
|
+
#
|
|
2
72
|
module DataTypes
|
|
3
73
|
# Re-export C++ enum values as Ruby-friendly constants
|
|
74
|
+
|
|
75
|
+
# @return [Symbol] Raw binary data type
|
|
4
76
|
BINARY = Ext::DataType::BINARY
|
|
77
|
+
# @return [Symbol] UTF-8 string data type
|
|
5
78
|
STRING = Ext::DataType::STRING
|
|
79
|
+
# @return [Symbol] Boolean data type
|
|
6
80
|
BOOL = Ext::DataType::BOOL
|
|
81
|
+
# @return [Symbol] 32-bit signed integer data type
|
|
7
82
|
INT32 = Ext::DataType::INT32
|
|
83
|
+
# @return [Symbol] 64-bit signed integer data type
|
|
8
84
|
INT64 = Ext::DataType::INT64
|
|
85
|
+
# @return [Symbol] 32-bit unsigned integer data type
|
|
9
86
|
UINT32 = Ext::DataType::UINT32
|
|
87
|
+
# @return [Symbol] 64-bit unsigned integer data type
|
|
10
88
|
UINT64 = Ext::DataType::UINT64
|
|
89
|
+
# @return [Symbol] 32-bit float data type
|
|
11
90
|
FLOAT = Ext::DataType::FLOAT
|
|
91
|
+
# @return [Symbol] 64-bit double data type
|
|
12
92
|
DOUBLE = Ext::DataType::DOUBLE
|
|
13
93
|
|
|
94
|
+
# @return [Symbol] 32-bit float dense vector
|
|
14
95
|
VECTOR_FP32 = Ext::DataType::VECTOR_FP32
|
|
96
|
+
# @return [Symbol] 64-bit double dense vector
|
|
15
97
|
VECTOR_FP64 = Ext::DataType::VECTOR_FP64
|
|
98
|
+
# @return [Symbol] 16-bit half-precision dense vector
|
|
16
99
|
VECTOR_FP16 = Ext::DataType::VECTOR_FP16
|
|
100
|
+
# @return [Symbol] 8-bit integer dense vector (quantized)
|
|
17
101
|
VECTOR_INT8 = Ext::DataType::VECTOR_INT8
|
|
18
102
|
|
|
103
|
+
# @return [Symbol] 32-bit float sparse vector
|
|
19
104
|
SPARSE_VECTOR_FP32 = Ext::DataType::SPARSE_VECTOR_FP32
|
|
105
|
+
# @return [Symbol] 16-bit float sparse vector
|
|
20
106
|
SPARSE_VECTOR_FP16 = Ext::DataType::SPARSE_VECTOR_FP16
|
|
21
107
|
|
|
108
|
+
# @return [Symbol] Array of strings
|
|
22
109
|
ARRAY_STRING = Ext::DataType::ARRAY_STRING
|
|
110
|
+
# @return [Symbol] Array of 32-bit integers
|
|
23
111
|
ARRAY_INT32 = Ext::DataType::ARRAY_INT32
|
|
112
|
+
# @return [Symbol] Array of 64-bit integers
|
|
24
113
|
ARRAY_INT64 = Ext::DataType::ARRAY_INT64
|
|
114
|
+
# @return [Symbol] Array of 32-bit floats
|
|
25
115
|
ARRAY_FLOAT = Ext::DataType::ARRAY_FLOAT
|
|
116
|
+
# @return [Symbol] Array of 64-bit doubles
|
|
26
117
|
ARRAY_DOUBLE = Ext::DataType::ARRAY_DOUBLE
|
|
118
|
+
# @return [Symbol] Array of booleans
|
|
27
119
|
ARRAY_BOOL = Ext::DataType::ARRAY_BOOL
|
|
28
120
|
|
|
29
121
|
# Metric types
|
|
122
|
+
|
|
123
|
+
# @return [Symbol] Euclidean (L2) distance metric
|
|
30
124
|
L2 = Ext::MetricType::L2
|
|
125
|
+
# @return [Symbol] Inner product metric
|
|
31
126
|
IP = Ext::MetricType::IP
|
|
127
|
+
# @return [Symbol] Cosine similarity metric
|
|
32
128
|
COSINE = Ext::MetricType::COSINE
|
|
33
129
|
|
|
130
|
+
# Vector data types for dimension validation
|
|
131
|
+
# @return [Array<Symbol>] All dense vector data type constants
|
|
132
|
+
VECTOR_TYPES = [
|
|
133
|
+
Ext::DataType::VECTOR_FP32,
|
|
134
|
+
Ext::DataType::VECTOR_FP64,
|
|
135
|
+
Ext::DataType::VECTOR_FP16,
|
|
136
|
+
Ext::DataType::VECTOR_INT8,
|
|
137
|
+
].freeze
|
|
138
|
+
|
|
34
139
|
# Setter dispatch table: DataType -> Doc setter method name
|
|
140
|
+
# @return [Hash{Symbol => Symbol}]
|
|
35
141
|
SETTER_FOR = {
|
|
36
142
|
Ext::DataType::STRING => :set_string,
|
|
37
143
|
Ext::DataType::BOOL => :set_bool,
|
|
@@ -46,6 +152,8 @@ module Zvec
|
|
|
46
152
|
Ext::DataType::ARRAY_STRING => :set_string_array,
|
|
47
153
|
}.freeze
|
|
48
154
|
|
|
155
|
+
# Getter dispatch table: DataType -> Doc getter method name
|
|
156
|
+
# @return [Hash{Symbol => Symbol}]
|
|
49
157
|
GETTER_FOR = {
|
|
50
158
|
Ext::DataType::STRING => :get_string,
|
|
51
159
|
Ext::DataType::BOOL => :get_bool,
|
|
@@ -59,5 +167,147 @@ module Zvec
|
|
|
59
167
|
Ext::DataType::VECTOR_FP64 => :get_double_vector,
|
|
60
168
|
Ext::DataType::ARRAY_STRING => :get_string_array,
|
|
61
169
|
}.freeze
|
|
170
|
+
|
|
171
|
+
# Detect the zvec data type for a Ruby value.
|
|
172
|
+
#
|
|
173
|
+
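# Handles edge cases: Integer vs Float, booleans, nil, and arrays (typed by their first non-nil element; empty or all-nil arrays default to VECTOR_FP32). Strings are always STRING, even "true"/"false".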
|
|
174
|
+
#
|
|
175
|
+
# @param value [Object] the Ruby value to inspect
|
|
176
|
+
# @return [Symbol, nil] the zvec data type constant, or nil for nil input and for Ruby types with no mapping
|
|
177
|
+
#
|
|
178
|
+
# @example
|
|
179
|
+
# DataTypes.detect_type("hello") #=> Ext::DataType::STRING
|
|
180
|
+
# DataTypes.detect_type(42) #=> Ext::DataType::INT64
|
|
181
|
+
# DataTypes.detect_type([1.0]) #=> Ext::DataType::VECTOR_FP32
|
|
182
|
+
# DataTypes.detect_type(nil) #=> nil
|
|
183
|
+
def self.detect_type(value)
|
|
184
|
+
case value
|
|
185
|
+
when NilClass then nil
|
|
186
|
+
when String then Ext::DataType::STRING
|
|
187
|
+
when Integer then Ext::DataType::INT64
|
|
188
|
+
when Float then Ext::DataType::DOUBLE
|
|
189
|
+
when TrueClass, FalseClass then Ext::DataType::BOOL
|
|
190
|
+
when Array then detect_array_type(value)
|
|
191
|
+
end
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
# Coerce a Ruby value into a form suitable for the given zvec data type.
|
|
195
|
+
#
|
|
196
|
+
# @param value [Object] the value to coerce
|
|
197
|
+
# @param target_type [Symbol] the target zvec data type constant
|
|
198
|
+
# @param field_name [String, nil] optional field name for error messages
|
|
199
|
+
# @return [Object] the coerced value
|
|
200
|
+
# @raise [ArgumentError] if the value cannot be coerced to the target type
|
|
201
|
+
#
|
|
202
|
+
# @example
|
|
203
|
+
# DataTypes.coerce_value(42, Ext::DataType::STRING) #=> "42"
|
|
204
|
+
# DataTypes.coerce_value("3.14", Ext::DataType::DOUBLE) #=> 3.14
|
|
205
|
+
# DataTypes.coerce_value([1, 2], Ext::DataType::VECTOR_FP32) #=> [1.0, 2.0]
|
|
206
|
+
def self.coerce_value(value, target_type, field_name: nil)
|
|
207
|
+
return value if value.nil?
|
|
208
|
+
|
|
209
|
+
ctx = field_name ? " for field '#{field_name}'" : ""
|
|
210
|
+
|
|
211
|
+
case target_type
|
|
212
|
+
when Ext::DataType::STRING
|
|
213
|
+
value.to_s
|
|
214
|
+
when Ext::DataType::BOOL
|
|
215
|
+
coerce_bool(value, ctx)
|
|
216
|
+
when Ext::DataType::INT32, Ext::DataType::INT64,
|
|
217
|
+
Ext::DataType::UINT32, Ext::DataType::UINT64
|
|
218
|
+
coerce_integer(value, ctx)
|
|
219
|
+
when Ext::DataType::FLOAT, Ext::DataType::DOUBLE
|
|
220
|
+
coerce_float(value, ctx)
|
|
221
|
+
when Ext::DataType::VECTOR_FP32, Ext::DataType::VECTOR_FP64
|
|
222
|
+
coerce_float_vector(value, ctx)
|
|
223
|
+
when Ext::DataType::ARRAY_STRING
|
|
224
|
+
coerce_string_array(value, ctx)
|
|
225
|
+
else
|
|
226
|
+
value
|
|
227
|
+
end
|
|
228
|
+
end
|
|
229
|
+
|
|
230
|
+
class << self
|
|
231
|
+
private
|
|
232
|
+
|
|
233
|
+
# @param arr [Array] the array to detect the element type for
|
|
234
|
+
# @return [Symbol] the detected zvec data type
|
|
235
|
+
def detect_array_type(arr)
|
|
236
|
+
return Ext::DataType::VECTOR_FP32 if arr.empty?
|
|
237
|
+
|
|
238
|
+
first_non_nil = arr.find { |v| !v.nil? }
|
|
239
|
+
return Ext::DataType::VECTOR_FP32 if first_non_nil.nil?
|
|
240
|
+
|
|
241
|
+
case first_non_nil
|
|
242
|
+
when Float then Ext::DataType::VECTOR_FP32
|
|
243
|
+
when Integer then Ext::DataType::VECTOR_FP32
|
|
244
|
+
when String then Ext::DataType::ARRAY_STRING
|
|
245
|
+
when TrueClass, FalseClass then Ext::DataType::ARRAY_BOOL
|
|
246
|
+
else Ext::DataType::VECTOR_FP32
|
|
247
|
+
end
|
|
248
|
+
end
|
|
249
|
+
|
|
250
|
+
def coerce_bool(value, ctx)
|
|
251
|
+
case value
|
|
252
|
+
when TrueClass, FalseClass then value
|
|
253
|
+
when "true", "1" then true
|
|
254
|
+
when "false", "0" then false
|
|
255
|
+
when Integer then !value.zero?
|
|
256
|
+
else
|
|
257
|
+
raise ArgumentError,
|
|
258
|
+
"Cannot coerce #{value.class} (#{value.inspect}) to Bool#{ctx}"
|
|
259
|
+
end
|
|
260
|
+
end
|
|
261
|
+
|
|
262
|
+
# Coerce +value+ to an Integer for the integer-typed zvec fields.
#
# * Integer -- returned unchanged.
# * Float   -- truncated toward zero via Float#to_i (3.9 becomes 3).
#              NaN and +/-Infinity have no integer form and are rejected.
# * String  -- parsed strictly with Kernel#Integer, so "" or "12abc" is
#              rejected. Note Kernel#Integer honours radix prefixes:
#              "0x1A" parses as 26 and "010" parses as 8 (octal).
#
# @param value [Object] the value to coerce
# @param ctx [String] suffix appended to the error message (may be empty)
# @return [Integer]
# @raise [ArgumentError] if the value cannot be represented as an Integer
def coerce_integer(value, ctx)
  message = "Cannot coerce #{value.class} (#{value.inspect}) to Integer#{ctx}"

  case value
  when Integer then value
  when Float   then value.to_i
  when String  then Integer(value)
  else raise ArgumentError, message
  end
rescue ::ArgumentError, ::FloatDomainError
  # Float#to_i raises FloatDomainError (a RangeError, not an ArgumentError)
  # for NaN/Infinity; fold it into the same error callers already expect.
  raise ArgumentError, message
end
|
|
276
|
+
|
|
277
|
+
# Coerce +value+ to a Float for the FLOAT / DOUBLE zvec fields.
#
# * Numeric -- converted with #to_f.
# * String  -- parsed strictly with Kernel#Float, so "" or "1.5x" is
#              rejected instead of silently becoming 0.0.
#
# @param value [Object] the value to coerce
# @param ctx [String] suffix appended to the error message (may be empty)
# @return [Float]
# @raise [ArgumentError] if the value cannot be represented as a Float
def coerce_float(value, ctx)
  message = "Cannot coerce #{value.class} (#{value.inspect}) to Float#{ctx}"

  case value
  when Numeric then value.to_f
  when String  then Float(value)
  else raise ArgumentError, message
  end
rescue ::ArgumentError, ::RangeError
  # Complex is Numeric, but Complex#to_f raises RangeError when the
  # imaginary part is non-zero; report it like any other bad input.
  raise ArgumentError, message
end
|
|
290
|
+
|
|
291
|
+
def coerce_float_vector(value, ctx)
|
|
292
|
+
unless value.is_a?(Array)
|
|
293
|
+
raise ArgumentError, "Expected Array for vector#{ctx}, got #{value.class}"
|
|
294
|
+
end
|
|
295
|
+
value.map do |v|
|
|
296
|
+
next 0.0 if v.nil?
|
|
297
|
+
unless v.is_a?(Numeric)
|
|
298
|
+
raise ArgumentError,
|
|
299
|
+
"Vector#{ctx} contains non-numeric element: #{v.inspect}"
|
|
300
|
+
end
|
|
301
|
+
v.to_f
|
|
302
|
+
end
|
|
303
|
+
end
|
|
304
|
+
|
|
305
|
+
def coerce_string_array(value, ctx)
|
|
306
|
+
unless value.is_a?(Array)
|
|
307
|
+
raise ArgumentError, "Expected Array for string array#{ctx}, got #{value.class}"
|
|
308
|
+
end
|
|
309
|
+
value.map { |v| v.nil? ? "" : v.to_s }
|
|
310
|
+
end
|
|
311
|
+
end
|
|
62
312
|
end
|
|
63
313
|
end
|
data/lib/zvec/doc.rb
CHANGED
|
@@ -1,7 +1,37 @@
|
|
|
1
1
|
module Zvec
|
|
2
|
+
# A document (row) in a zvec collection. Wraps the C++ Doc object and
|
|
3
|
+
# provides Ruby-friendly field access with automatic type coercion.
|
|
4
|
+
#
|
|
5
|
+
# Documents can be created with or without a schema. With a schema,
|
|
6
|
+
# values are coerced and validated against declared field types and
|
|
7
|
+
# vector dimensions. Without a schema, types are auto-detected.
|
|
8
|
+
#
|
|
9
|
+
# @example Creating a document with a schema
|
|
10
|
+
# doc = Zvec::Doc.new(pk: "doc-1", schema: schema)
|
|
11
|
+
# doc["title"] = "Hello World"
|
|
12
|
+
# doc["embedding"] = [0.1, 0.2, 0.3, 0.4]
|
|
13
|
+
#
|
|
14
|
+
# @example Schema-less document (types auto-detected)
|
|
15
|
+
# doc = Zvec::Doc.new(pk: "doc-2")
|
|
16
|
+
# doc["name"] = "Alice" # stored as string
|
|
17
|
+
# doc["age"] = 30 # stored as int64
|
|
18
|
+
# doc["score"] = 0.95 # stored as double
|
|
19
|
+
# doc["active"] = true # stored as bool
|
|
20
|
+
# doc["vec"] = [1.0, 2.0] # stored as float vector
|
|
21
|
+
# doc["tags"] = ["a", "b"] # stored as string array
|
|
22
|
+
#
|
|
2
23
|
class Doc
|
|
24
|
+
# @return [Ext::Doc] the underlying C++ document object
|
|
3
25
|
attr_reader :ext_doc
|
|
4
26
|
|
|
27
|
+
# Create a new document.
|
|
28
|
+
#
|
|
29
|
+
# @param pk [String, Integer, nil] primary key (converted to String)
|
|
30
|
+
# @param fields [Hash{String, Symbol => Object}] initial field values
|
|
31
|
+
# @param schema [Zvec::Schema, nil] optional schema for type validation
|
|
32
|
+
#
|
|
33
|
+
# @example
|
|
34
|
+
# doc = Zvec::Doc.new(pk: "abc", fields: { "title" => "Hello" }, schema: schema)
|
|
5
35
|
def initialize(pk: nil, fields: {}, schema: nil)
|
|
6
36
|
@ext_doc = Ext::Doc.new
|
|
7
37
|
@ext_doc.pk = pk.to_s if pk
|
|
@@ -9,53 +39,119 @@ module Zvec
|
|
|
9
39
|
fields.each { |k, v| set(k, v) } if schema
|
|
10
40
|
end
|
|
11
41
|
|
|
42
|
+
# @return [String] the primary key
|
|
12
43
|
def pk
|
|
13
44
|
@ext_doc.pk
|
|
14
45
|
end
|
|
15
46
|
|
|
47
|
+
# Set the primary key.
|
|
48
|
+
#
|
|
49
|
+
# @param value [String, Integer] the new primary key (converted to String)
|
|
50
|
+
# @return [void]
|
|
16
51
|
def pk=(value)
|
|
17
52
|
@ext_doc.pk = value.to_s
|
|
18
53
|
end
|
|
19
54
|
|
|
55
|
+
# @return [Float] the similarity score (set after search queries)
|
|
20
56
|
def score
|
|
21
57
|
@score || @ext_doc.score
|
|
22
58
|
end
|
|
23
59
|
|
|
60
|
+
# Read a field value by name (bracket accessor).
|
|
61
|
+
#
|
|
62
|
+
# @param field_name [String, Symbol] the field name
|
|
63
|
+
# @return [Object, nil] the field value, or nil if not set
|
|
64
|
+
#
|
|
65
|
+
# @example
|
|
66
|
+
# doc["title"] #=> "Hello"
|
|
24
67
|
def [](field_name)
|
|
25
68
|
get(field_name)
|
|
26
69
|
end
|
|
27
70
|
|
|
71
|
+
# Write a field value by name (bracket accessor).
|
|
72
|
+
#
|
|
73
|
+
# @param field_name [String, Symbol] the field name
|
|
74
|
+
# @param value [Object] the value to set
|
|
75
|
+
# @return [void]
|
|
76
|
+
#
|
|
77
|
+
# @example
|
|
78
|
+
# doc["title"] = "Hello"
|
|
28
79
|
def []=(field_name, value)
|
|
29
80
|
set(field_name, value)
|
|
30
81
|
end
|
|
31
82
|
|
|
83
|
+
# Set a field value. When a schema is present, the value is coerced to
|
|
84
|
+
# the declared type and validated. Without a schema, the type is
|
|
85
|
+
# auto-detected from the Ruby value.
|
|
86
|
+
#
|
|
87
|
+
# @param field_name [String, Symbol] the field name (must be non-empty)
|
|
88
|
+
# @param value [Object] the value to set (nil sets the field to null)
|
|
89
|
+
# @return [void]
|
|
90
|
+
# @raise [ArgumentError] if field_name is blank or value type is unsupported
|
|
91
|
+
# @raise [Zvec::DimensionError] if vector dimension doesn't match schema
|
|
92
|
+
#
|
|
93
|
+
# @example
|
|
94
|
+
# doc.set("title", "Hello")
|
|
95
|
+
# doc.set(:count, 42)
|
|
96
|
+
# doc.set("embedding", [0.1, 0.2, 0.3])
|
|
32
97
|
def set(field_name, value)
  field_name = field_name.to_s
  raise ArgumentError, "Field name must be a non-empty string" if field_name.strip.empty?

  return @ext_doc.set_null(field_name) if value.nil?

  # Schema mode: coerce to the declared type and dispatch to its setter.
  # Fields the schema does not declare, or declared types with no entry in
  # SETTER_FOR, fall through to auto-detection on the raw value below.
  type = @schema && @schema.field_type(field_name)
  if type
    coerced = DataTypes.coerce_value(value, type, field_name: field_name)
    setter = DataTypes::SETTER_FOR[type]
    if setter
      if DataTypes::VECTOR_TYPES.include?(type) && coerced.is_a?(Array)
        expected_dim = @schema.field_dimension(field_name)
        # Empty vectors are deliberately exempt from the dimension check.
        if expected_dim && !coerced.empty? && coerced.size != expected_dim
          raise DimensionError,
                "Vector dimension mismatch for field '#{field_name}': " \
                "expected #{expected_dim}, got #{coerced.size}"
        end
      end
      return @ext_doc.send(setter, field_name, coerced)
    end
  end

  # Auto-detect type (schema-less mode)
  case value
  when String then @ext_doc.set_string(field_name, value)
  when Integer then @ext_doc.set_int64(field_name, value)
  when Float then @ext_doc.set_double(field_name, value)
  when TrueClass, FalseClass then @ext_doc.set_bool(field_name, value)
  when Array
    if DataTypes.detect_type(value) == Ext::DataType::ARRAY_STRING
      @ext_doc.set_string_array(field_name, value.map { |v| v.nil? ? "" : v.to_s })
    else
      # Default: treat as float vector. Elements must be numeric (nil is
      # stored as 0.0); booleans, strings after a leading number, and other
      # objects are rejected rather than crashing on #to_f or silently
      # turning into 0.0.
      vector = value.map do |v|
        next 0.0 if v.nil?
        unless v.is_a?(Numeric)
          raise ArgumentError,
                "Unsupported array element #{v.inspect} (#{v.class}) for field '#{field_name}'"
        end
        v.to_f
      end
      @ext_doc.set_float_vector(field_name, vector)
    end
  else
    raise ArgumentError,
          "Unsupported value type #{value.class} for field '#{field_name}'"
  end
end
|
|
58
144
|
|
|
145
|
+
# Get a field value by name. Uses the schema getter if available,
|
|
146
|
+
# otherwise tries common types in order.
|
|
147
|
+
#
|
|
148
|
+
# @param field_name [String, Symbol] the field name
|
|
149
|
+
# @return [Object, nil] the value, or nil if not found or null
|
|
150
|
+
#
|
|
151
|
+
# @example
|
|
152
|
+
# doc.get("title") #=> "Hello"
|
|
153
|
+
# doc.get(:embedding) #=> [0.1, 0.2, 0.3]
|
|
154
|
+
# doc.get("missing") #=> nil
|
|
59
155
|
def get(field_name)
|
|
60
156
|
field_name = field_name.to_s
|
|
61
157
|
return nil unless @ext_doc.has?(field_name)
|
|
@@ -78,25 +174,38 @@ module Zvec
|
|
|
78
174
|
nil
|
|
79
175
|
end
|
|
80
176
|
|
|
177
|
+
# @return [Array<String>] names of all fields set on this document
|
|
81
178
|
def field_names
|
|
82
179
|
@ext_doc.field_names
|
|
83
180
|
end
|
|
84
181
|
|
|
182
|
+
# @return [Boolean] true if no fields have been set
|
|
85
183
|
def empty?
|
|
86
184
|
@ext_doc.empty?
|
|
87
185
|
end
|
|
88
186
|
|
|
187
|
+
# Convert the document to a plain Ruby Hash.
|
|
188
|
+
#
|
|
189
|
+
# @return [Hash{String => Object}] includes "pk", "score", and all fields
|
|
190
|
+
#
|
|
191
|
+
# @example
|
|
192
|
+
# doc.to_h #=> {"pk" => "doc-1", "score" => 0.95, "title" => "Hello"}
|
|
89
193
|
def to_h
|
|
90
194
|
h = { "pk" => pk, "score" => score }
|
|
91
195
|
field_names.each { |f| h[f] = get(f) }
|
|
92
196
|
h
|
|
93
197
|
end
|
|
94
198
|
|
|
199
|
+
# @return [String] human-readable representation
|
|
95
200
|
def to_s
|
|
96
201
|
@ext_doc.to_s
|
|
97
202
|
end
|
|
98
203
|
|
|
99
|
-
# Wrap a C++ Doc::Ptr into a Ruby Doc
|
|
204
|
+
# Wrap a C++ Doc::Ptr into a Ruby Doc.
|
|
205
|
+
#
|
|
206
|
+
# @param ext_doc [Ext::Doc] the C++ document to wrap
|
|
207
|
+
# @param schema [Zvec::Schema, nil] optional schema for type-aware access
|
|
208
|
+
# @return [Zvec::Doc]
|
|
100
209
|
def self.from_ext(ext_doc, schema: nil)
|
|
101
210
|
doc = allocate
|
|
102
211
|
doc.instance_variable_set(:@ext_doc, ext_doc)
|
data/lib/zvec/query.rb
CHANGED
|
@@ -1,9 +1,118 @@
|
|
|
1
1
|
module Zvec
|
|
2
|
+
# Represents a vector similarity search query.
|
|
3
|
+
#
|
|
4
|
+
# == Filter Expression Syntax
|
|
5
|
+
#
|
|
6
|
+
# Filters narrow search results using scalar field conditions. The syntax
|
|
7
|
+
# supports the following operators and combinators:
|
|
8
|
+
#
|
|
9
|
+
# === Comparison Operators
|
|
10
|
+
#
|
|
11
|
+
# field == value # equality
|
|
12
|
+
# field != value # inequality
|
|
13
|
+
# field > value # greater than
|
|
14
|
+
# field >= value # greater than or equal
|
|
15
|
+
# field < value # less than
|
|
16
|
+
# field <= value # less than or equal
|
|
17
|
+
#
|
|
18
|
+
# === Logical Operators
|
|
19
|
+
#
|
|
20
|
+
# expr AND expr # both conditions must match
|
|
21
|
+
# expr OR expr # either condition matches
|
|
22
|
+
# NOT expr # negation
|
|
23
|
+
# (expr) # grouping
|
|
24
|
+
#
|
|
25
|
+
# === Set / Range Operators
|
|
26
|
+
#
|
|
27
|
+
# field IN [v1, v2] # field equals any value in the list
|
|
28
|
+
# field NOT IN [v1] # field does not equal any value in the list
|
|
29
|
+
#
|
|
30
|
+
# === String Operators
|
|
31
|
+
#
|
|
32
|
+
# field LIKE "pattern" # SQL-style LIKE with % and _ wildcards
|
|
33
|
+
#
|
|
34
|
+
# === Examples
|
|
35
|
+
#
|
|
36
|
+
# "year > 2024"
|
|
37
|
+
# "year >= 2020 AND year <= 2025"
|
|
38
|
+
# "category IN ['science', 'tech']"
|
|
39
|
+
# "title LIKE '%Ruby%'"
|
|
40
|
+
# "active == true AND rating > 4.0"
|
|
41
|
+
# "(year > 2020 OR featured == true) AND active == true"
|
|
42
|
+
#
|
|
43
|
+
# @example Basic query
|
|
44
|
+
# query = Zvec::VectorQuery.new(
|
|
45
|
+
# field_name: "embedding",
|
|
46
|
+
# vector: [0.1, 0.2, 0.3, 0.4],
|
|
47
|
+
# topk: 10
|
|
48
|
+
# )
|
|
49
|
+
#
|
|
50
|
+
# @example Query with filter
|
|
51
|
+
# query = Zvec::VectorQuery.new(
|
|
52
|
+
# field_name: "embedding",
|
|
53
|
+
# vector: [0.1, 0.2, 0.3, 0.4],
|
|
54
|
+
# topk: 5,
|
|
55
|
+
# filter: "year > 2024 AND category == 'science'"
|
|
56
|
+
# )
|
|
57
|
+
#
|
|
58
|
+
# @example Query with HNSW search params
|
|
59
|
+
# query = Zvec::VectorQuery.new(
|
|
60
|
+
# field_name: "embedding",
|
|
61
|
+
# vector: [0.1, 0.2, 0.3, 0.4],
|
|
62
|
+
# topk: 10,
|
|
63
|
+
# query_params: Zvec::Ext::HnswQueryParams.new(ef: 300)
|
|
64
|
+
# )
|
|
65
|
+
#
|
|
2
66
|
class VectorQuery
|
|
67
|
+
# @return [Ext::VectorQuery] the underlying C++ query object
|
|
3
68
|
attr_reader :ext_query
|
|
4
69
|
|
|
70
|
+
# Create a new vector similarity query.
|
|
71
|
+
#
|
|
72
|
+
# @param field_name [String, Symbol] the vector field to search
|
|
73
|
+
# (must be non-empty)
|
|
74
|
+
# @param vector [Array<Numeric>] the query vector (must be non-empty,
|
|
75
|
+
# all elements must be Numeric)
|
|
76
|
+
# @param topk [Integer] number of nearest results to return (must be > 0)
|
|
77
|
+
# @param filter [String, nil] optional filter expression
|
|
78
|
+
# (see class-level docs for syntax)
|
|
79
|
+
# @param include_vector [Boolean] whether to include the stored vectors
|
|
80
|
+
# in results
|
|
81
|
+
# @param output_fields [Array<String>, nil] specific fields to return
|
|
82
|
+
# (nil returns all)
|
|
83
|
+
# @param query_params [Ext::HnswQueryParams, Ext::IVFQueryParams,
|
|
84
|
+
# Ext::FlatQueryParams, nil] optional search-time tuning params
|
|
85
|
+
# @return [VectorQuery]
|
|
86
|
+
# @raise [Zvec::QueryError] if field_name, vector, or topk are invalid, or query_params is of an unknown type
|
|
87
|
+
#
|
|
88
|
+
# @example
|
|
89
|
+
# vq = Zvec::VectorQuery.new(
|
|
90
|
+
# field_name: "embedding",
|
|
91
|
+
# vector: [0.1, 0.2, 0.3],
|
|
92
|
+
# topk: 5,
|
|
93
|
+
# filter: "year > 2024",
|
|
94
|
+
# output_fields: ["title", "year"]
|
|
95
|
+
# )
|
|
5
96
|
def initialize(field_name:, vector:, topk: 10, filter: nil,
|
|
6
97
|
include_vector: false, output_fields: nil, query_params: nil)
|
|
98
|
+
if field_name.nil? || field_name.to_s.strip.empty?
|
|
99
|
+
raise QueryError, "field_name must be a non-empty string"
|
|
100
|
+
end
|
|
101
|
+
unless vector.is_a?(Array) && !vector.empty?
|
|
102
|
+
raise QueryError, "vector must be a non-empty Array"
|
|
103
|
+
end
|
|
104
|
+
unless topk.is_a?(Integer) && topk > 0
|
|
105
|
+
raise QueryError, "topk must be a positive integer"
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
# Validate all vector elements are numeric
|
|
109
|
+
vector.each_with_index do |v, i|
|
|
110
|
+
unless v.is_a?(Numeric)
|
|
111
|
+
raise QueryError,
|
|
112
|
+
"Query vector contains non-numeric element at index #{i}: #{v.inspect}"
|
|
113
|
+
end
|
|
114
|
+
end
|
|
115
|
+
|
|
7
116
|
@ext_query = Ext::VectorQuery.new
|
|
8
117
|
@ext_query.field_name = field_name.to_s
|
|
9
118
|
@ext_query.topk = topk
|
|
@@ -20,7 +129,7 @@ module Zvec
|
|
|
20
129
|
when Ext::FlatQueryParams
|
|
21
130
|
@ext_query.set_flat_query_params(query_params)
|
|
22
131
|
else
|
|
23
|
-
raise
|
|
132
|
+
raise QueryError, "Unknown query_params type: #{query_params.class}"
|
|
24
133
|
end
|
|
25
134
|
end
|
|
26
135
|
end
|