zvec-ruby 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +3 -0
- data/lib/zvec/active_record.rb +50 -3
- data/lib/zvec/collection.rb +245 -8
- data/lib/zvec/data_types.rb +123 -1
- data/lib/zvec/doc.rb +89 -1
- data/lib/zvec/query.rb +102 -5
- data/lib/zvec/ruby_llm.rb +79 -7
- data/lib/zvec/schema.rb +130 -3
- data/lib/zvec/version.rb +1 -1
- data/lib/zvec.rb +12 -0
- data/test/test_edge_cases.rb +380 -0
- data/test/test_helper.rb +9 -0
- data/test/test_validation.rb +11 -11
- data/test/test_version.rb +1 -1
- metadata +2 -1
data/lib/zvec/data_types.rb
CHANGED
|
@@ -1,37 +1,134 @@
|
|
|
1
1
|
module Zvec
|
|
2
|
+
# Data type constants, coercion utilities, and dispatch tables for mapping
|
|
3
|
+
# between Ruby types and the underlying C++ zvec engine types.
|
|
4
|
+
#
|
|
5
|
+
# == Scalar Types
|
|
6
|
+
#
|
|
7
|
+
# * {BINARY} -- Raw binary data
|
|
8
|
+
# * {STRING} -- UTF-8 string
|
|
9
|
+
# * {BOOL} -- Boolean (true/false)
|
|
10
|
+
# * {INT32} -- 32-bit signed integer
|
|
11
|
+
# * {INT64} -- 64-bit signed integer
|
|
12
|
+
# * {UINT32} -- 32-bit unsigned integer
|
|
13
|
+
# * {UINT64} -- 64-bit unsigned integer
|
|
14
|
+
# * {FLOAT} -- 32-bit IEEE 754 float
|
|
15
|
+
# * {DOUBLE} -- 64-bit IEEE 754 double
|
|
16
|
+
#
|
|
17
|
+
# == Dense Vector Types
|
|
18
|
+
#
|
|
19
|
+
# Dense vectors store a fixed-length array of numeric values. Choose the
|
|
20
|
+
# precision that balances accuracy vs. memory:
|
|
21
|
+
#
|
|
22
|
+
# * {VECTOR_FP32} -- 32-bit float vector (default, single precision)
|
|
23
|
+
# * {VECTOR_FP64} -- 64-bit double vector (highest accuracy, 2x the memory of FP32)
|
|
24
|
+
# * {VECTOR_FP16} -- 16-bit half-precision vector (half the memory of FP32)
|
|
25
|
+
# * {VECTOR_INT8} -- 8-bit integer vector (smallest, for quantized models)
|
|
26
|
+
#
|
|
27
|
+
# == Sparse Vector Types
|
|
28
|
+
#
|
|
29
|
+
# Sparse vectors store only non-zero elements, ideal for high-dimensional
|
|
30
|
+
# data where most values are zero (e.g., BM25 or TF-IDF features):
|
|
31
|
+
#
|
|
32
|
+
# * {SPARSE_VECTOR_FP32} -- Sparse vector with 32-bit float values
|
|
33
|
+
# * {SPARSE_VECTOR_FP16} -- Sparse vector with 16-bit float values
|
|
34
|
+
#
|
|
35
|
+
# == Binary Vectors
|
|
36
|
+
#
|
|
37
|
+
# Binary vectors use the {BINARY} type and store bit-packed data, useful for
|
|
38
|
+
# binary hash codes or Hamming distance searches.
|
|
39
|
+
#
|
|
40
|
+
# == Array Types
|
|
41
|
+
#
|
|
42
|
+
# * {ARRAY_STRING} -- Array of strings (e.g., tags)
|
|
43
|
+
# * {ARRAY_INT32} -- Array of 32-bit integers
|
|
44
|
+
# * {ARRAY_INT64} -- Array of 64-bit integers
|
|
45
|
+
# * {ARRAY_FLOAT} -- Array of 32-bit floats
|
|
46
|
+
# * {ARRAY_DOUBLE} -- Array of 64-bit doubles
|
|
47
|
+
# * {ARRAY_BOOL} -- Array of booleans
|
|
48
|
+
#
|
|
49
|
+
# == Quantization Types
|
|
50
|
+
#
|
|
51
|
+
# Quantization reduces memory usage and speeds up search at the cost of some
|
|
52
|
+
# accuracy. Specify a quantization type when creating an index:
|
|
53
|
+
#
|
|
54
|
+
# Ext::HnswIndexParams.new(metric, quantize_type: Ext::QuantizeType::INT8)
|
|
55
|
+
#
|
|
56
|
+
# Available quantization types (via +Ext::QuantizeType+):
|
|
57
|
+
#
|
|
58
|
+
# * +FP16+ -- Half-precision (16-bit) quantization. Good balance of speed
|
|
59
|
+
# and accuracy. Halves memory vs. FP32.
|
|
60
|
+
# * +INT8+ -- 8-bit integer quantization. ~4x memory reduction vs. FP32.
|
|
61
|
+
# Slight accuracy loss.
|
|
62
|
+
# * +INT4+ -- 4-bit integer quantization. ~8x memory reduction vs. FP32.
|
|
63
|
+
# Larger accuracy loss, best for large-scale approximate search.
|
|
64
|
+
#
|
|
65
|
+
# == Metric Types
|
|
66
|
+
#
|
|
67
|
+
# * {L2} -- Euclidean (L2) distance. Lower is more similar.
|
|
68
|
+
# * {IP} -- Inner product. Higher is more similar.
|
|
69
|
+
# * {COSINE} -- Cosine similarity. Higher is more similar. Vectors are
|
|
70
|
+
# normalized internally.
|
|
71
|
+
#
|
|
2
72
|
module DataTypes
|
|
3
73
|
# Re-export C++ enum values as Ruby-friendly constants
|
|
74
|
+
|
|
75
|
+
# @return [Symbol] Raw binary data type
|
|
4
76
|
BINARY = Ext::DataType::BINARY
|
|
77
|
+
# @return [Symbol] UTF-8 string data type
|
|
5
78
|
STRING = Ext::DataType::STRING
|
|
79
|
+
# @return [Symbol] Boolean data type
|
|
6
80
|
BOOL = Ext::DataType::BOOL
|
|
81
|
+
# @return [Symbol] 32-bit signed integer data type
|
|
7
82
|
INT32 = Ext::DataType::INT32
|
|
83
|
+
# @return [Symbol] 64-bit signed integer data type
|
|
8
84
|
INT64 = Ext::DataType::INT64
|
|
85
|
+
# @return [Symbol] 32-bit unsigned integer data type
|
|
9
86
|
UINT32 = Ext::DataType::UINT32
|
|
87
|
+
# @return [Symbol] 64-bit unsigned integer data type
|
|
10
88
|
UINT64 = Ext::DataType::UINT64
|
|
89
|
+
# @return [Symbol] 32-bit float data type
|
|
11
90
|
FLOAT = Ext::DataType::FLOAT
|
|
91
|
+
# @return [Symbol] 64-bit double data type
|
|
12
92
|
DOUBLE = Ext::DataType::DOUBLE
|
|
13
93
|
|
|
94
|
+
# @return [Symbol] 32-bit float dense vector
|
|
14
95
|
VECTOR_FP32 = Ext::DataType::VECTOR_FP32
|
|
96
|
+
# @return [Symbol] 64-bit double dense vector
|
|
15
97
|
VECTOR_FP64 = Ext::DataType::VECTOR_FP64
|
|
98
|
+
# @return [Symbol] 16-bit half-precision dense vector
|
|
16
99
|
VECTOR_FP16 = Ext::DataType::VECTOR_FP16
|
|
100
|
+
# @return [Symbol] 8-bit integer dense vector (quantized)
|
|
17
101
|
VECTOR_INT8 = Ext::DataType::VECTOR_INT8
|
|
18
102
|
|
|
103
|
+
# @return [Symbol] 32-bit float sparse vector
|
|
19
104
|
SPARSE_VECTOR_FP32 = Ext::DataType::SPARSE_VECTOR_FP32
|
|
105
|
+
# @return [Symbol] 16-bit float sparse vector
|
|
20
106
|
SPARSE_VECTOR_FP16 = Ext::DataType::SPARSE_VECTOR_FP16
|
|
21
107
|
|
|
108
|
+
# @return [Symbol] Array of strings
|
|
22
109
|
ARRAY_STRING = Ext::DataType::ARRAY_STRING
|
|
110
|
+
# @return [Symbol] Array of 32-bit integers
|
|
23
111
|
ARRAY_INT32 = Ext::DataType::ARRAY_INT32
|
|
112
|
+
# @return [Symbol] Array of 64-bit integers
|
|
24
113
|
ARRAY_INT64 = Ext::DataType::ARRAY_INT64
|
|
114
|
+
# @return [Symbol] Array of 32-bit floats
|
|
25
115
|
ARRAY_FLOAT = Ext::DataType::ARRAY_FLOAT
|
|
116
|
+
# @return [Symbol] Array of 64-bit doubles
|
|
26
117
|
ARRAY_DOUBLE = Ext::DataType::ARRAY_DOUBLE
|
|
118
|
+
# @return [Symbol] Array of booleans
|
|
27
119
|
ARRAY_BOOL = Ext::DataType::ARRAY_BOOL
|
|
28
120
|
|
|
29
121
|
# Metric types
|
|
122
|
+
|
|
123
|
+
# @return [Symbol] Euclidean (L2) distance metric
|
|
30
124
|
L2 = Ext::MetricType::L2
|
|
125
|
+
# @return [Symbol] Inner product metric
|
|
31
126
|
IP = Ext::MetricType::IP
|
|
127
|
+
# @return [Symbol] Cosine similarity metric
|
|
32
128
|
COSINE = Ext::MetricType::COSINE
|
|
33
129
|
|
|
34
130
|
# Vector data types for dimension validation
|
|
131
|
+
# @return [Array<Symbol>] All dense vector data type constants
|
|
35
132
|
VECTOR_TYPES = [
|
|
36
133
|
Ext::DataType::VECTOR_FP32,
|
|
37
134
|
Ext::DataType::VECTOR_FP64,
|
|
@@ -40,6 +137,7 @@ module Zvec
|
|
|
40
137
|
].freeze
|
|
41
138
|
|
|
42
139
|
# Setter dispatch table: DataType -> Doc setter method name
|
|
140
|
+
# @return [Hash{Symbol => Symbol}]
|
|
43
141
|
SETTER_FOR = {
|
|
44
142
|
Ext::DataType::STRING => :set_string,
|
|
45
143
|
Ext::DataType::BOOL => :set_bool,
|
|
@@ -54,6 +152,8 @@ module Zvec
|
|
|
54
152
|
Ext::DataType::ARRAY_STRING => :set_string_array,
|
|
55
153
|
}.freeze
|
|
56
154
|
|
|
155
|
+
# Getter dispatch table: DataType -> Doc getter method name
|
|
156
|
+
# @return [Hash{Symbol => Symbol}]
|
|
57
157
|
GETTER_FOR = {
|
|
58
158
|
Ext::DataType::STRING => :get_string,
|
|
59
159
|
Ext::DataType::BOOL => :get_bool,
|
|
@@ -69,7 +169,17 @@ module Zvec
|
|
|
69
169
|
}.freeze
|
|
70
170
|
|
|
71
171
|
# Detect the zvec data type for a Ruby value.
|
|
172
|
+
#
|
|
72
173
|
# Handles edge cases: Integer vs Float, String booleans, nil, empty arrays.
|
|
174
|
+
#
|
|
175
|
+
# @param value [Object] the Ruby value to inspect
|
|
176
|
+
# @return [Symbol, nil] the zvec data type constant, or nil for nil input
|
|
177
|
+
#
|
|
178
|
+
# @example
|
|
179
|
+
# DataTypes.detect_type("hello") #=> Ext::DataType::STRING
|
|
180
|
+
# DataTypes.detect_type(42) #=> Ext::DataType::INT64
|
|
181
|
+
# DataTypes.detect_type([1.0]) #=> Ext::DataType::VECTOR_FP32
|
|
182
|
+
# DataTypes.detect_type(nil) #=> nil
|
|
73
183
|
def self.detect_type(value)
|
|
74
184
|
case value
|
|
75
185
|
when NilClass then nil
|
|
@@ -82,7 +192,17 @@ module Zvec
|
|
|
82
192
|
end
|
|
83
193
|
|
|
84
194
|
# Coerce a Ruby value into a form suitable for the given zvec data type.
|
|
85
|
-
#
|
|
195
|
+
#
|
|
196
|
+
# @param value [Object] the value to coerce
|
|
197
|
+
# @param target_type [Symbol] the target zvec data type constant
|
|
198
|
+
# @param field_name [String, nil] optional field name for error messages
|
|
199
|
+
# @return [Object] the coerced value
|
|
200
|
+
# @raise [ArgumentError] if the value cannot be coerced to the target type
|
|
201
|
+
#
|
|
202
|
+
# @example
|
|
203
|
+
# DataTypes.coerce_value(42, Ext::DataType::STRING) #=> "42"
|
|
204
|
+
# DataTypes.coerce_value("3.14", Ext::DataType::DOUBLE) #=> 3.14
|
|
205
|
+
# DataTypes.coerce_value([1, 2], Ext::DataType::VECTOR_FP32) #=> [1.0, 2.0]
|
|
86
206
|
def self.coerce_value(value, target_type, field_name: nil)
|
|
87
207
|
return value if value.nil?
|
|
88
208
|
|
|
@@ -110,6 +230,8 @@ module Zvec
|
|
|
110
230
|
class << self
|
|
111
231
|
private
|
|
112
232
|
|
|
233
|
+
# @param arr [Array] the array to detect the element type for
|
|
234
|
+
# @return [Symbol] the detected zvec data type
|
|
113
235
|
def detect_array_type(arr)
|
|
114
236
|
return Ext::DataType::VECTOR_FP32 if arr.empty?
|
|
115
237
|
|
data/lib/zvec/doc.rb
CHANGED
|
@@ -1,7 +1,37 @@
|
|
|
1
1
|
module Zvec
|
|
2
|
+
# A document (row) in a zvec collection. Wraps the C++ Doc object and
|
|
3
|
+
# provides Ruby-friendly field access with automatic type coercion.
|
|
4
|
+
#
|
|
5
|
+
# Documents can be created with or without a schema. With a schema,
|
|
6
|
+
# values are coerced and validated against declared field types and
|
|
7
|
+
# vector dimensions. Without a schema, types are auto-detected.
|
|
8
|
+
#
|
|
9
|
+
# @example Creating a document with a schema
|
|
10
|
+
# doc = Zvec::Doc.new(pk: "doc-1", schema: schema)
|
|
11
|
+
# doc["title"] = "Hello World"
|
|
12
|
+
# doc["embedding"] = [0.1, 0.2, 0.3, 0.4]
|
|
13
|
+
#
|
|
14
|
+
# @example Schema-less document (types auto-detected)
|
|
15
|
+
# doc = Zvec::Doc.new(pk: "doc-2")
|
|
16
|
+
# doc["name"] = "Alice" # stored as string
|
|
17
|
+
# doc["age"] = 30 # stored as int64
|
|
18
|
+
# doc["score"] = 0.95 # stored as double
|
|
19
|
+
# doc["active"] = true # stored as bool
|
|
20
|
+
# doc["vec"] = [1.0, 2.0] # stored as float vector
|
|
21
|
+
# doc["tags"] = ["a", "b"] # stored as string array
|
|
22
|
+
#
|
|
2
23
|
class Doc
|
|
24
|
+
# @return [Ext::Doc] the underlying C++ document object
|
|
3
25
|
attr_reader :ext_doc
|
|
4
26
|
|
|
27
|
+
# Create a new document.
|
|
28
|
+
#
|
|
29
|
+
# @param pk [String, Integer, nil] primary key (converted to String)
|
|
30
|
+
# @param fields [Hash{String, Symbol => Object}] initial field values
|
|
31
|
+
# @param schema [Zvec::Schema, nil] optional schema for type validation
|
|
32
|
+
#
|
|
33
|
+
# @example
|
|
34
|
+
# doc = Zvec::Doc.new(pk: "abc", fields: { "title" => "Hello" }, schema: schema)
|
|
5
35
|
def initialize(pk: nil, fields: {}, schema: nil)
|
|
6
36
|
@ext_doc = Ext::Doc.new
|
|
7
37
|
@ext_doc.pk = pk.to_s if pk
|
|
@@ -9,26 +39,61 @@ module Zvec
|
|
|
9
39
|
fields.each { |k, v| set(k, v) } if schema
|
|
10
40
|
end
|
|
11
41
|
|
|
42
|
+
# @return [String] the primary key
|
|
12
43
|
def pk
|
|
13
44
|
@ext_doc.pk
|
|
14
45
|
end
|
|
15
46
|
|
|
47
|
+
# Set the primary key.
|
|
48
|
+
#
|
|
49
|
+
# @param value [String, Integer] the new primary key (converted to String)
|
|
50
|
+
# @return [void]
|
|
16
51
|
def pk=(value)
|
|
17
52
|
@ext_doc.pk = value.to_s
|
|
18
53
|
end
|
|
19
54
|
|
|
55
|
+
# @return [Float] the similarity score (set after search queries)
|
|
20
56
|
def score
|
|
21
57
|
@score || @ext_doc.score
|
|
22
58
|
end
|
|
23
59
|
|
|
60
|
+
# Read a field value by name (bracket accessor).
|
|
61
|
+
#
|
|
62
|
+
# @param field_name [String, Symbol] the field name
|
|
63
|
+
# @return [Object, nil] the field value, or nil if not set
|
|
64
|
+
#
|
|
65
|
+
# @example
|
|
66
|
+
# doc["title"] #=> "Hello"
|
|
24
67
|
def [](field_name)
|
|
25
68
|
get(field_name)
|
|
26
69
|
end
|
|
27
70
|
|
|
71
|
+
# Write a field value by name (bracket accessor).
|
|
72
|
+
#
|
|
73
|
+
# @param field_name [String, Symbol] the field name
|
|
74
|
+
# @param value [Object] the value to set
|
|
75
|
+
# @return [void]
|
|
76
|
+
#
|
|
77
|
+
# @example
|
|
78
|
+
# doc["title"] = "Hello"
|
|
28
79
|
def []=(field_name, value)
|
|
29
80
|
set(field_name, value)
|
|
30
81
|
end
|
|
31
82
|
|
|
83
|
+
# Set a field value. When a schema is present, the value is coerced to
|
|
84
|
+
# the declared type and validated. Without a schema, the type is
|
|
85
|
+
# auto-detected from the Ruby value.
|
|
86
|
+
#
|
|
87
|
+
# @param field_name [String, Symbol] the field name (must be non-empty)
|
|
88
|
+
# @param value [Object] the value to set (nil sets the field to null)
|
|
89
|
+
# @return [void]
|
|
90
|
+
# @raise [ArgumentError] if field_name is blank or value type is unsupported
|
|
91
|
+
# @raise [Zvec::DimensionError] if vector dimension doesn't match schema
|
|
92
|
+
#
|
|
93
|
+
# @example
|
|
94
|
+
# doc.set("title", "Hello")
|
|
95
|
+
# doc.set(:count, 42)
|
|
96
|
+
# doc.set("embedding", [0.1, 0.2, 0.3])
|
|
32
97
|
def set(field_name, value)
|
|
33
98
|
field_name = field_name.to_s
|
|
34
99
|
raise ArgumentError, "Field name must be a non-empty string" if field_name.strip.empty?
|
|
@@ -77,6 +142,16 @@ module Zvec
|
|
|
77
142
|
end
|
|
78
143
|
end
|
|
79
144
|
|
|
145
|
+
# Get a field value by name. Uses the schema getter if available,
|
|
146
|
+
# otherwise tries common types in order.
|
|
147
|
+
#
|
|
148
|
+
# @param field_name [String, Symbol] the field name
|
|
149
|
+
# @return [Object, nil] the value, or nil if not found or null
|
|
150
|
+
#
|
|
151
|
+
# @example
|
|
152
|
+
# doc.get("title") #=> "Hello"
|
|
153
|
+
# doc.get(:embedding) #=> [0.1, 0.2, 0.3]
|
|
154
|
+
# doc.get("missing") #=> nil
|
|
80
155
|
def get(field_name)
|
|
81
156
|
field_name = field_name.to_s
|
|
82
157
|
return nil unless @ext_doc.has?(field_name)
|
|
@@ -99,25 +174,38 @@ module Zvec
|
|
|
99
174
|
nil
|
|
100
175
|
end
|
|
101
176
|
|
|
177
|
+
# @return [Array<String>] names of all fields set on this document
|
|
102
178
|
def field_names
|
|
103
179
|
@ext_doc.field_names
|
|
104
180
|
end
|
|
105
181
|
|
|
182
|
+
# @return [Boolean] true if no fields have been set
|
|
106
183
|
def empty?
|
|
107
184
|
@ext_doc.empty?
|
|
108
185
|
end
|
|
109
186
|
|
|
187
|
+
# Convert the document to a plain Ruby Hash.
|
|
188
|
+
#
|
|
189
|
+
# @return [Hash{String => Object}] includes "pk", "score", and all fields
|
|
190
|
+
#
|
|
191
|
+
# @example
|
|
192
|
+
# doc.to_h #=> {"pk" => "doc-1", "score" => 0.95, "title" => "Hello"}
|
|
110
193
|
def to_h
|
|
111
194
|
h = { "pk" => pk, "score" => score }
|
|
112
195
|
field_names.each { |f| h[f] = get(f) }
|
|
113
196
|
h
|
|
114
197
|
end
|
|
115
198
|
|
|
199
|
+
# @return [String] human-readable representation
|
|
116
200
|
def to_s
|
|
117
201
|
@ext_doc.to_s
|
|
118
202
|
end
|
|
119
203
|
|
|
120
|
-
# Wrap a C++ Doc::Ptr into a Ruby Doc
|
|
204
|
+
# Wrap a C++ Doc::Ptr into a Ruby Doc.
|
|
205
|
+
#
|
|
206
|
+
# @param ext_doc [Ext::Doc] the C++ document to wrap
|
|
207
|
+
# @param schema [Zvec::Schema, nil] optional schema for type-aware access
|
|
208
|
+
# @return [Zvec::Doc]
|
|
121
209
|
def self.from_ext(ext_doc, schema: nil)
|
|
122
210
|
doc = allocate
|
|
123
211
|
doc.instance_variable_set(:@ext_doc, ext_doc)
|
data/lib/zvec/query.rb
CHANGED
|
@@ -1,17 +1,114 @@
|
|
|
1
1
|
module Zvec
|
|
2
|
+
# Represents a vector similarity search query.
|
|
3
|
+
#
|
|
4
|
+
# == Filter Expression Syntax
|
|
5
|
+
#
|
|
6
|
+
# Filters narrow search results using scalar field conditions. The syntax
|
|
7
|
+
# supports the following operators and combinators:
|
|
8
|
+
#
|
|
9
|
+
# === Comparison Operators
|
|
10
|
+
#
|
|
11
|
+
# field == value # equality
|
|
12
|
+
# field != value # inequality
|
|
13
|
+
# field > value # greater than
|
|
14
|
+
# field >= value # greater than or equal
|
|
15
|
+
# field < value # less than
|
|
16
|
+
# field <= value # less than or equal
|
|
17
|
+
#
|
|
18
|
+
# === Logical Operators
|
|
19
|
+
#
|
|
20
|
+
# expr AND expr # both conditions must match
|
|
21
|
+
# expr OR expr # either condition matches
|
|
22
|
+
# NOT expr # negation
|
|
23
|
+
# (expr) # grouping
|
|
24
|
+
#
|
|
25
|
+
# === Set Membership Operators
|
|
26
|
+
#
|
|
27
|
+
# field IN [v1, v2] # field equals any value in the list
|
|
28
|
+
# field NOT IN [v1] # field equals none of the values in the list
|
|
29
|
+
#
|
|
30
|
+
# === String Operators
|
|
31
|
+
#
|
|
32
|
+
# field LIKE "pattern" # SQL-style LIKE with % and _ wildcards
|
|
33
|
+
#
|
|
34
|
+
# === Examples
|
|
35
|
+
#
|
|
36
|
+
# "year > 2024"
|
|
37
|
+
# "year >= 2020 AND year <= 2025"
|
|
38
|
+
# "category IN ['science', 'tech']"
|
|
39
|
+
# "title LIKE '%Ruby%'"
|
|
40
|
+
# "active == true AND rating > 4.0"
|
|
41
|
+
# "(year > 2020 OR featured == true) AND active == true"
|
|
42
|
+
#
|
|
43
|
+
# @example Basic query
|
|
44
|
+
# query = Zvec::VectorQuery.new(
|
|
45
|
+
# field_name: "embedding",
|
|
46
|
+
# vector: [0.1, 0.2, 0.3, 0.4],
|
|
47
|
+
# topk: 10
|
|
48
|
+
# )
|
|
49
|
+
#
|
|
50
|
+
# @example Query with filter
|
|
51
|
+
# query = Zvec::VectorQuery.new(
|
|
52
|
+
# field_name: "embedding",
|
|
53
|
+
# vector: [0.1, 0.2, 0.3, 0.4],
|
|
54
|
+
# topk: 5,
|
|
55
|
+
# filter: "year > 2024 AND category == 'science'"
|
|
56
|
+
# )
|
|
57
|
+
#
|
|
58
|
+
# @example Query with HNSW search params
|
|
59
|
+
# query = Zvec::VectorQuery.new(
|
|
60
|
+
# field_name: "embedding",
|
|
61
|
+
# vector: [0.1, 0.2, 0.3, 0.4],
|
|
62
|
+
# topk: 10,
|
|
63
|
+
# query_params: Zvec::Ext::HnswQueryParams.new(ef: 300)
|
|
64
|
+
# )
|
|
65
|
+
#
|
|
2
66
|
class VectorQuery
|
|
67
|
+
# @return [Ext::VectorQuery] the underlying C++ query object
|
|
3
68
|
attr_reader :ext_query
|
|
4
69
|
|
|
70
|
+
# Create a new vector similarity query.
|
|
71
|
+
#
|
|
72
|
+
# @param field_name [String, Symbol] the vector field to search
|
|
73
|
+
# (must be non-empty)
|
|
74
|
+
# @param vector [Array<Numeric>] the query vector (must be non-empty,
|
|
75
|
+
# all elements must be Numeric)
|
|
76
|
+
# @param topk [Integer] number of nearest results to return (must be > 0)
|
|
77
|
+
# @param filter [String, nil] optional filter expression
|
|
78
|
+
# (see class-level docs for syntax)
|
|
79
|
+
# @param include_vector [Boolean] whether to include the stored vectors
|
|
80
|
+
# in results
|
|
81
|
+
# @param output_fields [Array<String>, nil] specific fields to return
|
|
82
|
+
# (nil returns all)
|
|
83
|
+
# @param query_params [Ext::HnswQueryParams, Ext::IVFQueryParams,
|
|
84
|
+
# Ext::FlatQueryParams, nil] optional search-time tuning params
|
|
85
|
+
# @return [VectorQuery]
|
|
86
|
+
# @raise [Zvec::QueryError] if field_name, vector, topk, or query_params is invalid
|
|
87
|
+
#
|
|
88
|
+
# @example
|
|
89
|
+
# vq = Zvec::VectorQuery.new(
|
|
90
|
+
# field_name: "embedding",
|
|
91
|
+
# vector: [0.1, 0.2, 0.3],
|
|
92
|
+
# topk: 5,
|
|
93
|
+
# filter: "year > 2024",
|
|
94
|
+
# output_fields: ["title", "year"]
|
|
95
|
+
# )
|
|
5
96
|
def initialize(field_name:, vector:, topk: 10, filter: nil,
|
|
6
97
|
include_vector: false, output_fields: nil, query_params: nil)
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
98
|
+
if field_name.nil? || field_name.to_s.strip.empty?
|
|
99
|
+
raise QueryError, "field_name must be a non-empty string"
|
|
100
|
+
end
|
|
101
|
+
unless vector.is_a?(Array) && !vector.empty?
|
|
102
|
+
raise QueryError, "vector must be a non-empty Array"
|
|
103
|
+
end
|
|
104
|
+
unless topk.is_a?(Integer) && topk > 0
|
|
105
|
+
raise QueryError, "topk must be a positive integer"
|
|
106
|
+
end
|
|
10
107
|
|
|
11
108
|
# Validate all vector elements are numeric
|
|
12
109
|
vector.each_with_index do |v, i|
|
|
13
110
|
unless v.is_a?(Numeric)
|
|
14
|
-
raise
|
|
111
|
+
raise QueryError,
|
|
15
112
|
"Query vector contains non-numeric element at index #{i}: #{v.inspect}"
|
|
16
113
|
end
|
|
17
114
|
end
|
|
@@ -32,7 +129,7 @@ module Zvec
|
|
|
32
129
|
when Ext::FlatQueryParams
|
|
33
130
|
@ext_query.set_flat_query_params(query_params)
|
|
34
131
|
else
|
|
35
|
-
raise
|
|
132
|
+
raise QueryError, "Unknown query_params type: #{query_params.class}"
|
|
36
133
|
end
|
|
37
134
|
end
|
|
38
135
|
end
|
data/lib/zvec/ruby_llm.rb
CHANGED
|
@@ -4,17 +4,40 @@ module Zvec
|
|
|
4
4
|
module RubyLLM
|
|
5
5
|
# A vector store backend for the ruby_llm gem.
|
|
6
6
|
#
|
|
7
|
-
#
|
|
7
|
+
# Provides a simple add/search/delete interface on top of a {Zvec::Collection}.
|
|
8
|
+
# Compatible with the ruby_llm vector store protocol.
|
|
9
|
+
#
|
|
10
|
+
# @example Basic usage
|
|
8
11
|
# store = Zvec::RubyLLM::Store.new("/path/to/db", dimension: 1536)
|
|
9
|
-
# store.add("doc-1", embedding: [...],
|
|
12
|
+
# store.add("doc-1", embedding: [...], content: "Hello world")
|
|
10
13
|
# results = store.search([0.1, 0.2, ...], top_k: 5)
|
|
14
|
+
# results.first #=> { id: "doc-1", score: 0.98, content: "Hello world", metadata: {} }
|
|
15
|
+
#
|
|
16
|
+
# @example With metadata
|
|
17
|
+
# store.add("doc-2", embedding: [...], content: "Ruby", metadata: { category: "lang" })
|
|
11
18
|
#
|
|
12
19
|
class Store
|
|
20
|
+
# @return [String] default vector field name
|
|
13
21
|
DEFAULT_VECTOR_FIELD = "embedding"
|
|
22
|
+
# @return [String] default content field name
|
|
14
23
|
DEFAULT_CONTENT_FIELD = "content"
|
|
15
24
|
|
|
16
|
-
|
|
25
|
+
# @return [Zvec::Collection] the underlying collection
|
|
26
|
+
attr_reader :collection
|
|
27
|
+
# @return [Integer] the vector dimension
|
|
28
|
+
attr_reader :dimension
|
|
17
29
|
|
|
30
|
+
# Create a new store, opening an existing collection or creating one.
|
|
31
|
+
#
|
|
32
|
+
# @param path [String] directory path for the collection data
|
|
33
|
+
# @param dimension [Integer] the vector dimension (must be > 0)
|
|
34
|
+
# @param metric [Symbol] similarity metric (+:cosine+, +:l2+, or +:ip+)
|
|
35
|
+
# @param vector_field [String] name of the vector field (default: "embedding")
|
|
36
|
+
# @param content_field [String] name of the content field (default: "content")
|
|
37
|
+
# @raise [ArgumentError] if metric is not one of +:cosine+, +:l2+, +:ip+
|
|
38
|
+
#
|
|
39
|
+
# @example
|
|
40
|
+
# store = Zvec::RubyLLM::Store.new("/tmp/store", dimension: 384, metric: :l2)
|
|
18
41
|
def initialize(path, dimension:, metric: :cosine, vector_field: DEFAULT_VECTOR_FIELD,
|
|
19
42
|
content_field: DEFAULT_CONTENT_FIELD)
|
|
20
43
|
@vector_field = vector_field.to_s
|
|
@@ -47,6 +70,15 @@ module Zvec
|
|
|
47
70
|
end
|
|
48
71
|
|
|
49
72
|
# Add a document with its embedding and optional metadata.
|
|
73
|
+
#
|
|
74
|
+
# @param id [String, Integer] the document's primary key
|
|
75
|
+
# @param embedding [Array<Numeric>] the vector embedding
|
|
76
|
+
# @param content [String, nil] optional text content
|
|
77
|
+
# @param metadata [Hash{String, Symbol => Object}] additional fields to store
|
|
78
|
+
# @return [Array] write results from the collection
|
|
79
|
+
#
|
|
80
|
+
# @example
|
|
81
|
+
# store.add("doc-1", embedding: [0.1, 0.2, 0.3], content: "Hello")
|
|
50
82
|
def add(id, embedding:, content: nil, metadata: {})
|
|
51
83
|
doc = Zvec::Doc.new(pk: id, schema: @schema)
|
|
52
84
|
doc[@vector_field] = embedding
|
|
@@ -55,8 +87,20 @@ module Zvec
|
|
|
55
87
|
@collection.insert(doc)
|
|
56
88
|
end
|
|
57
89
|
|
|
58
|
-
# Batch-add documents.
|
|
59
|
-
#
|
|
90
|
+
# Batch-add multiple documents at once.
|
|
91
|
+
#
|
|
92
|
+
# @param docs [Array<Hash>] documents, each containing:
|
|
93
|
+
# * +:id+ [String, Integer] -- primary key (required)
|
|
94
|
+
# * +:embedding+ [Array<Numeric>] -- the vector (required)
|
|
95
|
+
# * +:content+ [String, nil] -- optional text content
|
|
96
|
+
# * +:metadata+ [Hash, nil] -- optional additional fields
|
|
97
|
+
# @return [Array] write results from the collection
|
|
98
|
+
#
|
|
99
|
+
# @example
|
|
100
|
+
# store.add_many([
|
|
101
|
+
# { id: "a", embedding: [0.1, 0.2], content: "Hello" },
|
|
102
|
+
# { id: "b", embedding: [0.3, 0.4], content: "World" },
|
|
103
|
+
# ])
|
|
60
104
|
def add_many(docs)
|
|
61
105
|
zvec_docs = docs.map do |d|
|
|
62
106
|
doc = Zvec::Doc.new(pk: d[:id], schema: @schema)
|
|
@@ -69,6 +113,22 @@ module Zvec
|
|
|
69
113
|
end
|
|
70
114
|
|
|
71
115
|
# Search for similar vectors.
|
|
116
|
+
#
|
|
117
|
+
# @param query_vector [Array<Numeric>] the query vector
|
|
118
|
+
# @param top_k [Integer] maximum number of results (default: 10)
|
|
119
|
+
# @param filter [String, nil] optional filter expression
|
|
120
|
+
# (see {Zvec::VectorQuery} for filter syntax)
|
|
121
|
+
# @return [Array<Hash>] results, each containing:
|
|
122
|
+
# * +:id+ [String] -- document primary key
|
|
123
|
+
# * +:score+ [Float] -- similarity score
|
|
124
|
+
# * +:content+ [String, nil] -- the content field value
|
|
125
|
+
# * +:metadata+ [Hash] -- all other stored fields
|
|
126
|
+
#
|
|
127
|
+
# @example
|
|
128
|
+
# results = store.search([0.1, 0.2, 0.3], top_k: 5)
|
|
129
|
+
# results.first[:id] #=> "doc-1"
|
|
130
|
+
# results.first[:score] #=> 0.95
|
|
131
|
+
# results.first[:content] #=> "Hello"
|
|
72
132
|
def search(query_vector, top_k: 10, filter: nil)
|
|
73
133
|
results = @collection.query(
|
|
74
134
|
field_name: @vector_field,
|
|
@@ -86,20 +146,32 @@ module Zvec
|
|
|
86
146
|
end
|
|
87
147
|
end
|
|
88
148
|
|
|
89
|
-
# Delete documents by
|
|
149
|
+
# Delete documents by primary key(s).
|
|
150
|
+
#
|
|
151
|
+
# @param ids [Array<String, Integer>] one or more primary keys
|
|
152
|
+
# @return [Array] write results from the collection
|
|
90
153
|
def delete(*ids)
|
|
91
154
|
@collection.delete(*ids.flatten)
|
|
92
155
|
end
|
|
93
156
|
|
|
94
|
-
# Fetch documents by
|
|
157
|
+
# Fetch documents by primary key(s).
|
|
158
|
+
#
|
|
159
|
+
# @param ids [Array<String, Integer>] one or more primary keys
|
|
160
|
+
# @return [Hash{String => Zvec::Doc}] mapping of pk to document
|
|
95
161
|
def fetch(*ids)
|
|
96
162
|
@collection.fetch(*ids.flatten)
|
|
97
163
|
end
|
|
98
164
|
|
|
165
|
+
# Flush pending writes to disk.
|
|
166
|
+
#
|
|
167
|
+
# @return [self]
|
|
99
168
|
def flush
|
|
100
169
|
@collection.flush
|
|
101
170
|
end
|
|
102
171
|
|
|
172
|
+
# Return the number of documents in the store.
|
|
173
|
+
#
|
|
174
|
+
# @return [Integer]
|
|
103
175
|
def count
|
|
104
176
|
@collection.doc_count
|
|
105
177
|
end
|