zvec-ruby 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -3
- data/Rakefile +3 -0
- data/lib/zvec/active_record.rb +51 -4
- data/lib/zvec/collection.rb +344 -33
- data/lib/zvec/data_types.rb +250 -0
- data/lib/zvec/doc.rb +119 -10
- data/lib/zvec/query.rb +110 -1
- data/lib/zvec/ruby_llm.rb +79 -7
- data/lib/zvec/schema.rb +145 -1
- data/lib/zvec/version.rb +1 -1
- data/lib/zvec.rb +13 -0
- data/test/test_edge_cases.rb +380 -0
- data/test/test_helper.rb +102 -3
- data/test/test_type_detection.rb +258 -0
- data/test/test_validation.rb +305 -0
- data/test/test_version.rb +1 -1
- metadata +4 -1
data/lib/zvec/ruby_llm.rb
CHANGED
|
@@ -4,17 +4,40 @@ module Zvec
|
|
|
4
4
|
module RubyLLM
|
|
5
5
|
# A vector store backend for the ruby_llm gem.
|
|
6
6
|
#
|
|
7
|
-
#
|
|
7
|
+
# Provides a simple add/search/delete interface on top of a {Zvec::Collection}.
|
|
8
|
+
# Compatible with the ruby_llm vector store protocol.
|
|
9
|
+
#
|
|
10
|
+
# @example Basic usage
|
|
8
11
|
# store = Zvec::RubyLLM::Store.new("/path/to/db", dimension: 1536)
|
|
9
|
-
# store.add("doc-1", embedding: [...],
|
|
12
|
+
# store.add("doc-1", embedding: [...], content: "Hello world")
|
|
10
13
|
# results = store.search([0.1, 0.2, ...], top_k: 5)
|
|
14
|
+
# results.first #=> { id: "doc-1", score: 0.98, content: "Hello world", metadata: {} }
|
|
15
|
+
#
|
|
16
|
+
# @example With metadata
|
|
17
|
+
# store.add("doc-2", embedding: [...], content: "Ruby", metadata: { category: "lang" })
|
|
11
18
|
#
|
|
12
19
|
class Store
|
|
20
|
+
# @return [String] default vector field name
|
|
13
21
|
DEFAULT_VECTOR_FIELD = "embedding"
|
|
22
|
+
# @return [String] default content field name
|
|
14
23
|
DEFAULT_CONTENT_FIELD = "content"
|
|
15
24
|
|
|
16
|
-
|
|
25
|
+
# @return [Zvec::Collection] the underlying collection
|
|
26
|
+
attr_reader :collection
|
|
27
|
+
# @return [Integer] the vector dimension
|
|
28
|
+
attr_reader :dimension
|
|
17
29
|
|
|
30
|
+
# Create a new store, opening an existing collection or creating one.
|
|
31
|
+
#
|
|
32
|
+
# @param path [String] directory path for the collection data
|
|
33
|
+
# @param dimension [Integer] the vector dimension (must be > 0)
|
|
34
|
+
# @param metric [Symbol] similarity metric (+:cosine+, +:l2+, or +:ip+)
|
|
35
|
+
# @param vector_field [String] name of the vector field (default: "embedding")
|
|
36
|
+
# @param content_field [String] name of the content field (default: "content")
|
|
37
|
+
# @raise [ArgumentError] if metric is not one of +:cosine+, +:l2+, +:ip+
|
|
38
|
+
#
|
|
39
|
+
# @example
|
|
40
|
+
# store = Zvec::RubyLLM::Store.new("/tmp/store", dimension: 384, metric: :l2)
|
|
18
41
|
def initialize(path, dimension:, metric: :cosine, vector_field: DEFAULT_VECTOR_FIELD,
|
|
19
42
|
content_field: DEFAULT_CONTENT_FIELD)
|
|
20
43
|
@vector_field = vector_field.to_s
|
|
@@ -47,6 +70,15 @@ module Zvec
|
|
|
47
70
|
end
|
|
48
71
|
|
|
49
72
|
# Add a document with its embedding and optional metadata.
|
|
73
|
+
#
|
|
74
|
+
# @param id [String, Integer] the document's primary key
|
|
75
|
+
# @param embedding [Array<Numeric>] the vector embedding
|
|
76
|
+
# @param content [String, nil] optional text content
|
|
77
|
+
# @param metadata [Hash{String, Symbol => Object}] additional fields to store
|
|
78
|
+
# @return [Array] write results from the collection
|
|
79
|
+
#
|
|
80
|
+
# @example
|
|
81
|
+
# store.add("doc-1", embedding: [0.1, 0.2, 0.3], content: "Hello")
|
|
50
82
|
def add(id, embedding:, content: nil, metadata: {})
|
|
51
83
|
doc = Zvec::Doc.new(pk: id, schema: @schema)
|
|
52
84
|
doc[@vector_field] = embedding
|
|
@@ -55,8 +87,20 @@ module Zvec
|
|
|
55
87
|
@collection.insert(doc)
|
|
56
88
|
end
|
|
57
89
|
|
|
58
|
-
# Batch-add documents.
|
|
59
|
-
#
|
|
90
|
+
# Batch-add multiple documents at once.
|
|
91
|
+
#
|
|
92
|
+
# @param docs [Array<Hash>] documents, each containing:
|
|
93
|
+
# * +:id+ [String, Integer] -- primary key (required)
|
|
94
|
+
# * +:embedding+ [Array<Numeric>] -- the vector (required)
|
|
95
|
+
# * +:content+ [String, nil] -- optional text content
|
|
96
|
+
# * +:metadata+ [Hash, nil] -- optional additional fields
|
|
97
|
+
# @return [Array] write results from the collection
|
|
98
|
+
#
|
|
99
|
+
# @example
|
|
100
|
+
# store.add_many([
|
|
101
|
+
# { id: "a", embedding: [0.1, 0.2], content: "Hello" },
|
|
102
|
+
# { id: "b", embedding: [0.3, 0.4], content: "World" },
|
|
103
|
+
# ])
|
|
60
104
|
def add_many(docs)
|
|
61
105
|
zvec_docs = docs.map do |d|
|
|
62
106
|
doc = Zvec::Doc.new(pk: d[:id], schema: @schema)
|
|
@@ -69,6 +113,22 @@ module Zvec
|
|
|
69
113
|
end
|
|
70
114
|
|
|
71
115
|
# Search for similar vectors.
|
|
116
|
+
#
|
|
117
|
+
# @param query_vector [Array<Numeric>] the query vector
|
|
118
|
+
# @param top_k [Integer] maximum number of results (default: 10)
|
|
119
|
+
# @param filter [String, nil] optional filter expression
|
|
120
|
+
# (see {Zvec::VectorQuery} for filter syntax)
|
|
121
|
+
# @return [Array<Hash>] results, each containing:
|
|
122
|
+
# * +:id+ [String] -- document primary key
|
|
123
|
+
# * +:score+ [Float] -- similarity score
|
|
124
|
+
# * +:content+ [String, nil] -- the content field value
|
|
125
|
+
# * +:metadata+ [Hash] -- all other stored fields
|
|
126
|
+
#
|
|
127
|
+
# @example
|
|
128
|
+
# results = store.search([0.1, 0.2, 0.3], top_k: 5)
|
|
129
|
+
# results.first[:id] #=> "doc-1"
|
|
130
|
+
# results.first[:score] #=> 0.95
|
|
131
|
+
# results.first[:content] #=> "Hello"
|
|
72
132
|
def search(query_vector, top_k: 10, filter: nil)
|
|
73
133
|
results = @collection.query(
|
|
74
134
|
field_name: @vector_field,
|
|
@@ -86,20 +146,32 @@ module Zvec
|
|
|
86
146
|
end
|
|
87
147
|
end
|
|
88
148
|
|
|
89
|
-
# Delete documents by
|
|
149
|
+
# Delete documents by primary key(s).
|
|
150
|
+
#
|
|
151
|
+
# @param ids [Array<String, Integer>] one or more primary keys (nested arrays are flattened)
|
|
152
|
+
# @return [Array] write results from the collection
|
|
90
153
|
def delete(*ids)
|
|
91
154
|
@collection.delete(*ids.flatten)
|
|
92
155
|
end
|
|
93
156
|
|
|
94
|
-
# Fetch documents by
|
|
157
|
+
# Fetch documents by primary key(s).
|
|
158
|
+
#
|
|
159
|
+
# @param ids [Array<String, Integer>] one or more primary keys (nested arrays are flattened)
|
|
160
|
+
# @return [Hash{String => Zvec::Doc}] mapping of pk to document
|
|
95
161
|
def fetch(*ids)
|
|
96
162
|
@collection.fetch(*ids.flatten)
|
|
97
163
|
end
|
|
98
164
|
|
|
165
|
+
# Flush pending writes to disk.
|
|
166
|
+
#
|
|
167
|
+
# @return [Object] the result of the underlying collection's +flush+ call
|
|
99
168
|
def flush
|
|
100
169
|
@collection.flush
|
|
101
170
|
end
|
|
102
171
|
|
|
172
|
+
# Return the number of documents in the store.
|
|
173
|
+
#
|
|
174
|
+
# @return [Integer]
|
|
103
175
|
def count
|
|
104
176
|
@collection.doc_count
|
|
105
177
|
end
|
data/lib/zvec/schema.rb
CHANGED
|
@@ -1,68 +1,212 @@
|
|
|
1
1
|
module Zvec
|
|
2
|
+
# Defines the structure of a collection: its name, fields, types, and
|
|
3
|
+
# vector dimensions.
|
|
4
|
+
#
|
|
5
|
+
# Schemas are append-only -- fields can be added during or after
|
|
6
|
+
# initialization but not removed afterward.
|
|
7
|
+
#
|
|
8
|
+
# @example Creating a schema with a DSL block
|
|
9
|
+
# schema = Zvec::Schema.new("articles") do
|
|
10
|
+
# string "title"
|
|
11
|
+
# string "body", nullable: true
|
|
12
|
+
# int32 "year"
|
|
13
|
+
# float "rating"
|
|
14
|
+
# bool "published"
|
|
15
|
+
# vector "embedding", dimension: 384,
|
|
16
|
+
# index: Zvec::Ext::HnswIndexParams.new(Zvec::COSINE)
|
|
17
|
+
# end
|
|
18
|
+
#
|
|
19
|
+
# @example Binary vector field
|
|
20
|
+
# schema = Zvec::Schema.new("hashes") do
|
|
21
|
+
# field "hash_vec", DataTypes::BINARY, dimension: 128
|
|
22
|
+
# end
|
|
23
|
+
#
|
|
24
|
+
# @example Sparse vector field
|
|
25
|
+
# schema = Zvec::Schema.new("sparse_docs") do
|
|
26
|
+
# field "tfidf", DataTypes::SPARSE_VECTOR_FP32, dimension: 30000
|
|
27
|
+
# end
|
|
28
|
+
#
|
|
2
29
|
class Schema
|
|
30
|
+
# @return [Ext::CollectionSchema] the underlying C++ schema object
|
|
3
31
|
attr_reader :ext_schema
|
|
4
32
|
|
|
33
|
+
# Create a new schema.
|
|
34
|
+
#
|
|
35
|
+
# @param name [String, Symbol] the collection name (must be non-empty)
|
|
36
|
+
# @yield optional DSL block evaluated in the schema's context
|
|
37
|
+
# @raise [Zvec::SchemaError] if name is nil or blank
|
|
38
|
+
#
|
|
39
|
+
# @example
|
|
40
|
+
# schema = Zvec::Schema.new("my_collection") do
|
|
41
|
+
# string "title"
|
|
42
|
+
# vector "embedding", dimension: 128
|
|
43
|
+
# end
|
|
5
44
|
def initialize(name, &block)
|
|
6
|
-
|
|
45
|
+
if name.nil? || name.to_s.strip.empty?
|
|
46
|
+
raise SchemaError, "Schema name must be a non-empty string"
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
@ext_schema = Ext::CollectionSchema.new(name.to_s)
|
|
7
50
|
@field_types = {}
|
|
51
|
+
@field_dimensions = {}
|
|
8
52
|
instance_eval(&block) if block
|
|
9
53
|
end
|
|
10
54
|
|
|
55
|
+
# Add a field with an explicit data type.
|
|
56
|
+
#
|
|
57
|
+
# @param name [String, Symbol] the field name (must be non-empty)
|
|
58
|
+
# @param type [Symbol] a DataTypes constant (e.g., +DataTypes::STRING+)
|
|
59
|
+
# @param dimension [Integer, nil] required for vector fields
|
|
60
|
+
# @param nullable [Boolean] whether the field allows null values
|
|
61
|
+
# @param index [Ext::HnswIndexParams, Ext::FlatIndexParams, Ext::IVFIndexParams, nil]
|
|
62
|
+
# optional index parameters for this field
|
|
63
|
+
# @return [self] for method chaining
|
|
64
|
+
# @raise [Zvec::SchemaError] if field name is blank
|
|
65
|
+
#
|
|
66
|
+
# @example
|
|
67
|
+
# schema.field("tags", DataTypes::ARRAY_STRING)
|
|
68
|
+
# schema.field("embedding", DataTypes::VECTOR_FP32, dimension: 128)
|
|
11
69
|
def field(name, type, dimension: nil, nullable: false, index: nil)
|
|
12
70
|
name = name.to_s
|
|
71
|
+
if name.strip.empty?
|
|
72
|
+
raise SchemaError, "Field name must be a non-empty string"
|
|
73
|
+
end
|
|
74
|
+
|
|
13
75
|
fs = Ext::FieldSchema.new(name, type)
|
|
14
76
|
fs.dimension = dimension if dimension
|
|
15
77
|
fs.nullable = nullable
|
|
16
78
|
fs.set_index_params(index) if index
|
|
17
79
|
@ext_schema.add_field(fs)
|
|
18
80
|
@field_types[name] = type
|
|
81
|
+
@field_dimensions[name] = dimension if dimension
|
|
19
82
|
self
|
|
20
83
|
end
|
|
21
84
|
|
|
85
|
+
# Add a dense vector field. Defaults to FP32 precision.
|
|
86
|
+
#
|
|
87
|
+
# @param name [String, Symbol] the field name
|
|
88
|
+
# @param dimension [Integer] the vector dimension (must be > 0)
|
|
89
|
+
# @param type [Symbol] vector data type (default: {DataTypes::VECTOR_FP32}).
|
|
90
|
+
# Also accepts {DataTypes::VECTOR_FP64}, {DataTypes::VECTOR_FP16},
|
|
91
|
+
# or {DataTypes::VECTOR_INT8}.
|
|
92
|
+
# @param index [Ext::HnswIndexParams, Ext::FlatIndexParams, Ext::IVFIndexParams, nil]
|
|
93
|
+
# optional index parameters
|
|
94
|
+
# @return [self]
|
|
95
|
+
# @raise [ArgumentError] if dimension is not a positive integer
|
|
96
|
+
#
|
|
97
|
+
# @example Standard FP32 vector with HNSW index
|
|
98
|
+
# schema.vector "embedding", dimension: 384,
|
|
99
|
+
# index: Ext::HnswIndexParams.new(Zvec::COSINE)
|
|
100
|
+
#
|
|
101
|
+
# @example FP16 vector (half memory)
|
|
102
|
+
# schema.vector "embedding", dimension: 384,
|
|
103
|
+
# type: DataTypes::VECTOR_FP16
|
|
104
|
+
#
|
|
105
|
+
# @example INT8 quantized vector (minimal memory)
|
|
106
|
+
# schema.vector "embedding", dimension: 384,
|
|
107
|
+
# type: DataTypes::VECTOR_INT8
|
|
22
108
|
def vector(name, dimension:, type: DataTypes::VECTOR_FP32, index: nil)
|
|
109
|
+
raise ArgumentError, "Vector dimension must be a positive integer, got #{dimension.inspect}" unless dimension.is_a?(Integer) && dimension > 0
|
|
110
|
+
|
|
23
111
|
field(name, type, dimension: dimension, index: index)
|
|
24
112
|
end
|
|
25
113
|
|
|
114
|
+
# Add a string field.
|
|
115
|
+
#
|
|
116
|
+
# @param name [String, Symbol] the field name
|
|
117
|
+
# @param opts [Hash] options passed to {#field} (+nullable:+, +index:+)
|
|
118
|
+
# @return [self]
|
|
26
119
|
def string(name, **opts)
|
|
27
120
|
field(name, DataTypes::STRING, **opts)
|
|
28
121
|
end
|
|
29
122
|
|
|
123
|
+
# Add a 32-bit integer field.
|
|
124
|
+
#
|
|
125
|
+
# @param name [String, Symbol] the field name
|
|
126
|
+
# @param opts [Hash] options passed to {#field}
|
|
127
|
+
# @return [self]
|
|
30
128
|
def int32(name, **opts)
|
|
31
129
|
field(name, DataTypes::INT32, **opts)
|
|
32
130
|
end
|
|
33
131
|
|
|
132
|
+
# Add a 64-bit integer field.
|
|
133
|
+
#
|
|
134
|
+
# @param name [String, Symbol] the field name
|
|
135
|
+
# @param opts [Hash] options passed to {#field}
|
|
136
|
+
# @return [self]
|
|
34
137
|
def int64(name, **opts)
|
|
35
138
|
field(name, DataTypes::INT64, **opts)
|
|
36
139
|
end
|
|
37
140
|
|
|
141
|
+
# Add a 32-bit float field.
|
|
142
|
+
#
|
|
143
|
+
# @param name [String, Symbol] the field name
|
|
144
|
+
# @param opts [Hash] options passed to {#field}
|
|
145
|
+
# @return [self]
|
|
38
146
|
def float(name, **opts)
|
|
39
147
|
field(name, DataTypes::FLOAT, **opts)
|
|
40
148
|
end
|
|
41
149
|
|
|
150
|
+
# Add a 64-bit double field.
|
|
151
|
+
#
|
|
152
|
+
# @param name [String, Symbol] the field name
|
|
153
|
+
# @param opts [Hash] options passed to {#field}
|
|
154
|
+
# @return [self]
|
|
42
155
|
def double(name, **opts)
|
|
43
156
|
field(name, DataTypes::DOUBLE, **opts)
|
|
44
157
|
end
|
|
45
158
|
|
|
159
|
+
# Add a boolean field.
|
|
160
|
+
#
|
|
161
|
+
# @param name [String, Symbol] the field name
|
|
162
|
+
# @param opts [Hash] options passed to {#field}
|
|
163
|
+
# @return [self]
|
|
46
164
|
def bool(name, **opts)
|
|
47
165
|
field(name, DataTypes::BOOL, **opts)
|
|
48
166
|
end
|
|
49
167
|
|
|
168
|
+
# @return [String] the collection name
|
|
50
169
|
def name
|
|
51
170
|
@ext_schema.name
|
|
52
171
|
end
|
|
53
172
|
|
|
173
|
+
# @return [Array<String>] all field names in this schema
|
|
54
174
|
def field_names
|
|
55
175
|
@ext_schema.field_names
|
|
56
176
|
end
|
|
57
177
|
|
|
178
|
+
# Look up the data type of a field by name.
|
|
179
|
+
#
|
|
180
|
+
# @param name [String, Symbol] the field name
|
|
181
|
+
# @return [Symbol, nil] the data type constant, or nil if not found
|
|
58
182
|
def field_type(name)
|
|
59
183
|
@field_types[name.to_s]
|
|
60
184
|
end
|
|
61
185
|
|
|
186
|
+
# Look up the dimension of a vector field.
|
|
187
|
+
#
|
|
188
|
+
# @param name [String, Symbol] the field name
|
|
189
|
+
# @return [Integer, nil] the dimension, or nil if the field is unknown or has no dimension set
|
|
190
|
+
def field_dimension(name)
|
|
191
|
+
@field_dimensions[name.to_s]
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
# Check whether a field exists in the schema.
|
|
195
|
+
#
|
|
196
|
+
# @param name [String, Symbol] the field name
|
|
197
|
+
# @return [Boolean]
|
|
62
198
|
def has_field?(name)
|
|
63
199
|
@ext_schema.has_field?(name.to_s)
|
|
64
200
|
end
|
|
65
201
|
|
|
202
|
+
# Returns a hash of vector field names to their dimensions.
|
|
203
|
+
#
|
|
204
|
+
# @return [Hash{String => Integer}] e.g. +{"embedding" => 384}+
|
|
205
|
+
def vector_fields_with_dimensions
|
|
206
|
+
@field_dimensions.select { |name, _| DataTypes::VECTOR_TYPES.include?(@field_types[name]) }
|
|
207
|
+
end
|
|
208
|
+
|
|
209
|
+
# @return [String] human-readable representation of the schema
|
|
66
210
|
def to_s
|
|
67
211
|
@ext_schema.to_s
|
|
68
212
|
end
|
data/lib/zvec/version.rb
CHANGED
data/lib/zvec.rb
CHANGED
|
@@ -16,7 +16,20 @@ require_relative "zvec/query"
|
|
|
16
16
|
require_relative "zvec/collection"
|
|
17
17
|
|
|
18
18
|
module Zvec
|
|
19
|
+
# Base error class for all Zvec errors.
|
|
19
20
|
class Error < StandardError; end
|
|
20
21
|
|
|
22
|
+
# Raised when vector dimensions do not match the expected schema dimension.
|
|
23
|
+
class DimensionError < Error; end
|
|
24
|
+
|
|
25
|
+
# Raised for schema definition errors (invalid field names, types, etc.).
|
|
26
|
+
class SchemaError < Error; end
|
|
27
|
+
|
|
28
|
+
# Raised for query construction or execution errors.
|
|
29
|
+
class QueryError < Error; end
|
|
30
|
+
|
|
31
|
+
# Raised for collection lifecycle errors (open/close/reopen issues).
|
|
32
|
+
class CollectionError < Error; end
|
|
33
|
+
|
|
21
34
|
include DataTypes
|
|
22
35
|
end
|