zvec-ruby 0.1.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +21 -0
- data/LICENSE +190 -0
- data/README.md +189 -0
- data/Rakefile +83 -0
- data/examples/basic.rb +63 -0
- data/examples/with_ruby_llm.rb +79 -0
- data/lib/zvec/3.4/zvec_ext.bundle +0 -0
- data/lib/zvec/active_record.rb +113 -0
- data/lib/zvec/collection.rb +165 -0
- data/lib/zvec/data_types.rb +63 -0
- data/lib/zvec/doc.rb +107 -0
- data/lib/zvec/query.rb +28 -0
- data/lib/zvec/ruby_llm.rb +108 -0
- data/lib/zvec/schema.rb +70 -0
- data/lib/zvec/version.rb +3 -0
- data/lib/zvec.rb +22 -0
- data/test/test_active_record.rb +55 -0
- data/test/test_collection.rb +312 -0
- data/test/test_data_types.rb +165 -0
- data/test/test_doc.rb +271 -0
- data/test/test_ext_bindings.rb +313 -0
- data/test/test_helper.rb +170 -0
- data/test/test_query.rb +64 -0
- data/test/test_ruby_llm_store.rb +166 -0
- data/test/test_schema.rb +133 -0
- data/test/test_version.rb +19 -0
- data/zvec.gemspec +43 -0
- metadata +113 -0
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
require "zvec"
|
|
2
|
+
require "active_support/concern"
|
|
3
|
+
|
|
4
|
+
module Zvec
  module ActiveRecord
    # Rails concern that adds vector search capabilities to ActiveRecord models.
    #
    # Usage:
    #   class Article < ApplicationRecord
    #     include Zvec::ActiveRecord::Vectorize
    #
    #     vectorize :content,
    #       dimensions: 1536,
    #       prefix: "articles",
    #       embed_with: ->(text) { OpenAI.embed(text) }
    #   end
    #
    #   Article.vector_search([0.1, 0.2, ...], top_k: 5)
    #   article.zvec_update_embedding!
    #
    module Vectorize
      extend ActiveSupport::Concern

      class_methods do
        # Declares which attribute is embedded and where the vectors are stored.
        #
        # Stores the settings in the +zvec_config+ class attribute, registers
        # an +after_save+ callback that re-embeds the record when +field+
        # changed, and an +after_destroy+ callback that removes its vector.
        #
        # @param field [Symbol, String] attribute whose text is embedded
        # @param dimensions [Integer] dimension passed to the vector store
        # @param prefix [String, nil] storage sub-directory; defaults to +table_name+
        # @param embed_with [#call, nil] callable turning text into a vector
        # @param metric [Symbol] metric name passed to the vector store
        # @param zvec_path [String, nil] explicit storage path; defaults to
        #   tmp/zvec/<prefix> under Rails.root when available, otherwise
        #   relative to the current directory
        def vectorize(field, dimensions:, prefix: nil, embed_with: nil,
                      metric: :cosine, zvec_path: nil)
          prefix ||= table_name
          # Rails may be defined without a booted application, in which case
          # Rails.root is nil; fall back to the relative path then.
          rails_root = Rails.root if defined?(Rails) && Rails.respond_to?(:root)
          zvec_path ||= rails_root.join("tmp", "zvec", prefix).to_s if rails_root
          zvec_path ||= File.join("tmp", "zvec", prefix)

          class_attribute :zvec_config, instance_writer: false
          self.zvec_config = {
            field: field.to_s,
            dimensions: dimensions,
            prefix: prefix,
            embed_with: embed_with,
            metric: metric,
            zvec_path: zvec_path
          }

          after_save :zvec_update_embedding!, if: -> { saved_change_to_attribute?(zvec_config[:field]) }
          after_destroy :zvec_remove_embedding!

          include InstanceMethods
          extend SearchMethods
        end
      end

      # Per-record helpers mixed into the model by +vectorize+.
      module InstanceMethods
        # Embeds the configured attribute and stores the vector under the
        # record's id. Does nothing when the attribute is blank.
        #
        # @raise [Zvec::Error] when no +embed_with+ callable was configured
        def zvec_update_embedding!
          cfg = self.class.zvec_config
          text = public_send(cfg[:field])
          return if text.blank?

          embed_fn = cfg[:embed_with]
          raise Zvec::Error, "No embed_with function configured" unless embed_fn

          embedding = embed_fn.call(text)
          store = self.class.zvec_store
          # Store#add goes through Collection#insert (not upsert), which
          # presumably rejects an already indexed primary key, so drop any
          # previous entry first. The removal is best-effort and a no-op for
          # records that were never indexed.
          zvec_remove_embedding!
          store.add(id.to_s, embedding: embedding, content: text)
          store.flush
        end

        # Removes this record's vector from the store.
        #
        # Best effort: any StandardError (e.g. the document does not exist)
        # is swallowed and nil is returned.
        def zvec_remove_embedding!
          self.class.zvec_store.delete(id.to_s)
        rescue StandardError
          nil
        end

        # Fetches the stored document for this record.
        #
        # @return [Object, nil] the entry keyed by this record's id, or nil
        def zvec_embedding
          result = self.class.zvec_store.fetch(id.to_s)
          result[id.to_s]
        end
      end

      # Class-level helpers the model is extended with by +vectorize+.
      module SearchMethods
        # Lazily builds (and memoizes per class) the backing vector store.
        def zvec_store
          @zvec_store ||= begin
            # Deliberate lazy load: "zvec" itself does not load the RubyLLM
            # store, so make sure the constant exists before using it.
            require "zvec/ruby_llm"
            cfg = zvec_config
            Zvec::RubyLLM::Store.new(
              cfg[:zvec_path],
              dimension: cfg[:dimensions],
              metric: cfg[:metric]
            )
          end
        end

        # Runs a similarity search and returns matching records in the order
        # the store ranked them. Each record gains a +zvec_score+ singleton
        # method returning its score. Hits without a database row are skipped.
        #
        # @param query [Array, String] a vector, or text to embed with +embed_with+
        # @param top_k [Integer] number of hits requested from the store
        # @param embed [Boolean] when false, String queries are rejected
        # @raise [ArgumentError] when +query+ cannot be turned into a vector
        def vector_search(query, top_k: 10, embed: true)
          cfg = zvec_config

          query_vector = if embed && query.is_a?(String) && cfg[:embed_with]
                           cfg[:embed_with].call(query)
                         elsif query.is_a?(Array)
                           query
                         else
                           raise ArgumentError, "query must be a vector Array or a String with embed_with configured"
                         end

          results = zvec_store.search(query_vector, top_k: top_k)
          ids = results.map { |r| r[:id] }
          # Store ids are strings, so index the records by stringified id.
          records = where(id: ids).index_by { |r| r.id.to_s }

          results.filter_map do |r|
            record = records[r[:id]]
            next unless record

            record.define_singleton_method(:zvec_score) { r[:score] }
            record
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
module Zvec
  # Ruby wrapper around a native Ext::Collection handle.
  #
  # An optional Zvec::Schema is kept alongside the handle and handed to every
  # Doc this class builds.
  class Collection
    attr_reader :schema

    # @param ext_collection [Object] native collection handle
    # @param schema [Zvec::Schema, nil] schema used when building Doc objects
    def initialize(ext_collection, schema: nil)
      @ext = ext_collection
      @schema = schema
    end

    # Create a new collection and open it.
    def self.create_and_open(path, schema, read_only: false, enable_mmap: true)
      ext = Ext::Collection.create_and_open(path, schema.ext_schema,
                                            build_options(read_only, enable_mmap))
      new(ext, schema: schema)
    end

    # Open an existing collection.
    #
    # @param schema [Zvec::Schema, nil] optional schema to attach to the
    #   wrapper; defaults to nil, which matches the previous behaviour
    def self.open(path, read_only: false, enable_mmap: true, schema: nil)
      ext = Ext::Collection.open(path, build_options(read_only, enable_mmap))
      new(ext, schema: schema)
    end

    # Builds the native options object shared by .open and .create_and_open.
    def self.build_options(read_only, enable_mmap)
      opts = Ext::CollectionOptions.new
      opts.read_only = read_only
      opts.enable_mmap = enable_mmap
      opts
    end
    private_class_method :build_options

    def path
      @ext.path
    end

    def stats
      @ext.stats
    end

    def doc_count
      @ext.stats.doc_count
    end

    # --- DDL ---

    # @return [self]
    def create_index(field_name, index_params)
      @ext.create_index(field_name.to_s, index_params)
      self
    end

    # @return [self]
    def drop_index(field_name)
      @ext.drop_index(field_name.to_s)
      self
    end

    # @return [self]
    def optimize
      @ext.optimize
      self
    end

    # @return [self]
    def flush
      @ext.flush
      self
    end

    def destroy
      @ext.destroy
    end

    # --- DML ---

    # Each write accepts a single document or an Array of documents; Doc
    # instances are unwrapped to their native object, anything else is passed
    # through unchanged.
    #
    # @raise [Zvec::Error] when any per-document result reports failure
    # @return [Object] the raw per-document results from the extension
    def insert(docs)
      write_docs(:insert, docs)
    end

    # (see #insert)
    def upsert(docs)
      write_docs(:upsert, docs)
    end

    # (see #insert)
    def update(docs)
      write_docs(:update, docs)
    end

    # Deletes documents by primary key. Keys may be given as a list or as
    # arrays; they are flattened and stringified.
    #
    # @raise [Zvec::Error] when any per-key result reports failure
    def delete(*pks)
      check_write_results!(@ext.delete_pks(pks.flatten.map(&:to_s)))
    end

    def delete_by_filter(filter)
      @ext.delete_by_filter(filter)
    end

    # --- DQL ---

    # Runs a vector query and wraps every raw result hash in a Doc.
    #
    # The "pk", "score" and "doc_id" entries are not copied into the Doc's
    # fields; "pk" becomes the Doc's pk and "score" its score.
    #
    # @return [Array<Zvec::Doc>]
    def query(field_name:, vector:, topk: 10, filter: nil,
              include_vector: false, output_fields: nil, query_params: nil)
      vq = VectorQuery.new(
        field_name: field_name,
        vector: vector,
        topk: topk,
        filter: filter,
        include_vector: include_vector,
        output_fields: output_fields,
        query_params: query_params
      )
      @ext.query(vq.ext_query).map do |h|
        Doc.new(
          pk: h["pk"],
          fields: h.reject { |k, _| %w[pk score doc_id].include?(k) },
          schema: @schema
        ).tap { |d| d.instance_variable_set(:@score, h["score"]) }
      end
    end

    # Fetches documents by primary key.
    #
    # @return [Hash{String => Zvec::Doc}] docs keyed by the keys the
    #   extension returned; each Doc carries that key as its pk
    def fetch(*pks)
      raw = @ext.fetch(pks.flatten.map(&:to_s))
      raw.to_h do |pk, fields|
        [pk, Doc.new(pk: pk, fields: fields, schema: @schema)]
      end
    end

    # Convenience: insert a hash directly
    def add(pk:, **fields)
      insert(Doc.new(pk: pk, fields: fields, schema: @schema))
    end

    # Convenience: search with simpler API
    #
    # When +field+ is omitted the first vector field of the schema is used
    # (the attached Zvec::Schema if present, otherwise the native schema).
    #
    # @raise [Zvec::Error] when no vector field can be found
    def search(vector, field: nil, top_k: 10, filter: nil)
      fname = field&.to_s || default_vector_field_name
      query(field_name: fname, vector: vector, topk: top_k, filter: filter)
    end

    private

    # Normalizes +docs+ to native documents and runs the named write.
    def write_docs(operation, docs)
      docs = [docs] unless docs.is_a?(Array)
      ext_docs = docs.map { |d| d.is_a?(Doc) ? d.ext_doc : d }
      check_write_results!(@ext.public_send(operation, ext_docs))
    end

    # Name of the first vector field known to the schema.
    def default_vector_field_name
      ext_schema = @schema ? @schema.ext_schema : @ext.schema
      vfield = ext_schema.vector_fields.first
      raise Error, "No vector fields in schema" unless vfield

      vfield.name
    end

    # Raises on the first failed (ok, message) pair; a missing or empty
    # message falls back to a generic one.
    def check_write_results!(results)
      results.each do |ok, msg|
        next if ok

        text = msg.to_s
        raise Error, (text.empty? ? "Write operation failed" : text)
      end
      results
    end
  end
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
module Zvec
  # Ruby-side aliases for the enum values exposed by the native extension,
  # plus the lookup tables Doc uses to pick a typed setter/getter.
  module DataTypes
    # Scalar types
    BINARY = Ext::DataType::BINARY
    STRING = Ext::DataType::STRING
    BOOL   = Ext::DataType::BOOL
    INT32  = Ext::DataType::INT32
    INT64  = Ext::DataType::INT64
    UINT32 = Ext::DataType::UINT32
    UINT64 = Ext::DataType::UINT64
    FLOAT  = Ext::DataType::FLOAT
    DOUBLE = Ext::DataType::DOUBLE

    # Dense vector types
    VECTOR_FP32 = Ext::DataType::VECTOR_FP32
    VECTOR_FP64 = Ext::DataType::VECTOR_FP64
    VECTOR_FP16 = Ext::DataType::VECTOR_FP16
    VECTOR_INT8 = Ext::DataType::VECTOR_INT8

    # Sparse vector types
    SPARSE_VECTOR_FP32 = Ext::DataType::SPARSE_VECTOR_FP32
    SPARSE_VECTOR_FP16 = Ext::DataType::SPARSE_VECTOR_FP16

    # Array types
    ARRAY_STRING = Ext::DataType::ARRAY_STRING
    ARRAY_INT32  = Ext::DataType::ARRAY_INT32
    ARRAY_INT64  = Ext::DataType::ARRAY_INT64
    ARRAY_FLOAT  = Ext::DataType::ARRAY_FLOAT
    ARRAY_DOUBLE = Ext::DataType::ARRAY_DOUBLE
    ARRAY_BOOL   = Ext::DataType::ARRAY_BOOL

    # Metric types
    L2     = Ext::MetricType::L2
    IP     = Ext::MetricType::IP
    COSINE = Ext::MetricType::COSINE

    # DataType -> name of the Doc setter method handling that type.
    # Types without an entry have no typed setter here.
    SETTER_FOR = {
      STRING       => :set_string,
      BOOL         => :set_bool,
      INT32        => :set_int32,
      INT64        => :set_int64,
      UINT32       => :set_uint32,
      UINT64       => :set_uint64,
      FLOAT        => :set_float,
      DOUBLE       => :set_double,
      VECTOR_FP32  => :set_float_vector,
      VECTOR_FP64  => :set_double_vector,
      ARRAY_STRING => :set_string_array
    }.freeze

    # DataType -> name of the Doc getter method handling that type.
    # NOTE(review): the unsigned types are read through the signed getters
    # while SETTER_FOR writes them through set_uint32/set_uint64 - confirm
    # this is intended and that large unsigned values round-trip.
    GETTER_FOR = {
      STRING       => :get_string,
      BOOL         => :get_bool,
      INT32        => :get_int32,
      INT64        => :get_int64,
      UINT32       => :get_int32,
      UINT64       => :get_int64,
      FLOAT        => :get_float,
      DOUBLE       => :get_double,
      VECTOR_FP32  => :get_float_vector,
      VECTOR_FP64  => :get_double_vector,
      ARRAY_STRING => :get_string_array
    }.freeze
  end
end
|
data/lib/zvec/doc.rb
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
module Zvec
  # Ruby wrapper around a native Ext::Doc.
  #
  # Field access goes through the typed setter/getter named by
  # DataTypes::SETTER_FOR / GETTER_FOR when a schema knows the field's type,
  # and falls back to detection based on the Ruby value otherwise.
  class Doc
    attr_reader :ext_doc

    # @param pk [#to_s, nil] primary key; stored as a String when given
    # @param fields [Hash, nil] initial field values, applied through #set
    # @param schema [#field_type, nil] schema used to resolve field types
    def initialize(pk: nil, fields: {}, schema: nil)
      @ext_doc = Ext::Doc.new
      @ext_doc.pk = pk.to_s if pk
      @schema = schema
      @score = nil
      # Apply the fields even without a schema: #set auto-detects the type
      # in that case, so the values must not be silently dropped.
      fields&.each { |name, value| set(name, value) }
    end

    def pk
      @ext_doc.pk
    end

    def pk=(value)
      @ext_doc.pk = value.to_s
    end

    # @return [Object] the explicitly assigned score if any, otherwise the
    #   native document's score
    def score
      @score || @ext_doc.score
    end

    def [](field_name)
      get(field_name)
    end

    def []=(field_name, value)
      set(field_name, value)
    end

    # Writes +value+ to +field_name+.
    #
    # nil marks the field as null. Otherwise the schema's typed setter is
    # used when available; failing that the type is derived from the value:
    # String, Integer (int64), Float (double), true/false, Array of numbers
    # (float vector; an empty Array counts as one) or Array of Strings.
    # Values of any other kind are ignored.
    def set(field_name, value)
      field_name = field_name.to_s
      return @ext_doc.set_null(field_name) if value.nil?

      setter = typed_accessor(DataTypes::SETTER_FOR, field_name)
      return @ext_doc.public_send(setter, field_name, value) if setter

      # Auto-detect type
      case value
      when String then @ext_doc.set_string(field_name, value)
      when Integer then @ext_doc.set_int64(field_name, value)
      when Float then @ext_doc.set_double(field_name, value)
      when TrueClass, FalseClass then @ext_doc.set_bool(field_name, value)
      when Array
        # Only the first element is inspected to classify the array.
        if value.empty? || value.first.is_a?(Float) || value.first.is_a?(Integer)
          @ext_doc.set_float_vector(field_name, value.map(&:to_f))
        elsif value.first.is_a?(String)
          @ext_doc.set_string_array(field_name, value)
        end
      end
    end

    # Reads +field_name+.
    #
    # @return [Object, nil] nil when the field is absent or has no value;
    #   otherwise the result of the schema's typed getter, or of the first
    #   generic getter that returns a non-nil value
    def get(field_name)
      field_name = field_name.to_s
      return nil unless @ext_doc.has?(field_name)
      return nil unless @ext_doc.has_value?(field_name)

      getter = typed_accessor(DataTypes::GETTER_FOR, field_name)
      return @ext_doc.public_send(getter, field_name) if getter

      # Try common types in order
      %i[get_string get_int64 get_float get_double get_bool
         get_float_vector get_string_array].each do |m|
        val = @ext_doc.public_send(m, field_name)
        return val unless val.nil?
      end
      nil
    end

    def field_names
      @ext_doc.field_names
    end

    def empty?
      @ext_doc.empty?
    end

    # @return [Hash{String => Object}] "pk", "score" and every field value
    def to_h
      h = { "pk" => pk, "score" => score }
      field_names.each { |f| h[f] = get(f) }
      h
    end

    def to_s
      @ext_doc.to_s
    end

    # Wrap a C++ Doc::Ptr into a Ruby Doc
    #
    # Bypasses #initialize so no new native document is created.
    def self.from_ext(ext_doc, schema: nil)
      doc = allocate
      doc.instance_variable_set(:@ext_doc, ext_doc)
      doc.instance_variable_set(:@schema, schema)
      doc
    end

    private

    # Accessor method name for +field_name+ from +table+, or nil when there
    # is no schema, the schema does not know the field, or the type has no
    # entry in the table.
    def typed_accessor(table, field_name)
      type = @schema&.field_type(field_name)
      type && table[type]
    end
  end
end
|
data/lib/zvec/query.rb
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
module Zvec
  # Translates Ruby keyword arguments into a native Ext::VectorQuery,
  # exposed through #ext_query.
  class VectorQuery
    attr_reader :ext_query

    # @param field_name [#to_s] vector field to search
    # @param vector [Enumerable] query vector; every element is converted with to_f
    # @param topk [Integer] value assigned to the native query's topk
    # @param filter [Object, nil] filter, only assigned when truthy
    # @param include_vector [Boolean] value assigned to include_vector
    # @param output_fields [Enumerable, nil] field names, stringified when given
    # @param query_params [Object, nil] Hnsw/IVF/Flat query params instance
    # @raise [ArgumentError] when +query_params+ is of an unknown class
    def initialize(field_name:, vector:, topk: 10, filter: nil,
                   include_vector: false, output_fields: nil, query_params: nil)
      native = @ext_query = Ext::VectorQuery.new
      native.field_name = field_name.to_s
      native.topk = topk
      native.set_query_vector(vector.map(&:to_f))
      native.filter = filter if filter
      native.include_vector = include_vector
      native.set_output_fields(output_fields.map(&:to_s)) if output_fields
      apply_query_params(query_params) if query_params
    end

    private

    # Hands +params+ to the setter matching its index type.
    def apply_query_params(params)
      setter =
        case params
        when Ext::HnswQueryParams then :set_hnsw_query_params
        when Ext::IVFQueryParams then :set_ivf_query_params
        when Ext::FlatQueryParams then :set_flat_query_params
        else raise ArgumentError, "Unknown query_params type: #{params.class}"
        end
      @ext_query.public_send(setter, params)
    end
  end
end
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
require "zvec"
|
|
2
|
+
|
|
3
|
+
module Zvec
  module RubyLLM
    # A vector store backend for the ruby_llm gem.
    #
    # Usage with ruby_llm:
    #   store = Zvec::RubyLLM::Store.new("/path/to/db", dimension: 1536)
    #   store.add("doc-1", embedding: [...], metadata: { title: "Hello" })
    #   results = store.search([0.1, 0.2, ...], top_k: 5)
    #
    class Store
      DEFAULT_VECTOR_FIELD = "embedding"
      DEFAULT_CONTENT_FIELD = "content"

      attr_reader :collection, :dimension

      # Opens the collection at +path+ when that directory exists, otherwise
      # creates it with a schema holding a nullable string content field and
      # an HNSW-indexed vector field.
      #
      # @param path [String] collection directory
      # @param dimension [Integer] dimension of the vector field
      # @param metric [#to_sym] :cosine, :l2 or :ip
      # @param vector_field [#to_s] name of the vector field
      # @param content_field [#to_s] name of the content field
      # @raise [ArgumentError] for an unknown metric
      def initialize(path, dimension:, metric: :cosine, vector_field: DEFAULT_VECTOR_FIELD,
                     content_field: DEFAULT_CONTENT_FIELD)
        @vector_field = vector_field.to_s
        @content_field = content_field.to_s
        @dimension = dimension

        metric_type = case metric.to_sym
                      when :cosine then Zvec::DataTypes::COSINE
                      when :l2 then Zvec::DataTypes::L2
                      when :ip then Zvec::DataTypes::IP
                      else raise ArgumentError, "Unknown metric: #{metric}"
                      end

        # The schema block is instance_eval'd on the Schema, so instance
        # variables of this Store are not visible inside it; copy what the
        # block needs into locals first.
        cf = @content_field
        vf = @vector_field
        dim = dimension
        @schema = Zvec::Schema.new("ruby_llm_store") do
          string cf, nullable: true
          vector vf, dimension: dim,
                     index: Zvec::Ext::HnswIndexParams.new(metric_type)
        end

        @collection = if Dir.exist?(path)
                        Zvec::Collection.open(path)
                      else
                        Zvec::Collection.create_and_open(path, @schema)
                      end
      end

      # Add a document with its embedding and optional metadata.
      #
      # @param id [#to_s] primary key
      # @param embedding [Array] vector stored in the vector field
      # @param content [String, nil] stored in the content field when truthy
      # @param metadata [Hash, nil] extra fields; nil is treated as empty
      def add(id, embedding:, content: nil, metadata: {})
        @collection.insert(build_doc(id, embedding, content, metadata))
      end

      # Batch-add documents.
      # docs: array of { id:, embedding:, content:, metadata: {} }
      def add_many(docs)
        zvec_docs = docs.map do |d|
          build_doc(d[:id], d[:embedding], d[:content], d[:metadata])
        end
        @collection.insert(zvec_docs)
      end

      # Search for similar vectors.
      #
      # @return [Array<Hash>] one hash per hit with :id, :score, :content
      #   and :metadata (every remaining field of the document)
      def search(query_vector, top_k: 10, filter: nil)
        results = @collection.query(
          field_name: @vector_field,
          vector: query_vector,
          topk: top_k,
          filter: filter
        )
        reserved = ["pk", "score", @vector_field, @content_field]
        results.map do |doc|
          {
            id: doc.pk,
            score: doc.score,
            content: doc[@content_field],
            metadata: doc.to_h.reject { |k, _| reserved.include?(k) }
          }
        end
      end

      # Delete documents by IDs.
      def delete(*ids)
        @collection.delete(*ids.flatten)
      end

      # Fetch documents by IDs.
      def fetch(*ids)
        @collection.fetch(*ids.flatten)
      end

      def flush
        @collection.flush
      end

      def count
        @collection.doc_count
      end

      private

      # Builds the Zvec::Doc shared by #add and #add_many.
      def build_doc(id, embedding, content, metadata)
        doc = Zvec::Doc.new(pk: id, schema: @schema)
        doc[@vector_field] = embedding
        doc[@content_field] = content if content
        (metadata || {}).each { |k, v| doc[k] = v }
        doc
      end
    end
  end
end
|
data/lib/zvec/schema.rb
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
module Zvec
  # Builder around a native Ext::CollectionSchema.
  #
  # Besides forwarding field definitions to the extension, it records each
  # field's type on the Ruby side so #field_type can answer by name.
  class Schema
    attr_reader :ext_schema

    # @param name [String] schema name handed to the native schema
    # @yield optional block, instance_eval'd on the new schema so the field
    #   helpers can be called without a receiver
    def initialize(name, &block)
      @ext_schema = Ext::CollectionSchema.new(name)
      @field_types = {}
      instance_eval(&block) if block_given?
    end

    # Defines a field and registers it with the native schema.
    #
    # @param name [#to_s] field name
    # @param type [Object] data type value
    # @param dimension [Integer, nil] only assigned when truthy
    # @param nullable [Boolean] always assigned
    # @param index [Object, nil] index params, only assigned when truthy
    # @return [self]
    def field(name, type, dimension: nil, nullable: false, index: nil)
      key = name.to_s
      field_schema = Ext::FieldSchema.new(key, type)
      field_schema.dimension = dimension if dimension
      field_schema.nullable = nullable
      field_schema.set_index_params(index) if index
      @ext_schema.add_field(field_schema)
      @field_types[key] = type
      self
    end

    # Defines a vector field (VECTOR_FP32 unless +type+ says otherwise).
    def vector(name, dimension:, type: DataTypes::VECTOR_FP32, index: nil)
      field(name, type, dimension: dimension, index: index)
    end

    # Scalar helpers: each one is #field with the type filled in, e.g.
    # string(name, **opts) == field(name, DataTypes::STRING, **opts).
    {
      string: :STRING,
      int32: :INT32,
      int64: :INT64,
      float: :FLOAT,
      double: :DOUBLE,
      bool: :BOOL
    }.each do |helper, type_const|
      define_method(helper) do |name, **opts|
        field(name, DataTypes.const_get(type_const), **opts)
      end
    end

    def name
      @ext_schema.name
    end

    def field_names
      @ext_schema.field_names
    end

    # @return [Object, nil] the type recorded for +name+, nil when unknown
    def field_type(name)
      @field_types[name.to_s]
    end

    def has_field?(name)
      @ext_schema.has_field?(name.to_s)
    end

    def to_s
      @ext_schema.to_s
    end
  end
end
|
data/lib/zvec/version.rb
ADDED
data/lib/zvec.rb
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
require_relative "zvec/version"
|
|
2
|
+
|
|
3
|
+
# Try loading precompiled native extension first (platform gem),
|
|
4
|
+
# then fall back to the compiled-from-source version.
|
|
5
|
+
begin
|
|
6
|
+
ruby_version = RUBY_VERSION[/\d+\.\d+/]
|
|
7
|
+
require "zvec/#{ruby_version}/zvec_ext"
|
|
8
|
+
rescue LoadError
|
|
9
|
+
require "zvec/zvec_ext"
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
require_relative "zvec/data_types"
|
|
13
|
+
require_relative "zvec/schema"
|
|
14
|
+
require_relative "zvec/doc"
|
|
15
|
+
require_relative "zvec/query"
|
|
16
|
+
require_relative "zvec/collection"
|
|
17
|
+
|
|
18
|
+
module Zvec
  # Error class raised by the Ruby wrappers (for example on failed writes).
  class Error < StandardError
  end

  # Mix in DataTypes so its constants can also be reached through Zvec.
  include DataTypes
end
|