zvec-ruby 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,113 @@
1
+ require "zvec"
2
+ require "active_support/concern"
3
+
4
module Zvec
  module ActiveRecord
    # Rails concern that adds vector search capabilities to ActiveRecord models.
    #
    # Embeddings are kept in a Zvec::RubyLLM::Store, memoized once per model
    # class. This file itself only requires "zvec" and ActiveSupport::Concern,
    # so the store class has to be loaded before the first write or search.
    #
    # Usage:
    #   class Article < ApplicationRecord
    #     include Zvec::ActiveRecord::Vectorize
    #
    #     vectorize :content,
    #       dimensions: 1536,
    #       prefix: "articles",
    #       embed_with: ->(text) { OpenAI.embed(text) }
    #   end
    #
    #   Article.vector_search([0.1, 0.2, ...], top_k: 5)
    #   article.zvec_update_embedding!
    #
    module Vectorize
      extend ActiveSupport::Concern

      class_methods do
        # Declares +field+ as the text attribute whose embedding is kept in sync.
        #
        # Stores the settings in the +zvec_config+ class attribute, registers
        # the after_save / after_destroy callbacks and mixes in
        # InstanceMethods and SearchMethods.
        #
        # @param field [Symbol, String] attribute holding the text to embed
        # @param dimensions [Integer] vector size handed to the store
        # @param prefix [String, nil] directory name of the store; defaults to +table_name+
        # @param embed_with [#call, nil] callable turning a text into a vector
        # @param metric [Symbol] distance metric handed to the store
        # @param zvec_path [String, nil] explicit store location; defaults to
        #   tmp/zvec/<prefix> under Rails.root when Rails is defined, otherwise
        #   to the relative path tmp/zvec/<prefix>
        def vectorize(field, dimensions:, prefix: nil, embed_with: nil,
                      metric: :cosine, zvec_path: nil)
          prefix ||= table_name
          zvec_path ||= Rails.root.join("tmp", "zvec", prefix).to_s if defined?(Rails)
          zvec_path ||= File.join("tmp", "zvec", prefix)

          class_attribute :zvec_config, instance_writer: false
          self.zvec_config = {
            field: field.to_s,
            dimensions: dimensions,
            prefix: prefix,
            embed_with: embed_with,
            metric: metric,
            zvec_path: zvec_path
          }

          # Re-embed only when the vectorized attribute itself was changed by the save.
          after_save :zvec_update_embedding!, if: -> { saved_change_to_attribute?(zvec_config[:field]) }
          after_destroy :zvec_remove_embedding!

          include InstanceMethods
          extend SearchMethods
        end
      end

      module InstanceMethods
        # Recomputes this record's embedding and writes it to the store.
        #
        # A blank field removes any stored vector instead, so stale content
        # cannot keep matching searches.
        #
        # @raise [Zvec::Error] when the field has text but no +embed_with+ is configured
        # @return [Object, nil] the result of the store flush, or nil for a blank field
        def zvec_update_embedding!
          cfg = self.class.zvec_config
          text = send(cfg[:field])

          if text.blank?
            zvec_remove_embedding!
            return
          end

          embed_fn = cfg[:embed_with]
          raise Zvec::Error, "No embed_with function configured" unless embed_fn

          # Compute the new vector before touching the store so that a failing
          # embedder leaves the previous entry intact.
          embedding = embed_fn.call(text)

          store = self.class.zvec_store
          # Store#add inserts. Drop the entry written by an earlier save first;
          # re-inserting an existing primary key is presumably rejected by the
          # collection (it exposes a separate upsert) -- TODO confirm.
          zvec_remove_embedding!
          store.add(id.to_s, embedding: embedding, content: text)
          store.flush
        end

        # Deletes this record's vector from the store (best effort).
        #
        # @return [Object, nil] the store's delete result, or nil when the
        #   delete raised
        def zvec_remove_embedding!
          self.class.zvec_store.delete(id.to_s)
        rescue StandardError
          # Deliberately swallowed: the usual cause is that the document was
          # never stored in the first place.
          nil
        end

        # Looks this record up in the store.
        #
        # @return [Object, nil] the store's entry for this record's id, if any
        def zvec_embedding
          result = self.class.zvec_store.fetch(id.to_s)
          result[id.to_s]
        end
      end

      module SearchMethods
        # Lazily opens the store configured by +vectorize+; memoized per class.
        def zvec_store
          @zvec_store ||= begin
            cfg = zvec_config
            Zvec::RubyLLM::Store.new(
              cfg[:zvec_path],
              dimension: cfg[:dimensions],
              metric: cfg[:metric]
            )
          end
        end

        # Returns the records nearest to +query+, in the store's ranking order.
        #
        # Each returned record gets a singleton +zvec_score+ method carrying
        # its score. Hits whose id no longer matches a database row are skipped.
        #
        # @param query [Array, String] a ready-made vector, or a text that is
        #   embedded with the configured +embed_with+
        # @param top_k [Integer] maximum number of hits requested from the store
        # @param embed [Boolean] set to false to refuse String queries
        # @raise [ArgumentError] when +query+ cannot be turned into a vector
        # @return [Array] matching model records
        def vector_search(query, top_k: 10, embed: true)
          cfg = zvec_config

          query_vector = if embed && query.is_a?(String) && cfg[:embed_with]
                           cfg[:embed_with].call(query)
                         elsif query.is_a?(Array)
                           query
                         else
                           raise ArgumentError, "query must be a vector Array or a String with embed_with configured"
                         end

          results = zvec_store.search(query_vector, top_k: top_k)
          ids = results.map { |r| r[:id] }
          # Load all hits with one query; store ids are strings, hence the to_s keys.
          records = where(id: ids).index_by { |r| r.id.to_s }

          results.filter_map do |r|
            record = records[r[:id]]
            next unless record
            record.define_singleton_method(:zvec_score) { r[:score] }
            record
          end
        end
      end
    end
  end
end
@@ -0,0 +1,165 @@
1
module Zvec
  # Ruby wrapper around a native Ext::Collection handle.
  #
  # The optional Ruby-side Schema is handed to every Doc built from query and
  # fetch results.
  class Collection
    attr_reader :schema

    # @param ext_collection [Object] native collection all calls delegate to
    # @param schema [Zvec::Schema, nil] schema passed on to result Docs
    def initialize(ext_collection, schema: nil)
      @ext = ext_collection
      @schema = schema
    end

    # Create a new collection and open it.
    #
    # @param path [String] location handed to the native layer
    # @param schema [Zvec::Schema] schema of the new collection
    # @return [Collection] wrapper that remembers +schema+
    def self.create_and_open(path, schema, read_only: false, enable_mmap: true)
      opts = build_options(read_only: read_only, enable_mmap: enable_mmap)
      new(Ext::Collection.create_and_open(path, schema.ext_schema, opts), schema: schema)
    end

    # Open an existing collection.
    #
    # @return [Collection] wrapper without a Ruby-side schema
    def self.open(path, read_only: false, enable_mmap: true)
      opts = build_options(read_only: read_only, enable_mmap: enable_mmap)
      new(Ext::Collection.open(path, opts))
    end

    # Builds the native options object shared by both constructors.
    def self.build_options(read_only:, enable_mmap:)
      Ext::CollectionOptions.new.tap do |opts|
        opts.read_only = read_only
        opts.enable_mmap = enable_mmap
      end
    end
    private_class_method :build_options

    def path
      @ext.path
    end

    def stats
      @ext.stats
    end

    def doc_count
      @ext.stats.doc_count
    end

    # --- DDL ---
    # create_index, drop_index, optimize and flush return self so they chain.

    def create_index(field_name, index_params)
      @ext.create_index(field_name.to_s, index_params)
      self
    end

    def drop_index(field_name)
      @ext.drop_index(field_name.to_s)
      self
    end

    def optimize
      @ext.optimize
      self
    end

    def flush
      @ext.flush
      self
    end

    def destroy
      @ext.destroy
    end

    # --- DML ---
    # insert / upsert / update accept one document or an Array of documents.
    # Zvec::Doc instances are unwrapped; anything else is passed through as is.
    # Each returns the native per-document results and raises Zvec::Error on
    # the first failed entry.

    def insert(docs)
      check_write_results!(@ext.insert(unwrap_docs(docs)))
    end

    def upsert(docs)
      check_write_results!(@ext.upsert(unwrap_docs(docs)))
    end

    def update(docs)
      check_write_results!(@ext.update(unwrap_docs(docs)))
    end

    # Deletes by primary key; keys may be given as a list or as nested arrays
    # and are converted to strings.
    def delete(*pks)
      check_write_results!(@ext.delete_pks(pks.flatten.map(&:to_s)))
    end

    def delete_by_filter(filter)
      @ext.delete_by_filter(filter)
    end

    # --- DQL ---

    # Runs a vector query and wraps every hit in a Doc.
    #
    # @return [Array<Doc>] hits; each Doc carries the hit's score
    def query(field_name:, vector:, topk: 10, filter: nil,
              include_vector: false, output_fields: nil, query_params: nil)
      vq = VectorQuery.new(
        field_name: field_name,
        vector: vector,
        topk: topk,
        filter: filter,
        include_vector: include_vector,
        output_fields: output_fields,
        query_params: query_params
      )
      @ext.query(vq.ext_query).map do |hit|
        # pk / score / doc_id are result bookkeeping, not document fields.
        fields = hit.reject { |key, _| %w[pk score doc_id].include?(key) }
        Doc.new(pk: hit["pk"], fields: fields, schema: @schema)
           .tap { |doc| doc.instance_variable_set(:@score, hit["score"]) }
      end
    end

    # Fetches documents by primary key.
    #
    # @return [Hash{String => Doc}] one Doc per key returned by the native
    #   layer; each Doc knows its own primary key
    def fetch(*pks)
      raw = @ext.fetch(pks.flatten.map(&:to_s))
      raw.to_h { |pk, fields| [pk, Doc.new(pk: pk, fields: fields, schema: @schema)] }
    end

    # Convenience: insert a hash directly
    def add(pk:, **fields)
      insert(Doc.new(pk: pk, fields: fields, schema: @schema))
    end

    # Convenience: search with simpler API
    #
    # @param field [String, Symbol, nil] vector field to search; when nil the
    #   first vector field of the schema is used
    # @raise [Zvec::Error] when no vector field can be found
    def search(vector, field: nil, top_k: 10, filter: nil)
      fname = field&.to_s || default_vector_field
      query(field_name: fname, vector: vector, topk: top_k, filter: filter)
    end

    private

    # Wraps a single document in an Array and unwraps Zvec::Doc instances.
    def unwrap_docs(docs)
      docs = [docs] unless docs.is_a?(Array)
      docs.map { |doc| doc.is_a?(Doc) ? doc.ext_doc : doc }
    end

    # Name of the first vector field, taken from the Ruby schema when present
    # and from the native collection's schema otherwise.
    def default_vector_field
      source = @schema ? @schema.ext_schema : @ext.schema
      vfield = source.vector_fields.first
      raise Error, "No vector fields in schema" unless vfield

      vfield.name
    end

    # Raises on the first failed [ok, message] pair, otherwise returns results.
    def check_write_results!(results)
      results.each do |ok, msg|
        next if ok

        # to_s guards against a failed entry that carries no message at all.
        text = msg.to_s
        raise Error, (text.empty? ? "Write operation failed" : text)
      end
      results
    end
  end
end
@@ -0,0 +1,63 @@
1
module Zvec
  # Ruby-friendly names for the native extension's enum values, plus the
  # lookup tables that map a data type to the typed accessor used on a doc.
  module DataTypes
    types = Ext::DataType
    metrics = Ext::MetricType

    # Scalar types
    BINARY = types::BINARY
    STRING = types::STRING
    BOOL   = types::BOOL
    INT32  = types::INT32
    INT64  = types::INT64
    UINT32 = types::UINT32
    UINT64 = types::UINT64
    FLOAT  = types::FLOAT
    DOUBLE = types::DOUBLE

    # Dense vector types
    VECTOR_FP32 = types::VECTOR_FP32
    VECTOR_FP64 = types::VECTOR_FP64
    VECTOR_FP16 = types::VECTOR_FP16
    VECTOR_INT8 = types::VECTOR_INT8

    # Sparse vector types
    SPARSE_VECTOR_FP32 = types::SPARSE_VECTOR_FP32
    SPARSE_VECTOR_FP16 = types::SPARSE_VECTOR_FP16

    # Array types
    ARRAY_STRING = types::ARRAY_STRING
    ARRAY_INT32  = types::ARRAY_INT32
    ARRAY_INT64  = types::ARRAY_INT64
    ARRAY_FLOAT  = types::ARRAY_FLOAT
    ARRAY_DOUBLE = types::ARRAY_DOUBLE
    ARRAY_BOOL   = types::ARRAY_BOOL

    # Metric types
    L2     = metrics::L2
    IP     = metrics::IP
    COSINE = metrics::COSINE

    # Setter dispatch table: data type -> doc setter method name.
    # Types without an entry here have no typed setter.
    SETTER_FOR = {
      STRING       => :set_string,
      BOOL         => :set_bool,
      INT32        => :set_int32,
      INT64        => :set_int64,
      UINT32       => :set_uint32,
      UINT64       => :set_uint64,
      FLOAT        => :set_float,
      DOUBLE       => :set_double,
      VECTOR_FP32  => :set_float_vector,
      VECTOR_FP64  => :set_double_vector,
      ARRAY_STRING => :set_string_array,
    }.freeze

    # Getter dispatch table: data type -> doc getter method name.
    # NOTE(review): the unsigned types are read through the signed getters
    # although dedicated unsigned setters exist -- confirm this is intended.
    GETTER_FOR = {
      STRING       => :get_string,
      BOOL         => :get_bool,
      INT32        => :get_int32,
      INT64        => :get_int64,
      UINT32       => :get_int32,
      UINT64       => :get_int64,
      FLOAT        => :get_float,
      DOUBLE       => :get_double,
      VECTOR_FP32  => :get_float_vector,
      VECTOR_FP64  => :get_double_vector,
      ARRAY_STRING => :get_string_array,
    }.freeze
  end
end
data/lib/zvec/doc.rb ADDED
@@ -0,0 +1,107 @@
1
module Zvec
  # Ruby wrapper around a native Ext::Doc.
  #
  # Field access goes through typed accessors of the native doc. With a
  # schema the accessor is looked up from the field's declared type; without
  # one (or for fields the schema does not know) the type is derived from the
  # Ruby value.
  class Doc
    attr_reader :ext_doc

    # @param pk [#to_s, nil] primary key; left unset when nil
    # @param fields [Hash, nil] initial field values, written through #set
    # @param schema [Zvec::Schema, nil] schema used to pick typed accessors
    def initialize(pk: nil, fields: {}, schema: nil)
      @ext_doc = Ext::Doc.new
      @ext_doc.pk = pk.to_s if pk
      @schema = schema
      # Populate with or without a schema: #set falls back to value-based type
      # detection, so schema-less docs must not silently lose their fields.
      (fields || {}).each { |name, value| set(name, value) }
    end

    def pk
      @ext_doc.pk
    end

    def pk=(value)
      @ext_doc.pk = value.to_s
    end

    # Score assigned from outside (query results) or, failing that, the
    # native doc's own score.
    def score
      @score || @ext_doc.score
    end

    def [](field_name)
      get(field_name)
    end

    def []=(field_name, value)
      set(field_name, value)
    end

    # Writes +value+ into +field_name+.
    #
    # nil marks the field as null. Otherwise the schema's typed setter is used
    # when one exists for the field; if not, the setter is chosen from the
    # value's class. Values of an unsupported class are ignored.
    def set(field_name, value)
      field_name = field_name.to_s
      return @ext_doc.set_null(field_name) if value.nil?

      type = @schema&.field_type(field_name)
      setter = DataTypes::SETTER_FOR[type] if type
      return @ext_doc.send(setter, field_name, value) if setter

      # Auto-detect type
      case value
      when String then @ext_doc.set_string(field_name, value)
      when Integer then @ext_doc.set_int64(field_name, value)
      when Float then @ext_doc.set_double(field_name, value)
      when TrueClass, FalseClass then @ext_doc.set_bool(field_name, value)
      when Array then set_array(field_name, value)
      end
    end

    # Reads +field_name+.
    #
    # @return [Object, nil] nil when the field is absent or has no value
    def get(field_name)
      field_name = field_name.to_s
      return nil unless @ext_doc.has?(field_name)
      return nil unless @ext_doc.has_value?(field_name)

      type = @schema&.field_type(field_name)
      getter = DataTypes::GETTER_FOR[type] if type
      return @ext_doc.send(getter, field_name) if getter

      # Try common types in order; the first getter that yields a non-nil
      # value wins.
      %i[get_string get_int64 get_float get_double get_bool
         get_float_vector get_string_array].each do |reader|
        val = @ext_doc.send(reader, field_name)
        return val unless val.nil?
      end
      nil
    end

    def field_names
      @ext_doc.field_names
    end

    def empty?
      @ext_doc.empty?
    end

    # @return [Hash] "pk" and "score" plus one entry per field name
    def to_h
      h = { "pk" => pk, "score" => score }
      field_names.each { |f| h[f] = get(f) }
      h
    end

    def to_s
      @ext_doc.to_s
    end

    # Wrap a C++ Doc::Ptr into a Ruby Doc
    def self.from_ext(ext_doc, schema: nil)
      doc = allocate # skip #initialize: the native doc already exists
      doc.instance_variable_set(:@ext_doc, ext_doc)
      doc.instance_variable_set(:@schema, schema)
      doc
    end

    private

    # Array auto-detection, decided by the first element: numeric (or empty)
    # arrays become float vectors, string arrays become string arrays, and
    # anything else is ignored.
    def set_array(field_name, value)
      head = value.first
      if value.empty? || head.is_a?(Float) || head.is_a?(Integer)
        @ext_doc.set_float_vector(field_name, value.map(&:to_f))
      elsif head.is_a?(String)
        @ext_doc.set_string_array(field_name, value)
      end
    end
  end
end
data/lib/zvec/query.rb ADDED
@@ -0,0 +1,28 @@
1
module Zvec
  # Assembles the native Ext::VectorQuery for one vector search.
  class VectorQuery
    attr_reader :ext_query

    # @param field_name [#to_s] vector field to search
    # @param vector [Enumerable] query vector; every element is converted with to_f
    # @param topk [Integer] number of hits to request
    # @param filter [Object, nil] filter, only applied when given
    # @param include_vector [Boolean] passed through to the native query
    # @param output_fields [Enumerable, nil] field names, converted with to_s
    # @param query_params [Object, nil] HNSW, IVF or flat query parameters
    # @raise [ArgumentError] when +query_params+ is of an unknown class
    def initialize(field_name:, vector:, topk: 10, filter: nil,
                   include_vector: false, output_fields: nil, query_params: nil)
      @ext_query = Ext::VectorQuery.new
      @ext_query.field_name = field_name.to_s
      @ext_query.topk = topk
      @ext_query.set_query_vector(vector.map(&:to_f))
      @ext_query.filter = filter if filter
      @ext_query.include_vector = include_vector
      @ext_query.set_output_fields(output_fields.map(&:to_s)) if output_fields
      apply_query_params(query_params) if query_params
    end

    private

    # Hands the parameter object to the setter matching its class.
    def apply_query_params(params)
      case params
      when Ext::HnswQueryParams then @ext_query.set_hnsw_query_params(params)
      when Ext::IVFQueryParams  then @ext_query.set_ivf_query_params(params)
      when Ext::FlatQueryParams then @ext_query.set_flat_query_params(params)
      else raise ArgumentError, "Unknown query_params type: #{params.class}"
      end
    end
  end
end
@@ -0,0 +1,108 @@
1
+ require "zvec"
2
+
3
module Zvec
  module RubyLLM
    # A vector store backend for the ruby_llm gem.
    #
    # Usage with ruby_llm:
    #   store = Zvec::RubyLLM::Store.new("/path/to/db", dimension: 1536)
    #   store.add("doc-1", embedding: [...], metadata: { title: "Hello" })
    #   results = store.search([0.1, 0.2, ...], top_k: 5)
    #
    class Store
      DEFAULT_VECTOR_FIELD = "embedding"
      DEFAULT_CONTENT_FIELD = "content"

      attr_reader :collection, :dimension

      # Opens the collection at +path+, creating it when the directory does
      # not exist yet.
      #
      # @param path [String] collection directory
      # @param dimension [Integer] size of the stored vectors
      # @param metric [Symbol, String] :cosine, :l2 or :ip
      # @param vector_field [#to_s] name of the vector field
      # @param content_field [#to_s] name of the nullable text field
      # @raise [ArgumentError] for an unknown metric
      def initialize(path, dimension:, metric: :cosine, vector_field: DEFAULT_VECTOR_FIELD,
                     content_field: DEFAULT_CONTENT_FIELD)
        @vector_field = vector_field.to_s
        @content_field = content_field.to_s
        @dimension = dimension
        @schema = build_schema(metric_type_for(metric))

        @collection = if Dir.exist?(path)
                        reopen(path)
                      else
                        Zvec::Collection.create_and_open(path, @schema)
                      end
      end

      # Add a document with its embedding and optional metadata.
      #
      # @param id [#to_s] primary key
      # @param embedding [Array] vector stored in the vector field
      # @param content [String, nil] text stored in the content field
      # @param metadata [Hash, nil] extra fields written onto the document
      def add(id, embedding:, content: nil, metadata: {})
        @collection.insert(build_doc(id, embedding: embedding, content: content, metadata: metadata))
      end

      # Batch-add documents.
      # docs: array of { id:, embedding:, content:, metadata: {} }
      def add_many(docs)
        zvec_docs = docs.map do |d|
          build_doc(d[:id], embedding: d[:embedding], content: d[:content], metadata: d[:metadata])
        end
        @collection.insert(zvec_docs)
      end

      # Search for similar vectors.
      #
      # @return [Array<Hash>] one hash per hit with :id, :score, :content and
      #   :metadata (all remaining fields of the hit)
      def search(query_vector, top_k: 10, filter: nil)
        results = @collection.query(
          field_name: @vector_field,
          vector: query_vector,
          topk: top_k,
          filter: filter
        )
        reserved = ["pk", "score", @vector_field, @content_field]
        results.map do |doc|
          {
            id: doc.pk,
            score: doc.score,
            content: doc[@content_field],
            metadata: doc.to_h.reject { |k, _| reserved.include?(k) }
          }
        end
      end

      # Delete documents by IDs.
      def delete(*ids)
        @collection.delete(*ids.flatten)
      end

      # Fetch documents by IDs.
      def fetch(*ids)
        @collection.fetch(*ids.flatten)
      end

      def flush
        @collection.flush
      end

      def count
        @collection.doc_count
      end

      private

      # Translates the metric option into the matching DataTypes constant.
      def metric_type_for(metric)
        case metric.to_sym
        when :cosine then Zvec::DataTypes::COSINE
        when :l2 then Zvec::DataTypes::L2
        when :ip then Zvec::DataTypes::IP
        else raise ArgumentError, "Unknown metric: #{metric}"
        end
      end

      # Schema with a nullable content string and an HNSW-indexed vector.
      def build_schema(metric_type)
        # Schema.new evaluates the block with instance_eval, so instance
        # variables of the store are out of reach inside it; use locals.
        cf = @content_field
        vf = @vector_field
        dim = @dimension
        Zvec::Schema.new("ruby_llm_store") do
          string cf, nullable: true
          vector vf, dimension: dim,
                     index: Zvec::Ext::HnswIndexParams.new(metric_type)
        end
      end

      # Opens an existing collection and attaches the store's schema to it.
      # Collection.open returns a wrapper without a schema, which would make a
      # reopened store behave differently from a freshly created one, so the
      # wrapper is built here with the same options Collection.open defaults to.
      def reopen(path)
        opts = Zvec::Ext::CollectionOptions.new
        opts.read_only = false
        opts.enable_mmap = true
        Zvec::Collection.new(Zvec::Ext::Collection.open(path, opts), schema: @schema)
      end

      # Builds the Zvec::Doc shared by #add and #add_many.
      def build_doc(id, embedding:, content: nil, metadata: nil)
        doc = Zvec::Doc.new(pk: id, schema: @schema)
        doc[@vector_field] = embedding
        doc[@content_field] = content if content
        (metadata || {}).each { |k, v| doc[k] = v }
        doc
      end
    end
  end
end
@@ -0,0 +1,70 @@
1
module Zvec
  # Small DSL for describing a collection's fields.
  #
  # Wraps a native Ext::CollectionSchema and additionally remembers every
  # field's type on the Ruby side so it can be looked up by name.
  class Schema
    attr_reader :ext_schema

    # @param name [String] schema name handed to the native schema
    # @yield optional block, evaluated in the context of the new schema
    def initialize(name, &block)
      @ext_schema = Ext::CollectionSchema.new(name)
      @field_types = {}
      instance_eval(&block) if block_given?
    end

    # Adds a field of an arbitrary type.
    #
    # @param name [#to_s] field name
    # @param type [Object] data type constant
    # @param dimension [Integer, nil] only applied when given
    # @param nullable [Boolean] whether the field may be null
    # @param index [Object, nil] index parameters, only applied when given
    # @return [Schema] self, so calls can be chained
    def field(name, type, dimension: nil, nullable: false, index: nil)
      key = name.to_s
      spec = Ext::FieldSchema.new(key, type)
      spec.dimension = dimension if dimension
      spec.nullable = nullable
      spec.set_index_params(index) if index
      @ext_schema.add_field(spec)
      @field_types[key] = type
      self
    end

    # Adds a vector field (32-bit float vectors unless +type+ says otherwise).
    def vector(name, dimension:, type: DataTypes::VECTOR_FP32, index: nil)
      field(name, type, dimension: dimension, index: index)
    end

    # Scalar shortcuts: string, int32, int64, float, double and bool.
    # Each takes the field name plus any keyword option #field accepts.
    {
      string: :STRING,
      int32: :INT32,
      int64: :INT64,
      float: :FLOAT,
      double: :DOUBLE,
      bool: :BOOL
    }.each do |helper, type_name|
      define_method(helper) do |name, **opts|
        field(name, DataTypes.const_get(type_name), **opts)
      end
    end

    def name
      @ext_schema.name
    end

    def field_names
      @ext_schema.field_names
    end

    # @return [Object, nil] the type recorded for +name+, nil if unknown
    def field_type(name)
      @field_types[name.to_s]
    end

    def has_field?(name)
      @ext_schema.has_field?(name.to_s)
    end

    def to_s
      @ext_schema.to_s
    end
  end
end
@@ -0,0 +1,3 @@
1
module Zvec
  # Gem version string. Frozen so that code holding a reference to the
  # constant cannot mutate it in place.
  VERSION = "0.1.0".freeze
end
data/lib/zvec.rb ADDED
@@ -0,0 +1,22 @@
1
+ require_relative "zvec/version"
2
+
3
+ # Try loading precompiled native extension first (platform gem),
4
+ # then fall back to the compiled-from-source version.
5
+ begin
6
+ ruby_version = RUBY_VERSION[/\d+\.\d+/]
7
+ require "zvec/#{ruby_version}/zvec_ext"
8
+ rescue LoadError
9
+ require "zvec/zvec_ext"
10
+ end
11
+
12
+ require_relative "zvec/data_types"
13
+ require_relative "zvec/schema"
14
+ require_relative "zvec/doc"
15
+ require_relative "zvec/query"
16
+ require_relative "zvec/collection"
17
+
18
module Zvec
  # Base class for the errors raised by the gem's Ruby layer.
  class Error < StandardError
  end

  # Mix the DataTypes constants into Zvec itself so they can also be
  # referenced as Zvec::<NAME>.
  include DataTypes
end