pgvector 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/LICENSE.txt +1 -1
- data/README.md +28 -3
- data/lib/pgvector/half_vector.rb +19 -0
- data/lib/pgvector/pg.rb +26 -4
- data/lib/pgvector/sparse_vector.rb +87 -0
- data/lib/pgvector/vector.rb +25 -0
- data/lib/pgvector/version.rb +1 -1
- data/lib/pgvector.rb +15 -2
- data/lib/sequel/plugins/pgvector.rb +6 -0
- metadata +7 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 07a80636c13841d2fa97f8b740feb095c1c2f94fa3691b374d1472bba918bddf
|
4
|
+
data.tar.gz: 50c9e67781fbbe23fa3dbe2f87790025f18d3686e412b46f62d1cb0d2995ecb3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f0aa5b733e1bc4022c2052be6b04aa0656bf306f588baf2907827674aeadc7dc52507895043049246634999dea37de629c19a164bd289fbd979eb628aaab0c33
|
7
|
+
data.tar.gz: f52e2e9c1c074c4f809fe78d6926bcb630694d79199e288ebb33d136a05191c4b2fff6038f8d7fc56d4291f949caa1b536d942aa7d0912be2daccf5a7ee23f83
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
## 0.3.0 (2024-06-25)
|
2
|
+
|
3
|
+
- Added support for `halfvec` and `sparsevec` types
|
4
|
+
- Added `taxicab`, `hamming`, and `jaccard` distances for Sequel
|
5
|
+
- Dropped support for Ruby < 3.1
|
6
|
+
|
1
7
|
## 0.2.2 (2023-10-03)
|
2
8
|
|
3
9
|
- Added `nearest_neighbors` method to datasets with Sequel
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -6,7 +6,7 @@ Supports [pg](https://github.com/ged/ruby-pg) and [Sequel](https://github.com/je
|
|
6
6
|
|
7
7
|
For Rails, check out [Neighbor](https://github.com/ankane/neighbor)
|
8
8
|
|
9
|
-
[Build Status](https://github.com/pgvector/pgvector-ruby/actions)
|
10
10
|
|
11
11
|
## Installation
|
12
12
|
|
@@ -26,6 +26,7 @@ Or check out some examples:
|
|
26
26
|
- [Embeddings](examples/openai_embeddings.rb) with OpenAI
|
27
27
|
- [User-based recommendations](examples/disco_user_recs.rb) with Disco
|
28
28
|
- [Item-based recommendations](examples/disco_item_recs.rb) with Disco
|
29
|
+
- [Bulk loading](examples/bulk_loading.rb) with `COPY`
|
29
30
|
|
30
31
|
## pg
|
31
32
|
|
@@ -35,7 +36,7 @@ Enable the extension
|
|
35
36
|
conn.exec("CREATE EXTENSION IF NOT EXISTS vector")
|
36
37
|
```
|
37
38
|
|
38
|
-
|
39
|
+
Optionally enable type casting for results
|
39
40
|
|
40
41
|
```ruby
|
41
42
|
registry = PG::BasicTypeRegistry.new.define_default_types
|
@@ -43,6 +44,12 @@ Pgvector::PG.register_vector(registry)
|
|
43
44
|
conn.type_map_for_results = PG::BasicTypeMapForResults.new(conn, registry: registry)
|
44
45
|
```
|
45
46
|
|
47
|
+
Create a table
|
48
|
+
|
49
|
+
```ruby
|
50
|
+
conn.exec("CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))")
|
51
|
+
```
|
52
|
+
|
46
53
|
Insert a vector
|
47
54
|
|
48
55
|
```ruby
|
@@ -56,6 +63,16 @@ Get the nearest neighbors to a vector
|
|
56
63
|
conn.exec_params("SELECT * FROM items ORDER BY embedding <-> $1 LIMIT 5", [embedding]).to_a
|
57
64
|
```
|
58
65
|
|
66
|
+
Add an approximate index
|
67
|
+
|
68
|
+
```ruby
|
69
|
+
conn.exec("CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)")
|
70
|
+
# or
|
71
|
+
conn.exec("CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)")
|
72
|
+
```
|
73
|
+
|
74
|
+
Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance
|
75
|
+
|
59
76
|
## Sequel
|
60
77
|
|
61
78
|
Enable the extension
|
@@ -93,7 +110,7 @@ Get the nearest neighbors to a record
|
|
93
110
|
item.nearest_neighbors(:embedding, distance: "euclidean").limit(5)
|
94
111
|
```
|
95
112
|
|
96
|
-
Also supports `inner_product` and `cosine` distance
|
113
|
+
Also supports `inner_product`, `cosine`, `taxicab`, `hamming`, and `jaccard` distance
|
97
114
|
|
98
115
|
Get the nearest neighbors to a vector
|
99
116
|
|
@@ -101,6 +118,14 @@ Get the nearest neighbors to a vector
|
|
101
118
|
Item.nearest_neighbors(:embedding, [1, 1, 1], distance: "euclidean").limit(5)
|
102
119
|
```
|
103
120
|
|
121
|
+
Add an approximate index
|
122
|
+
|
123
|
+
```ruby
|
124
|
+
DB.add_index :items, :embedding, type: "hnsw", opclass: "vector_l2_ops"
|
125
|
+
```
|
126
|
+
|
127
|
+
Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance
|
128
|
+
|
104
129
|
## History
|
105
130
|
|
106
131
|
View the [changelog](https://github.com/pgvector/pgvector-ruby/blob/master/CHANGELOG.md)
|
data/lib/pgvector/half_vector.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
module Pgvector
|
2
|
+
class HalfVector
|
3
|
+
def initialize(data)
|
4
|
+
@data = data.to_a.map(&:to_f)
|
5
|
+
end
|
6
|
+
|
7
|
+
def self.from_text(string)
|
8
|
+
new(string[1..-2].split(",").map(&:to_f))
|
9
|
+
end
|
10
|
+
|
11
|
+
def to_s
|
12
|
+
"[#{@data.to_a.map(&:to_f).join(",")}]"
|
13
|
+
end
|
14
|
+
|
15
|
+
def to_a
|
16
|
+
@data
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/lib/pgvector/pg.rb
CHANGED
@@ -5,14 +5,24 @@ module Pgvector
|
|
5
5
|
def self.register_vector(registry)
|
6
6
|
registry.register_type(0, "vector", nil, TextDecoder::Vector)
|
7
7
|
registry.register_type(1, "vector", nil, BinaryDecoder::Vector)
|
8
|
+
|
9
|
+
# no binary decoder for halfvec since unpack does not have directive for half-precision
|
10
|
+
registry.register_type(0, "halfvec", nil, TextDecoder::Halfvec)
|
11
|
+
|
12
|
+
registry.register_type(0, "sparsevec", nil, TextDecoder::Sparsevec)
|
13
|
+
registry.register_type(1, "sparsevec", nil, BinaryDecoder::Sparsevec)
|
8
14
|
end
|
9
15
|
|
10
16
|
module BinaryDecoder
|
11
17
|
class Vector < ::PG::SimpleDecoder
|
12
18
|
def decode(string, tuple = nil, field = nil)
|
13
|
-
|
14
|
-
|
15
|
-
|
19
|
+
::Pgvector::Vector.from_binary(string).to_a
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
class Sparsevec < ::PG::SimpleDecoder
|
24
|
+
def decode(string, tuple = nil, field = nil)
|
25
|
+
SparseVector.from_binary(string)
|
16
26
|
end
|
17
27
|
end
|
18
28
|
end
|
@@ -20,7 +30,19 @@ module Pgvector
|
|
20
30
|
module TextDecoder
|
21
31
|
class Vector < ::PG::SimpleDecoder
|
22
32
|
def decode(string, tuple = nil, field = nil)
|
23
|
-
Pgvector.decode(string)
|
33
|
+
::Pgvector::Vector.from_text(string).to_a
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
class Halfvec < ::PG::SimpleDecoder
|
38
|
+
def decode(string, tuple = nil, field = nil)
|
39
|
+
HalfVector.from_text(string).to_a
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
class Sparsevec < ::PG::SimpleDecoder
|
44
|
+
def decode(string, tuple = nil, field = nil)
|
45
|
+
SparseVector.from_text(string)
|
24
46
|
end
|
25
47
|
end
|
26
48
|
end
|
data/lib/pgvector/sparse_vector.rb
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
module Pgvector
|
2
|
+
class SparseVector
|
3
|
+
attr_reader :dimensions, :indices, :values
|
4
|
+
|
5
|
+
NO_DEFAULT = Object.new
|
6
|
+
|
7
|
+
def initialize(value, dimensions = NO_DEFAULT)
|
8
|
+
if value.is_a?(Hash)
|
9
|
+
if dimensions == NO_DEFAULT
|
10
|
+
raise ArgumentError, "missing dimensions"
|
11
|
+
end
|
12
|
+
from_hash(value, dimensions)
|
13
|
+
else
|
14
|
+
unless dimensions == NO_DEFAULT
|
15
|
+
raise ArgumentError, "extra argument"
|
16
|
+
end
|
17
|
+
from_array(value)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def to_s
|
22
|
+
"{#{@indices.zip(@values).map { |i, v| "#{i.to_i + 1}:#{v.to_f}" }.join(",")}}/#{@dimensions.to_i}"
|
23
|
+
end
|
24
|
+
|
25
|
+
def to_a
|
26
|
+
arr = Array.new(dimensions, 0.0)
|
27
|
+
@indices.zip(@values) do |i, v|
|
28
|
+
arr[i] = v
|
29
|
+
end
|
30
|
+
arr
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def from_hash(data, dimensions)
|
36
|
+
elements = data.select { |_, v| v != 0 }.sort
|
37
|
+
@dimensions = dimensions.to_i
|
38
|
+
@indices = elements.map { |v| v[0].to_i }
|
39
|
+
@values = elements.map { |v| v[1].to_f }
|
40
|
+
end
|
41
|
+
|
42
|
+
def from_array(arr)
|
43
|
+
arr = arr.to_a
|
44
|
+
@dimensions = arr.size
|
45
|
+
@indices = []
|
46
|
+
@values = []
|
47
|
+
arr.each_with_index do |v, i|
|
48
|
+
if v != 0
|
49
|
+
@indices << i
|
50
|
+
@values << v.to_f
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
class << self
|
56
|
+
def from_text(string)
|
57
|
+
elements, dimensions = string.split("/", 2)
|
58
|
+
indices = []
|
59
|
+
values = []
|
60
|
+
elements[1..-2].split(",").each do |e|
|
61
|
+
index, value = e.split(":", 2)
|
62
|
+
indices << index.to_i - 1
|
63
|
+
values << value.to_f
|
64
|
+
end
|
65
|
+
from_parts(dimensions.to_i, indices, values)
|
66
|
+
end
|
67
|
+
|
68
|
+
def from_binary(string)
|
69
|
+
dim, nnz, unused = string[0, 12].unpack("l>l>l>")
|
70
|
+
raise "expected unused to be 0" if unused != 0
|
71
|
+
indices = string[12, nnz * 4].unpack("l>#{nnz}")
|
72
|
+
values = string[(12 + nnz * 4)..-1].unpack("g#{nnz}")
|
73
|
+
from_parts(dim, indices, values)
|
74
|
+
end
|
75
|
+
|
76
|
+
private
|
77
|
+
|
78
|
+
def from_parts(dimensions, indices, values)
|
79
|
+
vec = allocate
|
80
|
+
vec.instance_variable_set(:@dimensions, dimensions)
|
81
|
+
vec.instance_variable_set(:@indices, indices)
|
82
|
+
vec.instance_variable_set(:@values, values)
|
83
|
+
vec
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
data/lib/pgvector/vector.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
module Pgvector
|
2
|
+
class Vector
|
3
|
+
def initialize(data)
|
4
|
+
@data = data.to_a.map(&:to_f)
|
5
|
+
end
|
6
|
+
|
7
|
+
def self.from_text(string)
|
8
|
+
Vector.new(string[1..-2].split(",").map(&:to_f))
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.from_binary(string)
|
12
|
+
dim, unused = string[0, 4].unpack("nn")
|
13
|
+
raise "expected unused to be 0" if unused != 0
|
14
|
+
Vector.new(string[4..-1].unpack("g#{dim}"))
|
15
|
+
end
|
16
|
+
|
17
|
+
def to_s
|
18
|
+
"[#{@data.to_a.map(&:to_f).join(",")}]"
|
19
|
+
end
|
20
|
+
|
21
|
+
def to_a
|
22
|
+
@data
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
data/lib/pgvector/version.rb
CHANGED
data/lib/pgvector.rb
CHANGED
@@ -1,14 +1,27 @@
|
|
1
1
|
# modules
|
2
|
+
require_relative "pgvector/half_vector"
|
3
|
+
require_relative "pgvector/sparse_vector"
|
4
|
+
require_relative "pgvector/vector"
|
2
5
|
require_relative "pgvector/version"
|
3
6
|
|
4
7
|
module Pgvector
|
5
8
|
autoload :PG, "pgvector/pg"
|
6
9
|
|
7
10
|
def self.encode(data)
|
8
|
-
|
11
|
+
if data.is_a?(SparseVector)
|
12
|
+
data.to_s
|
13
|
+
else
|
14
|
+
Vector.new(data).to_s
|
15
|
+
end
|
9
16
|
end
|
10
17
|
|
11
18
|
def self.decode(string)
|
12
|
-
string[1..-2].split(",").map(&:to_f)
|
19
|
+
if string[0] == "["
|
20
|
+
Vector.from_text(string).to_a
|
21
|
+
elsif string[0] == "{"
|
22
|
+
SparseVector.from_text(string)
|
23
|
+
else
|
24
|
+
string
|
25
|
+
end
|
13
26
|
end
|
14
27
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pgvector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-06-26 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email: andrew@ankane.org
|
@@ -20,7 +20,10 @@ files:
|
|
20
20
|
- LICENSE.txt
|
21
21
|
- README.md
|
22
22
|
- lib/pgvector.rb
|
23
|
+
- lib/pgvector/half_vector.rb
|
23
24
|
- lib/pgvector/pg.rb
|
25
|
+
- lib/pgvector/sparse_vector.rb
|
26
|
+
- lib/pgvector/vector.rb
|
24
27
|
- lib/pgvector/version.rb
|
25
28
|
- lib/sequel/plugins/pgvector.rb
|
26
29
|
homepage: https://github.com/pgvector/pgvector-ruby
|
@@ -35,14 +38,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
35
38
|
requirements:
|
36
39
|
- - ">="
|
37
40
|
- !ruby/object:Gem::Version
|
38
|
-
version: '3'
|
41
|
+
version: '3.1'
|
39
42
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
43
|
requirements:
|
41
44
|
- - ">="
|
42
45
|
- !ruby/object:Gem::Version
|
43
46
|
version: '0'
|
44
47
|
requirements: []
|
45
|
-
rubygems_version: 3.
|
48
|
+
rubygems_version: 3.5.11
|
46
49
|
signing_key:
|
47
50
|
specification_version: 4
|
48
51
|
summary: pgvector support for Ruby
|