pgvector 0.2.2 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/LICENSE.txt +1 -1
- data/README.md +28 -3
- data/lib/pgvector/bit.rb +28 -0
- data/lib/pgvector/half_vector.rb +19 -0
- data/lib/pgvector/pg.rb +41 -4
- data/lib/pgvector/sparse_vector.rb +87 -0
- data/lib/pgvector/vector.rb +25 -0
- data/lib/pgvector/version.rb +1 -1
- data/lib/pgvector.rb +16 -2
- data/lib/sequel/extensions/pgvector.rb +5 -0
- data/lib/sequel/plugins/pgvector.rb +6 -0
- metadata +9 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3d3ef6a53417383ff7f8a2a514df78513dfe9029e524f0f624df8828fb99dba0
|
4
|
+
data.tar.gz: f326a21d3942079cdadc22d9a61367e7a6651ae37c5eb07a3955182f9f569ad0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fa4d6519685d179e3d5712cbcf1e6989ca4f98e5b0902f5061eca2be55451162f199fcc75f19841dbd8ada8c18de69ac6fa39cad545d4b5a6c542e9cdd0109ea
|
7
|
+
data.tar.gz: fd71245492b2ff8a06a0af60dbd12e57e44025f2662d4ea77eb7176aaaa4c4cb3b39c1e159edce92944da5b54fddbb19d1b504acd76a12c4efdb9a0fb6e797da
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
## 0.3.1 (2024-07-10)
|
2
|
+
|
3
|
+
- Added support for `bit` type to pg
|
4
|
+
- Added extension for Sequel
|
5
|
+
|
6
|
+
## 0.3.0 (2024-06-25)
|
7
|
+
|
8
|
+
- Added support for `halfvec` and `sparsevec` types
|
9
|
+
- Added `taxicab`, `hamming`, and `jaccard` distances for Sequel
|
10
|
+
- Dropped support for Ruby < 3.1
|
11
|
+
|
1
12
|
## 0.2.2 (2023-10-03)
|
2
13
|
|
3
14
|
- Added `nearest_neighbors` method to datasets with Sequel
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -6,7 +6,7 @@ Supports [pg](https://github.com/ged/ruby-pg) and [Sequel](https://github.com/je
|
|
6
6
|
|
7
7
|
For Rails, check out [Neighbor](https://github.com/ankane/neighbor)
|
8
8
|
|
9
|
-
[![Build Status](https://github.com/pgvector/pgvector-ruby/workflows/build/badge.svg?branch=master)](https://github.com/pgvector/pgvector-ruby/actions)
|
9
|
+
[![Build Status](https://github.com/pgvector/pgvector-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/pgvector/pgvector-ruby/actions)
|
10
10
|
|
11
11
|
## Installation
|
12
12
|
|
@@ -26,6 +26,7 @@ Or check out some examples:
|
|
26
26
|
- [Embeddings](examples/openai_embeddings.rb) with OpenAI
|
27
27
|
- [User-based recommendations](examples/disco_user_recs.rb) with Disco
|
28
28
|
- [Item-based recommendations](examples/disco_item_recs.rb) with Disco
|
29
|
+
- [Bulk loading](examples/bulk_loading.rb) with `COPY`
|
29
30
|
|
30
31
|
## pg
|
31
32
|
|
@@ -35,7 +36,7 @@ Enable the extension
|
|
35
36
|
conn.exec("CREATE EXTENSION IF NOT EXISTS vector")
|
36
37
|
```
|
37
38
|
|
38
|
-
|
39
|
+
Optionally enable type casting for results
|
39
40
|
|
40
41
|
```ruby
|
41
42
|
registry = PG::BasicTypeRegistry.new.define_default_types
|
@@ -43,6 +44,12 @@ Pgvector::PG.register_vector(registry)
|
|
43
44
|
conn.type_map_for_results = PG::BasicTypeMapForResults.new(conn, registry: registry)
|
44
45
|
```
|
45
46
|
|
47
|
+
Create a table
|
48
|
+
|
49
|
+
```ruby
|
50
|
+
conn.exec("CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))")
|
51
|
+
```
|
52
|
+
|
46
53
|
Insert a vector
|
47
54
|
|
48
55
|
```ruby
|
@@ -56,6 +63,16 @@ Get the nearest neighbors to a vector
|
|
56
63
|
conn.exec_params("SELECT * FROM items ORDER BY embedding <-> $1 LIMIT 5", [embedding]).to_a
|
57
64
|
```
|
58
65
|
|
66
|
+
Add an approximate index
|
67
|
+
|
68
|
+
```ruby
|
69
|
+
conn.exec("CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)")
|
70
|
+
# or
|
71
|
+
conn.exec("CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)")
|
72
|
+
```
|
73
|
+
|
74
|
+
Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance
|
75
|
+
|
59
76
|
## Sequel
|
60
77
|
|
61
78
|
Enable the extension
|
@@ -93,7 +110,7 @@ Get the nearest neighbors to a record
|
|
93
110
|
item.nearest_neighbors(:embedding, distance: "euclidean").limit(5)
|
94
111
|
```
|
95
112
|
|
96
|
-
Also supports `inner_product` and `cosine` distance
|
113
|
+
Also supports `inner_product`, `cosine`, `taxicab`, `hamming`, and `jaccard` distance
|
97
114
|
|
98
115
|
Get the nearest neighbors to a vector
|
99
116
|
|
@@ -101,6 +118,14 @@ Get the nearest neighbors to a vector
|
|
101
118
|
Item.nearest_neighbors(:embedding, [1, 1, 1], distance: "euclidean").limit(5)
|
102
119
|
```
|
103
120
|
|
121
|
+
Add an approximate index
|
122
|
+
|
123
|
+
```ruby
|
124
|
+
DB.add_index :items, :embedding, type: "hnsw", opclass: "vector_l2_ops"
|
125
|
+
```
|
126
|
+
|
127
|
+
Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance
|
128
|
+
|
104
129
|
## History
|
105
130
|
|
106
131
|
View the [changelog](https://github.com/pgvector/pgvector-ruby/blob/master/CHANGELOG.md)
|
data/lib/pgvector/bit.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
module Pgvector
|
2
|
+
class Bit
|
3
|
+
def initialize(data)
|
4
|
+
if data.is_a?(Array)
|
5
|
+
@data = data.map { |v| v ? "1" : "0" }.join
|
6
|
+
else
|
7
|
+
@data = data.to_str
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.from_text(string)
|
12
|
+
Bit.new(string)
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.from_binary(string)
|
16
|
+
length = string[..3].unpack1("l>")
|
17
|
+
Bit.new(string[4..].unpack("B*").join[...length])
|
18
|
+
end
|
19
|
+
|
20
|
+
def to_s
|
21
|
+
@data
|
22
|
+
end
|
23
|
+
|
24
|
+
def to_a
|
25
|
+
@data.each_char.map { |v| v != "0" }
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
data/lib/pgvector/half_vector.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
module Pgvector
|
2
|
+
class HalfVector
|
3
|
+
def initialize(data)
|
4
|
+
@data = data.to_a.map(&:to_f)
|
5
|
+
end
|
6
|
+
|
7
|
+
def self.from_text(string)
|
8
|
+
new(string[1..-2].split(",").map(&:to_f))
|
9
|
+
end
|
10
|
+
|
11
|
+
def to_s
|
12
|
+
"[#{@data.to_a.map(&:to_f).join(",")}]"
|
13
|
+
end
|
14
|
+
|
15
|
+
def to_a
|
16
|
+
@data
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/lib/pgvector/pg.rb
CHANGED
@@ -5,14 +5,33 @@ module Pgvector
|
|
5
5
|
def self.register_vector(registry)
|
6
6
|
registry.register_type(0, "vector", nil, TextDecoder::Vector)
|
7
7
|
registry.register_type(1, "vector", nil, BinaryDecoder::Vector)
|
8
|
+
|
9
|
+
# no binary decoder for halfvec since unpack does not have directive for half-precision
|
10
|
+
registry.register_type(0, "halfvec", nil, TextDecoder::Halfvec)
|
11
|
+
|
12
|
+
registry.register_type(0, "bit", nil, TextDecoder::Bit)
|
13
|
+
registry.register_type(1, "bit", nil, BinaryDecoder::Bit)
|
14
|
+
|
15
|
+
registry.register_type(0, "sparsevec", nil, TextDecoder::Sparsevec)
|
16
|
+
registry.register_type(1, "sparsevec", nil, BinaryDecoder::Sparsevec)
|
8
17
|
end
|
9
18
|
|
10
19
|
module BinaryDecoder
|
11
20
|
class Vector < ::PG::SimpleDecoder
|
12
21
|
def decode(string, tuple = nil, field = nil)
|
13
|
-
|
14
|
-
|
15
|
-
|
22
|
+
::Pgvector::Vector.from_binary(string).to_a
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
class Bit < ::PG::SimpleDecoder
|
27
|
+
def decode(string, tuple = nil, field = nil)
|
28
|
+
::Pgvector::Bit.from_binary(string).to_s
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
class Sparsevec < ::PG::SimpleDecoder
|
33
|
+
def decode(string, tuple = nil, field = nil)
|
34
|
+
SparseVector.from_binary(string)
|
16
35
|
end
|
17
36
|
end
|
18
37
|
end
|
@@ -20,7 +39,25 @@ module Pgvector
|
|
20
39
|
module TextDecoder
|
21
40
|
class Vector < ::PG::SimpleDecoder
|
22
41
|
def decode(string, tuple = nil, field = nil)
|
23
|
-
Pgvector.decode(string)
|
42
|
+
::Pgvector::Vector.from_text(string).to_a
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
class Halfvec < ::PG::SimpleDecoder
|
47
|
+
def decode(string, tuple = nil, field = nil)
|
48
|
+
HalfVector.from_text(string).to_a
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
class Bit < ::PG::SimpleDecoder
|
53
|
+
def decode(string, tuple = nil, field = nil)
|
54
|
+
::Pgvector::Bit.from_text(string).to_s
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
class Sparsevec < ::PG::SimpleDecoder
|
59
|
+
def decode(string, tuple = nil, field = nil)
|
60
|
+
SparseVector.from_text(string)
|
24
61
|
end
|
25
62
|
end
|
26
63
|
end
|
data/lib/pgvector/sparse_vector.rb
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
module Pgvector
|
2
|
+
class SparseVector
|
3
|
+
attr_reader :dimensions, :indices, :values
|
4
|
+
|
5
|
+
NO_DEFAULT = Object.new
|
6
|
+
|
7
|
+
def initialize(value, dimensions = NO_DEFAULT)
|
8
|
+
if value.is_a?(Hash)
|
9
|
+
if dimensions == NO_DEFAULT
|
10
|
+
raise ArgumentError, "missing dimensions"
|
11
|
+
end
|
12
|
+
from_hash(value, dimensions)
|
13
|
+
else
|
14
|
+
unless dimensions == NO_DEFAULT
|
15
|
+
raise ArgumentError, "extra argument"
|
16
|
+
end
|
17
|
+
from_array(value)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def to_s
|
22
|
+
"{#{@indices.zip(@values).map { |i, v| "#{i.to_i + 1}:#{v.to_f}" }.join(",")}}/#{@dimensions.to_i}"
|
23
|
+
end
|
24
|
+
|
25
|
+
def to_a
|
26
|
+
arr = Array.new(dimensions, 0.0)
|
27
|
+
@indices.zip(@values) do |i, v|
|
28
|
+
arr[i] = v
|
29
|
+
end
|
30
|
+
arr
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def from_hash(data, dimensions)
|
36
|
+
elements = data.select { |_, v| v != 0 }.sort
|
37
|
+
@dimensions = dimensions.to_i
|
38
|
+
@indices = elements.map { |v| v[0].to_i }
|
39
|
+
@values = elements.map { |v| v[1].to_f }
|
40
|
+
end
|
41
|
+
|
42
|
+
def from_array(arr)
|
43
|
+
arr = arr.to_a
|
44
|
+
@dimensions = arr.size
|
45
|
+
@indices = []
|
46
|
+
@values = []
|
47
|
+
arr.each_with_index do |v, i|
|
48
|
+
if v != 0
|
49
|
+
@indices << i
|
50
|
+
@values << v.to_f
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
class << self
|
56
|
+
def from_text(string)
|
57
|
+
elements, dimensions = string.split("/", 2)
|
58
|
+
indices = []
|
59
|
+
values = []
|
60
|
+
elements[1..-2].split(",").each do |e|
|
61
|
+
index, value = e.split(":", 2)
|
62
|
+
indices << index.to_i - 1
|
63
|
+
values << value.to_f
|
64
|
+
end
|
65
|
+
from_parts(dimensions.to_i, indices, values)
|
66
|
+
end
|
67
|
+
|
68
|
+
def from_binary(string)
|
69
|
+
dim, nnz, unused = string[0, 12].unpack("l>l>l>")
|
70
|
+
raise "expected unused to be 0" if unused != 0
|
71
|
+
indices = string[12, nnz * 4].unpack("l>#{nnz}")
|
72
|
+
values = string[(12 + nnz * 4)..-1].unpack("g#{nnz}")
|
73
|
+
from_parts(dim, indices, values)
|
74
|
+
end
|
75
|
+
|
76
|
+
private
|
77
|
+
|
78
|
+
def from_parts(dimensions, indices, values)
|
79
|
+
vec = allocate
|
80
|
+
vec.instance_variable_set(:@dimensions, dimensions)
|
81
|
+
vec.instance_variable_set(:@indices, indices)
|
82
|
+
vec.instance_variable_set(:@values, values)
|
83
|
+
vec
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
data/lib/pgvector/vector.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
module Pgvector
|
2
|
+
class Vector
|
3
|
+
def initialize(data)
|
4
|
+
@data = data.to_a.map(&:to_f)
|
5
|
+
end
|
6
|
+
|
7
|
+
def self.from_text(string)
|
8
|
+
Vector.new(string[1..-2].split(",").map(&:to_f))
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.from_binary(string)
|
12
|
+
dim, unused = string[0, 4].unpack("nn")
|
13
|
+
raise "expected unused to be 0" if unused != 0
|
14
|
+
Vector.new(string[4..-1].unpack("g#{dim}"))
|
15
|
+
end
|
16
|
+
|
17
|
+
def to_s
|
18
|
+
"[#{@data.to_a.map(&:to_f).join(",")}]"
|
19
|
+
end
|
20
|
+
|
21
|
+
def to_a
|
22
|
+
@data
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
data/lib/pgvector/version.rb
CHANGED
data/lib/pgvector.rb
CHANGED
@@ -1,14 +1,28 @@
|
|
1
1
|
# modules
|
2
|
+
require_relative "pgvector/bit"
|
3
|
+
require_relative "pgvector/half_vector"
|
4
|
+
require_relative "pgvector/sparse_vector"
|
5
|
+
require_relative "pgvector/vector"
|
2
6
|
require_relative "pgvector/version"
|
3
7
|
|
4
8
|
module Pgvector
|
5
9
|
autoload :PG, "pgvector/pg"
|
6
10
|
|
7
11
|
def self.encode(data)
|
8
|
-
|
12
|
+
if data.is_a?(Vector) || data.is_a?(HalfVector) || data.is_a?(SparseVector)
|
13
|
+
data.to_s
|
14
|
+
else
|
15
|
+
Vector.new(data).to_s
|
16
|
+
end
|
9
17
|
end
|
10
18
|
|
11
19
|
def self.decode(string)
|
12
|
-
string[1..-2].split(",").map(&:to_f)
|
20
|
+
if string[0] == "["
|
21
|
+
Vector.from_text(string).to_a
|
22
|
+
elsif string[0] == "{"
|
23
|
+
SparseVector.from_text(string)
|
24
|
+
else
|
25
|
+
string
|
26
|
+
end
|
13
27
|
end
|
14
28
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pgvector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-07-11 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email: andrew@ankane.org
|
@@ -20,8 +20,13 @@ files:
|
|
20
20
|
- LICENSE.txt
|
21
21
|
- README.md
|
22
22
|
- lib/pgvector.rb
|
23
|
+
- lib/pgvector/bit.rb
|
24
|
+
- lib/pgvector/half_vector.rb
|
23
25
|
- lib/pgvector/pg.rb
|
26
|
+
- lib/pgvector/sparse_vector.rb
|
27
|
+
- lib/pgvector/vector.rb
|
24
28
|
- lib/pgvector/version.rb
|
29
|
+
- lib/sequel/extensions/pgvector.rb
|
25
30
|
- lib/sequel/plugins/pgvector.rb
|
26
31
|
homepage: https://github.com/pgvector/pgvector-ruby
|
27
32
|
licenses:
|
@@ -35,14 +40,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
35
40
|
requirements:
|
36
41
|
- - ">="
|
37
42
|
- !ruby/object:Gem::Version
|
38
|
-
version: '3'
|
43
|
+
version: '3.1'
|
39
44
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
45
|
requirements:
|
41
46
|
- - ">="
|
42
47
|
- !ruby/object:Gem::Version
|
43
48
|
version: '0'
|
44
49
|
requirements: []
|
45
|
-
rubygems_version: 3.
|
50
|
+
rubygems_version: 3.5.11
|
46
51
|
signing_key:
|
47
52
|
specification_version: 4
|
48
53
|
summary: pgvector support for Ruby
|