pgvector 0.2.2 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7485ea4be0d5be0177a972db911c696daf3438a661ddac61b08f4e8b2da3ac51
4
- data.tar.gz: 2532ef79f5db88aecb681d9455e38f3e5fc1d30bde015d0a1e9daaa9fe82635e
3
+ metadata.gz: 3d3ef6a53417383ff7f8a2a514df78513dfe9029e524f0f624df8828fb99dba0
4
+ data.tar.gz: f326a21d3942079cdadc22d9a61367e7a6651ae37c5eb07a3955182f9f569ad0
5
5
  SHA512:
6
- metadata.gz: be40e4c3e16dd904a200115794a8ffaa850b40c5055330bd873ec7a707164a53b29d22040defbb4dd8a9cff597d4e1ad5c659d37a580ccc201ce30a9eb17fef9
7
- data.tar.gz: f5d36289b043d987920911ab08d85d7d1066039f84dcc2a24436701c06b246adcb8fd3c32e3f76f3e1604001403574c14818f91a64357ce4ebd675458e57184c
6
+ metadata.gz: fa4d6519685d179e3d5712cbcf1e6989ca4f98e5b0902f5061eca2be55451162f199fcc75f19841dbd8ada8c18de69ac6fa39cad545d4b5a6c542e9cdd0109ea
7
+ data.tar.gz: fd71245492b2ff8a06a0af60dbd12e57e44025f2662d4ea77eb7176aaaa4c4cb3b39c1e159edce92944da5b54fddbb19d1b504acd76a12c4efdb9a0fb6e797da
data/CHANGELOG.md CHANGED
@@ -1,3 +1,14 @@
1
+ ## 0.3.1 (2024-07-10)
2
+
3
+ - Added support for `bit` type to pg
4
+ - Added extension for Sequel
5
+
6
+ ## 0.3.0 (2024-06-25)
7
+
8
+ - Added support for `halfvec` and `sparsevec` types
9
+ - Added `taxicab`, `hamming`, and `jaccard` distances for Sequel
10
+ - Dropped support for Ruby < 3.1
11
+
1
12
  ## 0.2.2 (2023-10-03)
2
13
 
3
14
  - Added `nearest_neighbors` method to datasets with Sequel
data/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
1
1
  The MIT License (MIT)
2
2
 
3
- Copyright (c) 2022-2023 Andrew Kane
3
+ Copyright (c) 2022-2024 Andrew Kane
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -6,7 +6,7 @@ Supports [pg](https://github.com/ged/ruby-pg) and [Sequel](https://github.com/je
6
6
 
7
7
  For Rails, check out [Neighbor](https://github.com/ankane/neighbor)
8
8
 
9
- [![Build Status](https://github.com/pgvector/pgvector-ruby/workflows/build/badge.svg?branch=master)](https://github.com/pgvector/pgvector-ruby/actions)
9
+ [![Build Status](https://github.com/pgvector/pgvector-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/pgvector/pgvector-ruby/actions)
10
10
 
11
11
  ## Installation
12
12
 
@@ -26,6 +26,7 @@ Or check out some examples:
26
26
  - [Embeddings](examples/openai_embeddings.rb) with OpenAI
27
27
  - [User-based recommendations](examples/disco_user_recs.rb) with Disco
28
28
  - [Item-based recommendations](examples/disco_item_recs.rb) with Disco
29
+ - [Bulk loading](examples/bulk_loading.rb) with `COPY`
29
30
 
30
31
  ## pg
31
32
 
@@ -35,7 +36,7 @@ Enable the extension
35
36
  conn.exec("CREATE EXTENSION IF NOT EXISTS vector")
36
37
  ```
37
38
 
38
- Register the vector type with your connection
39
+ Optionally enable type casting for results
39
40
 
40
41
  ```ruby
41
42
  registry = PG::BasicTypeRegistry.new.define_default_types
@@ -43,6 +44,12 @@ Pgvector::PG.register_vector(registry)
43
44
  conn.type_map_for_results = PG::BasicTypeMapForResults.new(conn, registry: registry)
44
45
  ```
45
46
 
47
+ Create a table
48
+
49
+ ```ruby
50
+ conn.exec("CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))")
51
+ ```
52
+
46
53
  Insert a vector
47
54
 
48
55
  ```ruby
@@ -56,6 +63,16 @@ Get the nearest neighbors to a vector
56
63
  conn.exec_params("SELECT * FROM items ORDER BY embedding <-> $1 LIMIT 5", [embedding]).to_a
57
64
  ```
58
65
 
66
+ Add an approximate index
67
+
68
+ ```ruby
69
+ conn.exec("CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)")
70
+ # or
71
+ conn.exec("CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)")
72
+ ```
73
+
74
+ Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance
75
+
59
76
  ## Sequel
60
77
 
61
78
  Enable the extension
@@ -93,7 +110,7 @@ Get the nearest neighbors to a record
93
110
  item.nearest_neighbors(:embedding, distance: "euclidean").limit(5)
94
111
  ```
95
112
 
96
- Also supports `inner_product` and `cosine` distance
113
+ Also supports `inner_product`, `cosine`, `taxicab`, `hamming`, and `jaccard` distance
97
114
 
98
115
  Get the nearest neighbors to a vector
99
116
 
@@ -101,6 +118,14 @@ Get the nearest neighbors to a vector
101
118
  Item.nearest_neighbors(:embedding, [1, 1, 1], distance: "euclidean").limit(5)
102
119
  ```
103
120
 
121
+ Add an approximate index
122
+
123
+ ```ruby
124
+ DB.add_index :items, :embedding, type: "hnsw", opclass: "vector_l2_ops"
125
+ ```
126
+
127
+ Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance
128
+
104
129
  ## History
105
130
 
106
131
  View the [changelog](https://github.com/pgvector/pgvector-ruby/blob/master/CHANGELOG.md)
@@ -0,0 +1,28 @@
1
module Pgvector
  # A bit string value, held internally as a String of "0"/"1" characters.
  class Bit
    # @param data [Array, #to_str] an array of flags (true/false or 1/0), or a
    #   string-like object that is stored as given
    def initialize(data)
      @data =
        if data.is_a?(Array)
          # 0 is truthy in Ruby, so it is checked explicitly; otherwise
          # [1, 0, 1] would be stored as "111"
          data.map { |v| (v && v != 0) ? "1" : "0" }.join
        else
          data.to_str
        end
    end

    # Builds a Bit from its text form.
    #
    # @param string [String] characters of the bit string
    # @return [Bit]
    def self.from_text(string)
      new(string)
    end

    # Builds a Bit from a binary payload: a big-endian signed 32-bit bit
    # count followed by the packed bits, most significant bit first.
    #
    # @param string [String] binary payload
    # @return [Bit]
    def self.from_binary(string)
      length = string.unpack1("l>")
      # the final byte may carry padding bits, so trim to the declared length
      new(string.byteslice(4..).unpack1("B*")[...length])
    end

    # @return [String] the stored bit characters
    def to_s
      @data
    end

    # @return [Array<Boolean>] one entry per character; any character other
    #   than "0" maps to true
    def to_a
      @data.each_char.map { |v| v != "0" }
    end
  end
end
@@ -0,0 +1,19 @@
1
module Pgvector
  # A half-precision vector value, held internally as an Array of Floats.
  class HalfVector
    # @param data [#to_a] elements; each one is converted with `to_f`
    def initialize(data)
      @data = data.to_a.map(&:to_f)
    end

    # Parses the bracketed text form, e.g. "[1,2,3]".
    #
    # @param string [String] text with one enclosing character on each side
    #   and comma-separated elements
    # @return [HalfVector]
    def self.from_text(string)
      # initialize already converts every element with to_f, so the string
      # pieces are passed through without a second conversion here
      new(string[1..-2].split(","))
    end

    # @return [String] the bracketed, comma-separated text form
    def to_s
      "[#{@data.map(&:to_f).join(",")}]"
    end

    # @return [Array<Float>] the stored elements (the internal array itself)
    def to_a
      @data
    end
  end
end
data/lib/pgvector/pg.rb CHANGED
@@ -5,14 +5,33 @@ module Pgvector
5
5
  def self.register_vector(registry)
6
6
  registry.register_type(0, "vector", nil, TextDecoder::Vector)
7
7
  registry.register_type(1, "vector", nil, BinaryDecoder::Vector)
8
+
9
+ # no binary decoder for halfvec since unpack does not have directive for half-precision
10
+ registry.register_type(0, "halfvec", nil, TextDecoder::Halfvec)
11
+
12
+ registry.register_type(0, "bit", nil, TextDecoder::Bit)
13
+ registry.register_type(1, "bit", nil, BinaryDecoder::Bit)
14
+
15
+ registry.register_type(0, "sparsevec", nil, TextDecoder::Sparsevec)
16
+ registry.register_type(1, "sparsevec", nil, BinaryDecoder::Sparsevec)
8
17
  end
9
18
 
10
19
  module BinaryDecoder
11
20
  class Vector < ::PG::SimpleDecoder
12
21
  def decode(string, tuple = nil, field = nil)
13
- dim, unused = string[0, 4].unpack("nn")
14
- raise "expected unused to be 0" if unused != 0
15
- string[4..-1].unpack("g#{dim}")
22
+ ::Pgvector::Vector.from_binary(string).to_a
23
+ end
24
+ end
25
+
26
+ class Bit < ::PG::SimpleDecoder
27
+ def decode(string, tuple = nil, field = nil)
28
+ ::Pgvector::Bit.from_binary(string).to_s
29
+ end
30
+ end
31
+
32
+ class Sparsevec < ::PG::SimpleDecoder
33
+ def decode(string, tuple = nil, field = nil)
34
+ SparseVector.from_binary(string)
16
35
  end
17
36
  end
18
37
  end
@@ -20,7 +39,25 @@ module Pgvector
20
39
  module TextDecoder
21
40
  class Vector < ::PG::SimpleDecoder
22
41
  def decode(string, tuple = nil, field = nil)
23
- Pgvector.decode(string)
42
+ ::Pgvector::Vector.from_text(string).to_a
43
+ end
44
+ end
45
+
46
+ class Halfvec < ::PG::SimpleDecoder
47
+ def decode(string, tuple = nil, field = nil)
48
+ HalfVector.from_text(string).to_a
49
+ end
50
+ end
51
+
52
+ class Bit < ::PG::SimpleDecoder
53
+ def decode(string, tuple = nil, field = nil)
54
+ ::Pgvector::Bit.from_text(string).to_s
55
+ end
56
+ end
57
+
58
+ class Sparsevec < ::PG::SimpleDecoder
59
+ def decode(string, tuple = nil, field = nil)
60
+ SparseVector.from_text(string)
24
61
  end
25
62
  end
26
63
  end
@@ -0,0 +1,87 @@
1
module Pgvector
  # A sparse vector: a dimension count plus parallel arrays of zero-based
  # indices and their non-zero values.
  class SparseVector
    attr_reader :dimensions, :indices, :values

    # Sentinel that distinguishes "dimensions not passed" from any real value.
    NO_DEFAULT = Object.new

    # @param value [Hash, #to_a] either a Hash of index => value (dimensions
    #   required) or a dense array-like (dimensions must be omitted)
    # @param dimensions [#to_i] total number of dimensions, Hash form only
    # @raise [ArgumentError] when dimensions is missing for a Hash or given
    #   for a non-Hash
    def initialize(value, dimensions = NO_DEFAULT)
      # identity check so a caller-supplied object's own == is never consulted
      dimensions_given = !NO_DEFAULT.equal?(dimensions)

      if value.is_a?(Hash)
        raise ArgumentError, "missing dimensions" unless dimensions_given

        from_hash(value, dimensions)
      else
        raise ArgumentError, "extra argument" if dimensions_given

        from_array(value)
      end
    end

    # @return [String] text form "{index:value,...}/dimensions"; indices are
    #   written one-based
    def to_s
      pairs = @indices.zip(@values).map { |i, v| "#{i.to_i + 1}:#{v.to_f}" }
      "{#{pairs.join(",")}}/#{@dimensions.to_i}"
    end

    # @return [Array] dense array of length dimensions, 0.0 where no value
    #   is stored
    def to_a
      arr = Array.new(dimensions, 0.0)
      @indices.zip(@values) do |i, v|
        arr[i] = v
      end
      arr
    end

    private

    # Populates the vector from a Hash of index => value, dropping zeros.
    def from_hash(data, dimensions)
      # convert keys to integers BEFORE ordering, so string keys such as
      # "10" and "2" are ordered numerically and mixed key types do not raise
      elements = data.filter_map { |k, v| [k.to_i, v.to_f] if v != 0 }.sort_by(&:first)
      @dimensions = dimensions.to_i
      @indices = elements.map(&:first)
      @values = elements.map(&:last)
    end

    # Populates the vector from a dense array-like, keeping non-zero entries.
    def from_array(arr)
      arr = arr.to_a
      @dimensions = arr.size
      @indices = []
      @values = []
      arr.each_with_index do |v, i|
        next unless v != 0

        @indices << i
        @values << v.to_f
      end
    end

    class << self
      # Parses the text form "{index:value,...}/dimensions" (one-based
      # indices) into a SparseVector with zero-based indices.
      #
      # @param string [String]
      # @return [SparseVector]
      def from_text(string)
        elements, dimensions = string.split("/", 2)
        indices = []
        values = []
        elements[1..-2].split(",").each do |e|
          index, value = e.split(":", 2)
          indices << index.to_i - 1
          values << value.to_f
        end
        from_parts(dimensions.to_i, indices, values)
      end

      # Parses a binary payload: three big-endian signed 32-bit integers
      # (dimensions, number of stored elements, a field that must be 0),
      # then that many 32-bit indices, then that many big-endian
      # single-precision floats.
      #
      # @param string [String] binary payload
      # @return [SparseVector]
      # @raise [RuntimeError] when the reserved field is not 0 or the payload
      #   is shorter than the header declares
      def from_binary(string)
        dim, nnz, unused = string.unpack("l>3")
        raise "expected unused to be 0" if unused != 0
        # unpack pads missing items with nil, so reject short payloads
        # instead of returning nil indices and values
        raise "expected #{nnz} elements" if string.bytesize < 12 + nnz * 8

        # byteslice keeps offsets in bytes whatever the string's encoding
        indices = string.byteslice(12, nnz * 4).unpack("l>#{nnz}")
        values = string.byteslice((12 + nnz * 4)..).unpack("g#{nnz}")
        from_parts(dim, indices, values)
      end

      private

      # Builds an instance directly from already-parsed parts, bypassing
      # initialize.
      def from_parts(dimensions, indices, values)
        vec = allocate
        vec.instance_variable_set(:@dimensions, dimensions)
        vec.instance_variable_set(:@indices, indices)
        vec.instance_variable_set(:@values, values)
        vec
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
module Pgvector
  # A single-precision vector value, held internally as an Array of Floats.
  class Vector
    # @param data [#to_a] elements; each one is converted with `to_f`
    def initialize(data)
      @data = data.to_a.map(&:to_f)
    end

    # Parses the bracketed text form, e.g. "[1,2,3]".
    #
    # @param string [String] text with one enclosing character on each side
    #   and comma-separated elements
    # @return [Vector]
    def self.from_text(string)
      # initialize already converts every element with to_f, so the string
      # pieces are passed through without a second conversion here
      new(string[1..-2].split(","))
    end

    # Parses a binary payload: an unsigned 16-bit big-endian dimension
    # count, a 16-bit field that must be 0, then that many big-endian
    # single-precision floats.
    #
    # @param string [String] binary payload
    # @return [Vector]
    # @raise [RuntimeError] when the reserved field is not 0 or the payload
    #   holds fewer floats than the header declares
    def self.from_binary(string)
      dim, unused = string.unpack("nn")
      raise "expected unused to be 0" if unused != 0

      # byteslice keeps offsets in bytes whatever the string's encoding
      payload = string.byteslice(4..)
      # unpack pads missing items with nil, which to_f would silently turn
      # into 0.0, so reject short payloads explicitly
      raise "expected #{dim} values" if payload.bytesize < dim * 4

      new(payload.unpack("g#{dim}"))
    end

    # @return [String] the bracketed, comma-separated text form
    def to_s
      "[#{@data.map(&:to_f).join(",")}]"
    end

    # @return [Array<Float>] the stored elements (the internal array itself)
    def to_a
      @data
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Pgvector
2
- VERSION = "0.2.2"
2
+ VERSION = "0.3.1"
3
3
  end
data/lib/pgvector.rb CHANGED
@@ -1,14 +1,28 @@
1
1
  # modules
2
+ require_relative "pgvector/bit"
3
+ require_relative "pgvector/half_vector"
4
+ require_relative "pgvector/sparse_vector"
5
+ require_relative "pgvector/vector"
2
6
  require_relative "pgvector/version"
3
7
 
4
8
  module Pgvector
5
9
  autoload :PG, "pgvector/pg"
6
10
 
7
11
  def self.encode(data)
8
- "[#{data.to_a.map(&:to_f).join(",")}]"
12
+ if data.is_a?(Vector) || data.is_a?(HalfVector) || data.is_a?(SparseVector)
13
+ data.to_s
14
+ else
15
+ Vector.new(data).to_s
16
+ end
9
17
  end
10
18
 
11
19
  def self.decode(string)
12
- string[1..-2].split(",").map(&:to_f)
20
+ if string[0] == "["
21
+ Vector.from_text(string).to_a
22
+ elsif string[0] == "{"
23
+ SparseVector.from_text(string)
24
+ else
25
+ string
26
+ end
13
27
  end
14
28
  end
@@ -0,0 +1,5 @@
1
require_relative "../plugins/pgvector"

# Registers the plugin's DatasetMethods module as the :pgvector dataset
# extension.
Sequel::Dataset.register_extension(:pgvector, Sequel::Plugins::Pgvector::DatasetMethods)
@@ -22,6 +22,12 @@ module Sequel
22
22
  "<=>"
23
23
  when "euclidean"
24
24
  "<->"
25
+ when "taxicab"
26
+ "<+>"
27
+ when "hamming"
28
+ "<~>"
29
+ when "jaccard"
30
+ "<%>"
25
31
  end
26
32
 
27
33
  raise ArgumentError, "Invalid distance: #{distance}" unless operator
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pgvector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-10-04 00:00:00.000000000 Z
11
+ date: 2024-07-11 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org
@@ -20,8 +20,13 @@ files:
20
20
  - LICENSE.txt
21
21
  - README.md
22
22
  - lib/pgvector.rb
23
+ - lib/pgvector/bit.rb
24
+ - lib/pgvector/half_vector.rb
23
25
  - lib/pgvector/pg.rb
26
+ - lib/pgvector/sparse_vector.rb
27
+ - lib/pgvector/vector.rb
24
28
  - lib/pgvector/version.rb
29
+ - lib/sequel/extensions/pgvector.rb
25
30
  - lib/sequel/plugins/pgvector.rb
26
31
  homepage: https://github.com/pgvector/pgvector-ruby
27
32
  licenses:
@@ -35,14 +40,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
35
40
  requirements:
36
41
  - - ">="
37
42
  - !ruby/object:Gem::Version
38
- version: '3'
43
+ version: '3.1'
39
44
  required_rubygems_version: !ruby/object:Gem::Requirement
40
45
  requirements:
41
46
  - - ">="
42
47
  - !ruby/object:Gem::Version
43
48
  version: '0'
44
49
  requirements: []
45
- rubygems_version: 3.4.10
50
+ rubygems_version: 3.5.11
46
51
  signing_key:
47
52
  specification_version: 4
48
53
  summary: pgvector support for Ruby