pgvector 0.2.2 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7485ea4be0d5be0177a972db911c696daf3438a661ddac61b08f4e8b2da3ac51
4
- data.tar.gz: 2532ef79f5db88aecb681d9455e38f3e5fc1d30bde015d0a1e9daaa9fe82635e
3
+ metadata.gz: 3d3ef6a53417383ff7f8a2a514df78513dfe9029e524f0f624df8828fb99dba0
4
+ data.tar.gz: f326a21d3942079cdadc22d9a61367e7a6651ae37c5eb07a3955182f9f569ad0
5
5
  SHA512:
6
- metadata.gz: be40e4c3e16dd904a200115794a8ffaa850b40c5055330bd873ec7a707164a53b29d22040defbb4dd8a9cff597d4e1ad5c659d37a580ccc201ce30a9eb17fef9
7
- data.tar.gz: f5d36289b043d987920911ab08d85d7d1066039f84dcc2a24436701c06b246adcb8fd3c32e3f76f3e1604001403574c14818f91a64357ce4ebd675458e57184c
6
+ metadata.gz: fa4d6519685d179e3d5712cbcf1e6989ca4f98e5b0902f5061eca2be55451162f199fcc75f19841dbd8ada8c18de69ac6fa39cad545d4b5a6c542e9cdd0109ea
7
+ data.tar.gz: fd71245492b2ff8a06a0af60dbd12e57e44025f2662d4ea77eb7176aaaa4c4cb3b39c1e159edce92944da5b54fddbb19d1b504acd76a12c4efdb9a0fb6e797da
data/CHANGELOG.md CHANGED
@@ -1,3 +1,14 @@
1
+ ## 0.3.1 (2024-07-10)
2
+
3
+ - Added support for `bit` type to pg
4
+ - Added extension for Sequel
5
+
6
+ ## 0.3.0 (2024-06-25)
7
+
8
+ - Added support for `halfvec` and `sparsevec` types
9
+ - Added `taxicab`, `hamming`, and `jaccard` distances for Sequel
10
+ - Dropped support for Ruby < 3.1
11
+
1
12
  ## 0.2.2 (2023-10-03)
2
13
 
3
14
  - Added `nearest_neighbors` method to datasets with Sequel
data/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
1
1
  The MIT License (MIT)
2
2
 
3
- Copyright (c) 2022-2023 Andrew Kane
3
+ Copyright (c) 2022-2024 Andrew Kane
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -6,7 +6,7 @@ Supports [pg](https://github.com/ged/ruby-pg) and [Sequel](https://github.com/je
6
6
 
7
7
  For Rails, check out [Neighbor](https://github.com/ankane/neighbor)
8
8
 
9
- [![Build Status](https://github.com/pgvector/pgvector-ruby/workflows/build/badge.svg?branch=master)](https://github.com/pgvector/pgvector-ruby/actions)
9
+ [![Build Status](https://github.com/pgvector/pgvector-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/pgvector/pgvector-ruby/actions)
10
10
 
11
11
  ## Installation
12
12
 
@@ -26,6 +26,7 @@ Or check out some examples:
26
26
  - [Embeddings](examples/openai_embeddings.rb) with OpenAI
27
27
  - [User-based recommendations](examples/disco_user_recs.rb) with Disco
28
28
  - [Item-based recommendations](examples/disco_item_recs.rb) with Disco
29
+ - [Bulk loading](examples/bulk_loading.rb) with `COPY`
29
30
 
30
31
  ## pg
31
32
 
@@ -35,7 +36,7 @@ Enable the extension
35
36
  conn.exec("CREATE EXTENSION IF NOT EXISTS vector")
36
37
  ```
37
38
 
38
- Register the vector type with your connection
39
+ Optionally enable type casting for results
39
40
 
40
41
  ```ruby
41
42
  registry = PG::BasicTypeRegistry.new.define_default_types
@@ -43,6 +44,12 @@ Pgvector::PG.register_vector(registry)
43
44
  conn.type_map_for_results = PG::BasicTypeMapForResults.new(conn, registry: registry)
44
45
  ```
45
46
 
47
+ Create a table
48
+
49
+ ```ruby
50
+ conn.exec("CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))")
51
+ ```
52
+
46
53
  Insert a vector
47
54
 
48
55
  ```ruby
@@ -56,6 +63,16 @@ Get the nearest neighbors to a vector
56
63
  conn.exec_params("SELECT * FROM items ORDER BY embedding <-> $1 LIMIT 5", [embedding]).to_a
57
64
  ```
58
65
 
66
+ Add an approximate index
67
+
68
+ ```ruby
69
+ conn.exec("CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)")
70
+ # or
71
+ conn.exec("CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)")
72
+ ```
73
+
74
+ Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance
75
+
59
76
  ## Sequel
60
77
 
61
78
  Enable the extension
@@ -93,7 +110,7 @@ Get the nearest neighbors to a record
93
110
  item.nearest_neighbors(:embedding, distance: "euclidean").limit(5)
94
111
  ```
95
112
 
96
- Also supports `inner_product` and `cosine` distance
113
+ Also supports `inner_product`, `cosine`, `taxicab`, `hamming`, and `jaccard` distance
97
114
 
98
115
  Get the nearest neighbors to a vector
99
116
 
@@ -101,6 +118,14 @@ Get the nearest neighbors to a vector
101
118
  Item.nearest_neighbors(:embedding, [1, 1, 1], distance: "euclidean").limit(5)
102
119
  ```
103
120
 
121
+ Add an approximate index
122
+
123
+ ```ruby
124
+ DB.add_index :items, :embedding, type: "hnsw", opclass: "vector_l2_ops"
125
+ ```
126
+
127
+ Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance
128
+
104
129
  ## History
105
130
 
106
131
  View the [changelog](https://github.com/pgvector/pgvector-ruby/blob/master/CHANGELOG.md)
@@ -0,0 +1,28 @@
1
+ module Pgvector
2
+ class Bit
3
+ def initialize(data)
4
+ if data.is_a?(Array)
5
+ @data = data.map { |v| v ? "1" : "0" }.join
6
+ else
7
+ @data = data.to_str
8
+ end
9
+ end
10
+
11
+ def self.from_text(string)
12
+ Bit.new(string)
13
+ end
14
+
15
+ def self.from_binary(string)
16
+ length = string[..3].unpack1("l>")
17
+ Bit.new(string[4..].unpack("B*").join[...length])
18
+ end
19
+
20
+ def to_s
21
+ @data
22
+ end
23
+
24
+ def to_a
25
+ @data.each_char.map { |v| v != "0" }
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,19 @@
1
+ module Pgvector
2
+ class HalfVector
3
+ def initialize(data)
4
+ @data = data.to_a.map(&:to_f)
5
+ end
6
+
7
+ def self.from_text(string)
8
+ new(string[1..-2].split(",").map(&:to_f))
9
+ end
10
+
11
+ def to_s
12
+ "[#{@data.to_a.map(&:to_f).join(",")}]"
13
+ end
14
+
15
+ def to_a
16
+ @data
17
+ end
18
+ end
19
+ end
data/lib/pgvector/pg.rb CHANGED
@@ -5,14 +5,33 @@ module Pgvector
5
5
  def self.register_vector(registry)
6
6
  registry.register_type(0, "vector", nil, TextDecoder::Vector)
7
7
  registry.register_type(1, "vector", nil, BinaryDecoder::Vector)
8
+
9
+ # no binary decoder for halfvec since unpack does not have directive for half-precision
10
+ registry.register_type(0, "halfvec", nil, TextDecoder::Halfvec)
11
+
12
+ registry.register_type(0, "bit", nil, TextDecoder::Bit)
13
+ registry.register_type(1, "bit", nil, BinaryDecoder::Bit)
14
+
15
+ registry.register_type(0, "sparsevec", nil, TextDecoder::Sparsevec)
16
+ registry.register_type(1, "sparsevec", nil, BinaryDecoder::Sparsevec)
8
17
  end
9
18
 
10
19
  module BinaryDecoder
11
20
  class Vector < ::PG::SimpleDecoder
12
21
  def decode(string, tuple = nil, field = nil)
13
- dim, unused = string[0, 4].unpack("nn")
14
- raise "expected unused to be 0" if unused != 0
15
- string[4..-1].unpack("g#{dim}")
22
+ ::Pgvector::Vector.from_binary(string).to_a
23
+ end
24
+ end
25
+
26
+ class Bit < ::PG::SimpleDecoder
27
+ def decode(string, tuple = nil, field = nil)
28
+ ::Pgvector::Bit.from_binary(string).to_s
29
+ end
30
+ end
31
+
32
+ class Sparsevec < ::PG::SimpleDecoder
33
+ def decode(string, tuple = nil, field = nil)
34
+ SparseVector.from_binary(string)
16
35
  end
17
36
  end
18
37
  end
@@ -20,7 +39,25 @@ module Pgvector
20
39
  module TextDecoder
21
40
  class Vector < ::PG::SimpleDecoder
22
41
  def decode(string, tuple = nil, field = nil)
23
- Pgvector.decode(string)
42
+ ::Pgvector::Vector.from_text(string).to_a
43
+ end
44
+ end
45
+
46
+ class Halfvec < ::PG::SimpleDecoder
47
+ def decode(string, tuple = nil, field = nil)
48
+ HalfVector.from_text(string).to_a
49
+ end
50
+ end
51
+
52
+ class Bit < ::PG::SimpleDecoder
53
+ def decode(string, tuple = nil, field = nil)
54
+ ::Pgvector::Bit.from_text(string).to_s
55
+ end
56
+ end
57
+
58
+ class Sparsevec < ::PG::SimpleDecoder
59
+ def decode(string, tuple = nil, field = nil)
60
+ SparseVector.from_text(string)
24
61
  end
25
62
  end
26
63
  end
@@ -0,0 +1,87 @@
1
+ module Pgvector
2
+ class SparseVector
3
+ attr_reader :dimensions, :indices, :values
4
+
5
+ NO_DEFAULT = Object.new
6
+
7
+ def initialize(value, dimensions = NO_DEFAULT)
8
+ if value.is_a?(Hash)
9
+ if dimensions == NO_DEFAULT
10
+ raise ArgumentError, "missing dimensions"
11
+ end
12
+ from_hash(value, dimensions)
13
+ else
14
+ unless dimensions == NO_DEFAULT
15
+ raise ArgumentError, "extra argument"
16
+ end
17
+ from_array(value)
18
+ end
19
+ end
20
+
21
+ def to_s
22
+ "{#{@indices.zip(@values).map { |i, v| "#{i.to_i + 1}:#{v.to_f}" }.join(",")}}/#{@dimensions.to_i}"
23
+ end
24
+
25
+ def to_a
26
+ arr = Array.new(dimensions, 0.0)
27
+ @indices.zip(@values) do |i, v|
28
+ arr[i] = v
29
+ end
30
+ arr
31
+ end
32
+
33
+ private
34
+
35
+ def from_hash(data, dimensions)
36
+ elements = data.select { |_, v| v != 0 }.sort
37
+ @dimensions = dimensions.to_i
38
+ @indices = elements.map { |v| v[0].to_i }
39
+ @values = elements.map { |v| v[1].to_f }
40
+ end
41
+
42
+ def from_array(arr)
43
+ arr = arr.to_a
44
+ @dimensions = arr.size
45
+ @indices = []
46
+ @values = []
47
+ arr.each_with_index do |v, i|
48
+ if v != 0
49
+ @indices << i
50
+ @values << v.to_f
51
+ end
52
+ end
53
+ end
54
+
55
+ class << self
56
+ def from_text(string)
57
+ elements, dimensions = string.split("/", 2)
58
+ indices = []
59
+ values = []
60
+ elements[1..-2].split(",").each do |e|
61
+ index, value = e.split(":", 2)
62
+ indices << index.to_i - 1
63
+ values << value.to_f
64
+ end
65
+ from_parts(dimensions.to_i, indices, values)
66
+ end
67
+
68
+ def from_binary(string)
69
+ dim, nnz, unused = string[0, 12].unpack("l>l>l>")
70
+ raise "expected unused to be 0" if unused != 0
71
+ indices = string[12, nnz * 4].unpack("l>#{nnz}")
72
+ values = string[(12 + nnz * 4)..-1].unpack("g#{nnz}")
73
+ from_parts(dim, indices, values)
74
+ end
75
+
76
+ private
77
+
78
+ def from_parts(dimensions, indices, values)
79
+ vec = allocate
80
+ vec.instance_variable_set(:@dimensions, dimensions)
81
+ vec.instance_variable_set(:@indices, indices)
82
+ vec.instance_variable_set(:@values, values)
83
+ vec
84
+ end
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,25 @@
1
+ module Pgvector
2
+ class Vector
3
+ def initialize(data)
4
+ @data = data.to_a.map(&:to_f)
5
+ end
6
+
7
+ def self.from_text(string)
8
+ Vector.new(string[1..-2].split(",").map(&:to_f))
9
+ end
10
+
11
+ def self.from_binary(string)
12
+ dim, unused = string[0, 4].unpack("nn")
13
+ raise "expected unused to be 0" if unused != 0
14
+ Vector.new(string[4..-1].unpack("g#{dim}"))
15
+ end
16
+
17
+ def to_s
18
+ "[#{@data.to_a.map(&:to_f).join(",")}]"
19
+ end
20
+
21
+ def to_a
22
+ @data
23
+ end
24
+ end
25
+ end
@@ -1,3 +1,3 @@
1
1
  module Pgvector
2
- VERSION = "0.2.2"
2
+ VERSION = "0.3.1"
3
3
  end
data/lib/pgvector.rb CHANGED
@@ -1,14 +1,28 @@
1
1
  # modules
2
+ require_relative "pgvector/bit"
3
+ require_relative "pgvector/half_vector"
4
+ require_relative "pgvector/sparse_vector"
5
+ require_relative "pgvector/vector"
2
6
  require_relative "pgvector/version"
3
7
 
4
8
  module Pgvector
5
9
  autoload :PG, "pgvector/pg"
6
10
 
7
11
  def self.encode(data)
8
- "[#{data.to_a.map(&:to_f).join(",")}]"
12
+ if data.is_a?(Vector) || data.is_a?(HalfVector) || data.is_a?(SparseVector)
13
+ data.to_s
14
+ else
15
+ Vector.new(data).to_s
16
+ end
9
17
  end
10
18
 
11
19
  def self.decode(string)
12
- string[1..-2].split(",").map(&:to_f)
20
+ if string[0] == "["
21
+ Vector.from_text(string).to_a
22
+ elsif string[0] == "{"
23
+ SparseVector.from_text(string)
24
+ else
25
+ string
26
+ end
13
27
  end
14
28
  end
@@ -0,0 +1,5 @@
1
+ require_relative "../plugins/pgvector"
2
+
3
+ module Sequel
4
+ Dataset.register_extension(:pgvector, Plugins::Pgvector::DatasetMethods)
5
+ end
@@ -22,6 +22,12 @@ module Sequel
22
22
  "<=>"
23
23
  when "euclidean"
24
24
  "<->"
25
+ when "taxicab"
26
+ "<+>"
27
+ when "hamming"
28
+ "<~>"
29
+ when "jaccard"
30
+ "<%>"
25
31
  end
26
32
 
27
33
  raise ArgumentError, "Invalid distance: #{distance}" unless operator
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pgvector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-10-04 00:00:00.000000000 Z
11
+ date: 2024-07-11 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org
@@ -20,8 +20,13 @@ files:
20
20
  - LICENSE.txt
21
21
  - README.md
22
22
  - lib/pgvector.rb
23
+ - lib/pgvector/bit.rb
24
+ - lib/pgvector/half_vector.rb
23
25
  - lib/pgvector/pg.rb
26
+ - lib/pgvector/sparse_vector.rb
27
+ - lib/pgvector/vector.rb
24
28
  - lib/pgvector/version.rb
29
+ - lib/sequel/extensions/pgvector.rb
25
30
  - lib/sequel/plugins/pgvector.rb
26
31
  homepage: https://github.com/pgvector/pgvector-ruby
27
32
  licenses:
@@ -35,14 +40,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
35
40
  requirements:
36
41
  - - ">="
37
42
  - !ruby/object:Gem::Version
38
- version: '3'
43
+ version: '3.1'
39
44
  required_rubygems_version: !ruby/object:Gem::Requirement
40
45
  requirements:
41
46
  - - ">="
42
47
  - !ruby/object:Gem::Version
43
48
  version: '0'
44
49
  requirements: []
45
- rubygems_version: 3.4.10
50
+ rubygems_version: 3.5.11
46
51
  signing_key:
47
52
  specification_version: 4
48
53
  summary: pgvector support for Ruby