pgvector 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7485ea4be0d5be0177a972db911c696daf3438a661ddac61b08f4e8b2da3ac51
4
- data.tar.gz: 2532ef79f5db88aecb681d9455e38f3e5fc1d30bde015d0a1e9daaa9fe82635e
3
+ metadata.gz: 07a80636c13841d2fa97f8b740feb095c1c2f94fa3691b374d1472bba918bddf
4
+ data.tar.gz: 50c9e67781fbbe23fa3dbe2f87790025f18d3686e412b46f62d1cb0d2995ecb3
5
5
  SHA512:
6
- metadata.gz: be40e4c3e16dd904a200115794a8ffaa850b40c5055330bd873ec7a707164a53b29d22040defbb4dd8a9cff597d4e1ad5c659d37a580ccc201ce30a9eb17fef9
7
- data.tar.gz: f5d36289b043d987920911ab08d85d7d1066039f84dcc2a24436701c06b246adcb8fd3c32e3f76f3e1604001403574c14818f91a64357ce4ebd675458e57184c
6
+ metadata.gz: f0aa5b733e1bc4022c2052be6b04aa0656bf306f588baf2907827674aeadc7dc52507895043049246634999dea37de629c19a164bd289fbd979eb628aaab0c33
7
+ data.tar.gz: f52e2e9c1c074c4f809fe78d6926bcb630694d79199e288ebb33d136a05191c4b2fff6038f8d7fc56d4291f949caa1b536d942aa7d0912be2daccf5a7ee23f83
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.3.0 (2024-06-25)
2
+
3
+ - Added support for `halfvec` and `sparsevec` types
4
+ - Added `taxicab`, `hamming`, and `jaccard` distances for Sequel
5
+ - Dropped support for Ruby < 3.1
6
+
1
7
  ## 0.2.2 (2023-10-03)
2
8
 
3
9
  - Added `nearest_neighbors` method to datasets with Sequel
data/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
1
1
  The MIT License (MIT)
2
2
 
3
- Copyright (c) 2022-2023 Andrew Kane
3
+ Copyright (c) 2022-2024 Andrew Kane
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -6,7 +6,7 @@ Supports [pg](https://github.com/ged/ruby-pg) and [Sequel](https://github.com/je
6
6
 
7
7
  For Rails, check out [Neighbor](https://github.com/ankane/neighbor)
8
8
 
9
- [![Build Status](https://github.com/pgvector/pgvector-ruby/workflows/build/badge.svg?branch=master)](https://github.com/pgvector/pgvector-ruby/actions)
9
+ [![Build Status](https://github.com/pgvector/pgvector-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/pgvector/pgvector-ruby/actions)
10
10
 
11
11
  ## Installation
12
12
 
@@ -26,6 +26,7 @@ Or check out some examples:
26
26
  - [Embeddings](examples/openai_embeddings.rb) with OpenAI
27
27
  - [User-based recommendations](examples/disco_user_recs.rb) with Disco
28
28
  - [Item-based recommendations](examples/disco_item_recs.rb) with Disco
29
+ - [Bulk loading](examples/bulk_loading.rb) with `COPY`
29
30
 
30
31
  ## pg
31
32
 
@@ -35,7 +36,7 @@ Enable the extension
35
36
  conn.exec("CREATE EXTENSION IF NOT EXISTS vector")
36
37
  ```
37
38
 
38
- Register the vector type with your connection
39
+ Optionally enable type casting for results
39
40
 
40
41
  ```ruby
41
42
  registry = PG::BasicTypeRegistry.new.define_default_types
@@ -43,6 +44,12 @@ Pgvector::PG.register_vector(registry)
43
44
  conn.type_map_for_results = PG::BasicTypeMapForResults.new(conn, registry: registry)
44
45
  ```
45
46
 
47
+ Create a table
48
+
49
+ ```ruby
50
+ conn.exec("CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))")
51
+ ```
52
+
46
53
  Insert a vector
47
54
 
48
55
  ```ruby
@@ -56,6 +63,16 @@ Get the nearest neighbors to a vector
56
63
  conn.exec_params("SELECT * FROM items ORDER BY embedding <-> $1 LIMIT 5", [embedding]).to_a
57
64
  ```
58
65
 
66
+ Add an approximate index
67
+
68
+ ```ruby
69
+ conn.exec("CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)")
70
+ # or
71
+ conn.exec("CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)")
72
+ ```
73
+
74
+ Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance
75
+
59
76
  ## Sequel
60
77
 
61
78
  Enable the extension
@@ -93,7 +110,7 @@ Get the nearest neighbors to a record
93
110
  item.nearest_neighbors(:embedding, distance: "euclidean").limit(5)
94
111
  ```
95
112
 
96
- Also supports `inner_product` and `cosine` distance
113
+ Also supports `inner_product`, `cosine`, `taxicab`, `hamming`, and `jaccard` distance
97
114
 
98
115
  Get the nearest neighbors to a vector
99
116
 
@@ -101,6 +118,14 @@ Get the nearest neighbors to a vector
101
118
  Item.nearest_neighbors(:embedding, [1, 1, 1], distance: "euclidean").limit(5)
102
119
  ```
103
120
 
121
+ Add an approximate index
122
+
123
+ ```ruby
124
+ DB.add_index :items, :embedding, type: "hnsw", opclass: "vector_l2_ops"
125
+ ```
126
+
127
+ Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance
128
+
104
129
  ## History
105
130
 
106
131
  View the [changelog](https://github.com/pgvector/pgvector-ruby/blob/master/CHANGELOG.md)
@@ -0,0 +1,19 @@
1
+ module Pgvector
2
+ class HalfVector
3
+ def initialize(data)
4
+ @data = data.to_a.map(&:to_f)
5
+ end
6
+
7
+ def self.from_text(string)
8
+ new(string[1..-2].split(",").map(&:to_f))
9
+ end
10
+
11
+ def to_s
12
+ "[#{@data.to_a.map(&:to_f).join(",")}]"
13
+ end
14
+
15
+ def to_a
16
+ @data
17
+ end
18
+ end
19
+ end
data/lib/pgvector/pg.rb CHANGED
@@ -5,14 +5,24 @@ module Pgvector
5
5
  def self.register_vector(registry)
6
6
  registry.register_type(0, "vector", nil, TextDecoder::Vector)
7
7
  registry.register_type(1, "vector", nil, BinaryDecoder::Vector)
8
+
9
+ # no binary decoder for halfvec since unpack does not have directive for half-precision
10
+ registry.register_type(0, "halfvec", nil, TextDecoder::Halfvec)
11
+
12
+ registry.register_type(0, "sparsevec", nil, TextDecoder::Sparsevec)
13
+ registry.register_type(1, "sparsevec", nil, BinaryDecoder::Sparsevec)
8
14
  end
9
15
 
10
16
  module BinaryDecoder
11
17
  class Vector < ::PG::SimpleDecoder
12
18
  def decode(string, tuple = nil, field = nil)
13
- dim, unused = string[0, 4].unpack("nn")
14
- raise "expected unused to be 0" if unused != 0
15
- string[4..-1].unpack("g#{dim}")
19
+ ::Pgvector::Vector.from_binary(string).to_a
20
+ end
21
+ end
22
+
23
+ class Sparsevec < ::PG::SimpleDecoder
24
+ def decode(string, tuple = nil, field = nil)
25
+ SparseVector.from_binary(string)
16
26
  end
17
27
  end
18
28
  end
@@ -20,7 +30,19 @@ module Pgvector
20
30
  module TextDecoder
21
31
  class Vector < ::PG::SimpleDecoder
22
32
  def decode(string, tuple = nil, field = nil)
23
- Pgvector.decode(string)
33
+ ::Pgvector::Vector.from_text(string).to_a
34
+ end
35
+ end
36
+
37
+ class Halfvec < ::PG::SimpleDecoder
38
+ def decode(string, tuple = nil, field = nil)
39
+ HalfVector.from_text(string).to_a
40
+ end
41
+ end
42
+
43
+ class Sparsevec < ::PG::SimpleDecoder
44
+ def decode(string, tuple = nil, field = nil)
45
+ SparseVector.from_text(string)
24
46
  end
25
47
  end
26
48
  end
@@ -0,0 +1,87 @@
1
+ module Pgvector
2
+ class SparseVector
3
+ attr_reader :dimensions, :indices, :values
4
+
5
+ NO_DEFAULT = Object.new
6
+
7
+ def initialize(value, dimensions = NO_DEFAULT)
8
+ if value.is_a?(Hash)
9
+ if dimensions == NO_DEFAULT
10
+ raise ArgumentError, "missing dimensions"
11
+ end
12
+ from_hash(value, dimensions)
13
+ else
14
+ unless dimensions == NO_DEFAULT
15
+ raise ArgumentError, "extra argument"
16
+ end
17
+ from_array(value)
18
+ end
19
+ end
20
+
21
+ def to_s
22
+ "{#{@indices.zip(@values).map { |i, v| "#{i.to_i + 1}:#{v.to_f}" }.join(",")}}/#{@dimensions.to_i}"
23
+ end
24
+
25
+ def to_a
26
+ arr = Array.new(dimensions, 0.0)
27
+ @indices.zip(@values) do |i, v|
28
+ arr[i] = v
29
+ end
30
+ arr
31
+ end
32
+
33
+ private
34
+
35
+ def from_hash(data, dimensions)
36
+ elements = data.select { |_, v| v != 0 }.sort
37
+ @dimensions = dimensions.to_i
38
+ @indices = elements.map { |v| v[0].to_i }
39
+ @values = elements.map { |v| v[1].to_f }
40
+ end
41
+
42
+ def from_array(arr)
43
+ arr = arr.to_a
44
+ @dimensions = arr.size
45
+ @indices = []
46
+ @values = []
47
+ arr.each_with_index do |v, i|
48
+ if v != 0
49
+ @indices << i
50
+ @values << v.to_f
51
+ end
52
+ end
53
+ end
54
+
55
+ class << self
56
+ def from_text(string)
57
+ elements, dimensions = string.split("/", 2)
58
+ indices = []
59
+ values = []
60
+ elements[1..-2].split(",").each do |e|
61
+ index, value = e.split(":", 2)
62
+ indices << index.to_i - 1
63
+ values << value.to_f
64
+ end
65
+ from_parts(dimensions.to_i, indices, values)
66
+ end
67
+
68
+ def from_binary(string)
69
+ dim, nnz, unused = string[0, 12].unpack("l>l>l>")
70
+ raise "expected unused to be 0" if unused != 0
71
+ indices = string[12, nnz * 4].unpack("l>#{nnz}")
72
+ values = string[(12 + nnz * 4)..-1].unpack("g#{nnz}")
73
+ from_parts(dim, indices, values)
74
+ end
75
+
76
+ private
77
+
78
+ def from_parts(dimensions, indices, values)
79
+ vec = allocate
80
+ vec.instance_variable_set(:@dimensions, dimensions)
81
+ vec.instance_variable_set(:@indices, indices)
82
+ vec.instance_variable_set(:@values, values)
83
+ vec
84
+ end
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,25 @@
1
+ module Pgvector
2
+ class Vector
3
+ def initialize(data)
4
+ @data = data.to_a.map(&:to_f)
5
+ end
6
+
7
+ def self.from_text(string)
8
+ Vector.new(string[1..-2].split(",").map(&:to_f))
9
+ end
10
+
11
+ def self.from_binary(string)
12
+ dim, unused = string[0, 4].unpack("nn")
13
+ raise "expected unused to be 0" if unused != 0
14
+ Vector.new(string[4..-1].unpack("g#{dim}"))
15
+ end
16
+
17
+ def to_s
18
+ "[#{@data.to_a.map(&:to_f).join(",")}]"
19
+ end
20
+
21
+ def to_a
22
+ @data
23
+ end
24
+ end
25
+ end
@@ -1,3 +1,3 @@
1
1
  module Pgvector
2
- VERSION = "0.2.2"
2
+ VERSION = "0.3.0"
3
3
  end
data/lib/pgvector.rb CHANGED
@@ -1,14 +1,27 @@
1
1
  # modules
2
+ require_relative "pgvector/half_vector"
3
+ require_relative "pgvector/sparse_vector"
4
+ require_relative "pgvector/vector"
2
5
  require_relative "pgvector/version"
3
6
 
4
7
  module Pgvector
5
8
  autoload :PG, "pgvector/pg"
6
9
 
7
10
  def self.encode(data)
8
- "[#{data.to_a.map(&:to_f).join(",")}]"
11
+ if data.is_a?(SparseVector)
12
+ data.to_s
13
+ else
14
+ Vector.new(data).to_s
15
+ end
9
16
  end
10
17
 
11
18
  def self.decode(string)
12
- string[1..-2].split(",").map(&:to_f)
19
+ if string[0] == "["
20
+ Vector.from_text(string).to_a
21
+ elsif string[0] == "{"
22
+ SparseVector.from_text(string)
23
+ else
24
+ string
25
+ end
13
26
  end
14
27
  end
@@ -22,6 +22,12 @@ module Sequel
22
22
  "<=>"
23
23
  when "euclidean"
24
24
  "<->"
25
+ when "taxicab"
26
+ "<+>"
27
+ when "hamming"
28
+ "<~>"
29
+ when "jaccard"
30
+ "<%>"
25
31
  end
26
32
 
27
33
  raise ArgumentError, "Invalid distance: #{distance}" unless operator
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pgvector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-10-04 00:00:00.000000000 Z
11
+ date: 2024-06-26 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org
@@ -20,7 +20,10 @@ files:
20
20
  - LICENSE.txt
21
21
  - README.md
22
22
  - lib/pgvector.rb
23
+ - lib/pgvector/half_vector.rb
23
24
  - lib/pgvector/pg.rb
25
+ - lib/pgvector/sparse_vector.rb
26
+ - lib/pgvector/vector.rb
24
27
  - lib/pgvector/version.rb
25
28
  - lib/sequel/plugins/pgvector.rb
26
29
  homepage: https://github.com/pgvector/pgvector-ruby
@@ -35,14 +38,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
35
38
  requirements:
36
39
  - - ">="
37
40
  - !ruby/object:Gem::Version
38
- version: '3'
41
+ version: '3.1'
39
42
  required_rubygems_version: !ruby/object:Gem::Requirement
40
43
  requirements:
41
44
  - - ">="
42
45
  - !ruby/object:Gem::Version
43
46
  version: '0'
44
47
  requirements: []
45
- rubygems_version: 3.4.10
48
+ rubygems_version: 3.5.11
46
49
  signing_key:
47
50
  specification_version: 4
48
51
  summary: pgvector support for Ruby