pgvector 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7485ea4be0d5be0177a972db911c696daf3438a661ddac61b08f4e8b2da3ac51
4
- data.tar.gz: 2532ef79f5db88aecb681d9455e38f3e5fc1d30bde015d0a1e9daaa9fe82635e
3
+ metadata.gz: 07a80636c13841d2fa97f8b740feb095c1c2f94fa3691b374d1472bba918bddf
4
+ data.tar.gz: 50c9e67781fbbe23fa3dbe2f87790025f18d3686e412b46f62d1cb0d2995ecb3
5
5
  SHA512:
6
- metadata.gz: be40e4c3e16dd904a200115794a8ffaa850b40c5055330bd873ec7a707164a53b29d22040defbb4dd8a9cff597d4e1ad5c659d37a580ccc201ce30a9eb17fef9
7
- data.tar.gz: f5d36289b043d987920911ab08d85d7d1066039f84dcc2a24436701c06b246adcb8fd3c32e3f76f3e1604001403574c14818f91a64357ce4ebd675458e57184c
6
+ metadata.gz: f0aa5b733e1bc4022c2052be6b04aa0656bf306f588baf2907827674aeadc7dc52507895043049246634999dea37de629c19a164bd289fbd979eb628aaab0c33
7
+ data.tar.gz: f52e2e9c1c074c4f809fe78d6926bcb630694d79199e288ebb33d136a05191c4b2fff6038f8d7fc56d4291f949caa1b536d942aa7d0912be2daccf5a7ee23f83
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.3.0 (2024-06-25)
2
+
3
+ - Added support for `halfvec` and `sparsevec` types
4
+ - Added `taxicab`, `hamming`, and `jaccard` distances for Sequel
5
+ - Dropped support for Ruby < 3.1
6
+
1
7
  ## 0.2.2 (2023-10-03)
2
8
 
3
9
  - Added `nearest_neighbors` method to datasets with Sequel
data/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
1
1
  The MIT License (MIT)
2
2
 
3
- Copyright (c) 2022-2023 Andrew Kane
3
+ Copyright (c) 2022-2024 Andrew Kane
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -6,7 +6,7 @@ Supports [pg](https://github.com/ged/ruby-pg) and [Sequel](https://github.com/je
6
6
 
7
7
  For Rails, check out [Neighbor](https://github.com/ankane/neighbor)
8
8
 
9
- [![Build Status](https://github.com/pgvector/pgvector-ruby/workflows/build/badge.svg?branch=master)](https://github.com/pgvector/pgvector-ruby/actions)
9
+ [![Build Status](https://github.com/pgvector/pgvector-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/pgvector/pgvector-ruby/actions)
10
10
 
11
11
  ## Installation
12
12
 
@@ -26,6 +26,7 @@ Or check out some examples:
26
26
  - [Embeddings](examples/openai_embeddings.rb) with OpenAI
27
27
  - [User-based recommendations](examples/disco_user_recs.rb) with Disco
28
28
  - [Item-based recommendations](examples/disco_item_recs.rb) with Disco
29
+ - [Bulk loading](examples/bulk_loading.rb) with `COPY`
29
30
 
30
31
  ## pg
31
32
 
@@ -35,7 +36,7 @@ Enable the extension
35
36
  conn.exec("CREATE EXTENSION IF NOT EXISTS vector")
36
37
  ```
37
38
 
38
- Register the vector type with your connection
39
+ Optionally enable type casting for results
39
40
 
40
41
  ```ruby
41
42
  registry = PG::BasicTypeRegistry.new.define_default_types
@@ -43,6 +44,12 @@ Pgvector::PG.register_vector(registry)
43
44
  conn.type_map_for_results = PG::BasicTypeMapForResults.new(conn, registry: registry)
44
45
  ```
45
46
 
47
+ Create a table
48
+
49
+ ```ruby
50
+ conn.exec("CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))")
51
+ ```
52
+
46
53
  Insert a vector
47
54
 
48
55
  ```ruby
@@ -56,6 +63,16 @@ Get the nearest neighbors to a vector
56
63
  conn.exec_params("SELECT * FROM items ORDER BY embedding <-> $1 LIMIT 5", [embedding]).to_a
57
64
  ```
58
65
 
66
+ Add an approximate index
67
+
68
+ ```ruby
69
+ conn.exec("CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)")
70
+ # or
71
+ conn.exec("CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)")
72
+ ```
73
+
74
+ Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance
75
+
59
76
  ## Sequel
60
77
 
61
78
  Enable the extension
@@ -93,7 +110,7 @@ Get the nearest neighbors to a record
93
110
  item.nearest_neighbors(:embedding, distance: "euclidean").limit(5)
94
111
  ```
95
112
 
96
- Also supports `inner_product` and `cosine` distance
113
+ Also supports `inner_product`, `cosine`, `taxicab`, `hamming`, and `jaccard` distance
97
114
 
98
115
  Get the nearest neighbors to a vector
99
116
 
@@ -101,6 +118,14 @@ Get the nearest neighbors to a vector
101
118
  Item.nearest_neighbors(:embedding, [1, 1, 1], distance: "euclidean").limit(5)
102
119
  ```
103
120
 
121
+ Add an approximate index
122
+
123
+ ```ruby
124
+ DB.add_index :items, :embedding, type: "hnsw", opclass: "vector_l2_ops"
125
+ ```
126
+
127
+ Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance
128
+
104
129
  ## History
105
130
 
106
131
  View the [changelog](https://github.com/pgvector/pgvector-ruby/blob/master/CHANGELOG.md)
@@ -0,0 +1,19 @@
1
+ module Pgvector
2
+ class HalfVector
3
+ def initialize(data)
4
+ @data = data.to_a.map(&:to_f)
5
+ end
6
+
7
+ def self.from_text(string)
8
+ new(string[1..-2].split(",").map(&:to_f))
9
+ end
10
+
11
+ def to_s
12
+ "[#{@data.to_a.map(&:to_f).join(",")}]"
13
+ end
14
+
15
+ def to_a
16
+ @data
17
+ end
18
+ end
19
+ end
data/lib/pgvector/pg.rb CHANGED
@@ -5,14 +5,24 @@ module Pgvector
5
5
  def self.register_vector(registry)
6
6
  registry.register_type(0, "vector", nil, TextDecoder::Vector)
7
7
  registry.register_type(1, "vector", nil, BinaryDecoder::Vector)
8
+
9
+ # no binary decoder for halfvec since unpack does not have directive for half-precision
10
+ registry.register_type(0, "halfvec", nil, TextDecoder::Halfvec)
11
+
12
+ registry.register_type(0, "sparsevec", nil, TextDecoder::Sparsevec)
13
+ registry.register_type(1, "sparsevec", nil, BinaryDecoder::Sparsevec)
8
14
  end
9
15
 
10
16
  module BinaryDecoder
11
17
  class Vector < ::PG::SimpleDecoder
12
18
  def decode(string, tuple = nil, field = nil)
13
- dim, unused = string[0, 4].unpack("nn")
14
- raise "expected unused to be 0" if unused != 0
15
- string[4..-1].unpack("g#{dim}")
19
+ ::Pgvector::Vector.from_binary(string).to_a
20
+ end
21
+ end
22
+
23
+ class Sparsevec < ::PG::SimpleDecoder
24
+ def decode(string, tuple = nil, field = nil)
25
+ SparseVector.from_binary(string)
16
26
  end
17
27
  end
18
28
  end
@@ -20,7 +30,19 @@ module Pgvector
20
30
  module TextDecoder
21
31
  class Vector < ::PG::SimpleDecoder
22
32
  def decode(string, tuple = nil, field = nil)
23
- Pgvector.decode(string)
33
+ ::Pgvector::Vector.from_text(string).to_a
34
+ end
35
+ end
36
+
37
+ class Halfvec < ::PG::SimpleDecoder
38
+ def decode(string, tuple = nil, field = nil)
39
+ HalfVector.from_text(string).to_a
40
+ end
41
+ end
42
+
43
+ class Sparsevec < ::PG::SimpleDecoder
44
+ def decode(string, tuple = nil, field = nil)
45
+ SparseVector.from_text(string)
24
46
  end
25
47
  end
26
48
  end
@@ -0,0 +1,87 @@
1
+ module Pgvector
2
+ class SparseVector
3
+ attr_reader :dimensions, :indices, :values
4
+
5
+ NO_DEFAULT = Object.new
6
+
7
+ def initialize(value, dimensions = NO_DEFAULT)
8
+ if value.is_a?(Hash)
9
+ if dimensions == NO_DEFAULT
10
+ raise ArgumentError, "missing dimensions"
11
+ end
12
+ from_hash(value, dimensions)
13
+ else
14
+ unless dimensions == NO_DEFAULT
15
+ raise ArgumentError, "extra argument"
16
+ end
17
+ from_array(value)
18
+ end
19
+ end
20
+
21
+ def to_s
22
+ "{#{@indices.zip(@values).map { |i, v| "#{i.to_i + 1}:#{v.to_f}" }.join(",")}}/#{@dimensions.to_i}"
23
+ end
24
+
25
+ def to_a
26
+ arr = Array.new(dimensions, 0.0)
27
+ @indices.zip(@values) do |i, v|
28
+ arr[i] = v
29
+ end
30
+ arr
31
+ end
32
+
33
+ private
34
+
35
+ def from_hash(data, dimensions)
36
+ elements = data.select { |_, v| v != 0 }.sort
37
+ @dimensions = dimensions.to_i
38
+ @indices = elements.map { |v| v[0].to_i }
39
+ @values = elements.map { |v| v[1].to_f }
40
+ end
41
+
42
+ def from_array(arr)
43
+ arr = arr.to_a
44
+ @dimensions = arr.size
45
+ @indices = []
46
+ @values = []
47
+ arr.each_with_index do |v, i|
48
+ if v != 0
49
+ @indices << i
50
+ @values << v.to_f
51
+ end
52
+ end
53
+ end
54
+
55
+ class << self
56
+ def from_text(string)
57
+ elements, dimensions = string.split("/", 2)
58
+ indices = []
59
+ values = []
60
+ elements[1..-2].split(",").each do |e|
61
+ index, value = e.split(":", 2)
62
+ indices << index.to_i - 1
63
+ values << value.to_f
64
+ end
65
+ from_parts(dimensions.to_i, indices, values)
66
+ end
67
+
68
+ def from_binary(string)
69
+ dim, nnz, unused = string[0, 12].unpack("l>l>l>")
70
+ raise "expected unused to be 0" if unused != 0
71
+ indices = string[12, nnz * 4].unpack("l>#{nnz}")
72
+ values = string[(12 + nnz * 4)..-1].unpack("g#{nnz}")
73
+ from_parts(dim, indices, values)
74
+ end
75
+
76
+ private
77
+
78
+ def from_parts(dimensions, indices, values)
79
+ vec = allocate
80
+ vec.instance_variable_set(:@dimensions, dimensions)
81
+ vec.instance_variable_set(:@indices, indices)
82
+ vec.instance_variable_set(:@values, values)
83
+ vec
84
+ end
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,25 @@
1
+ module Pgvector
2
+ class Vector
3
+ def initialize(data)
4
+ @data = data.to_a.map(&:to_f)
5
+ end
6
+
7
+ def self.from_text(string)
8
+ Vector.new(string[1..-2].split(",").map(&:to_f))
9
+ end
10
+
11
+ def self.from_binary(string)
12
+ dim, unused = string[0, 4].unpack("nn")
13
+ raise "expected unused to be 0" if unused != 0
14
+ Vector.new(string[4..-1].unpack("g#{dim}"))
15
+ end
16
+
17
+ def to_s
18
+ "[#{@data.to_a.map(&:to_f).join(",")}]"
19
+ end
20
+
21
+ def to_a
22
+ @data
23
+ end
24
+ end
25
+ end
@@ -1,3 +1,3 @@
1
1
  module Pgvector
2
- VERSION = "0.2.2"
2
+ VERSION = "0.3.0"
3
3
  end
data/lib/pgvector.rb CHANGED
@@ -1,14 +1,27 @@
1
1
  # modules
2
+ require_relative "pgvector/half_vector"
3
+ require_relative "pgvector/sparse_vector"
4
+ require_relative "pgvector/vector"
2
5
  require_relative "pgvector/version"
3
6
 
4
7
  module Pgvector
5
8
  autoload :PG, "pgvector/pg"
6
9
 
7
10
  def self.encode(data)
8
- "[#{data.to_a.map(&:to_f).join(",")}]"
11
+ if data.is_a?(SparseVector)
12
+ data.to_s
13
+ else
14
+ Vector.new(data).to_s
15
+ end
9
16
  end
10
17
 
11
18
  def self.decode(string)
12
- string[1..-2].split(",").map(&:to_f)
19
+ if string[0] == "["
20
+ Vector.from_text(string).to_a
21
+ elsif string[0] == "{"
22
+ SparseVector.from_text(string)
23
+ else
24
+ string
25
+ end
13
26
  end
14
27
  end
@@ -22,6 +22,12 @@ module Sequel
22
22
  "<=>"
23
23
  when "euclidean"
24
24
  "<->"
25
+ when "taxicab"
26
+ "<+>"
27
+ when "hamming"
28
+ "<~>"
29
+ when "jaccard"
30
+ "<%>"
25
31
  end
26
32
 
27
33
  raise ArgumentError, "Invalid distance: #{distance}" unless operator
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pgvector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-10-04 00:00:00.000000000 Z
11
+ date: 2024-06-26 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org
@@ -20,7 +20,10 @@ files:
20
20
  - LICENSE.txt
21
21
  - README.md
22
22
  - lib/pgvector.rb
23
+ - lib/pgvector/half_vector.rb
23
24
  - lib/pgvector/pg.rb
25
+ - lib/pgvector/sparse_vector.rb
26
+ - lib/pgvector/vector.rb
24
27
  - lib/pgvector/version.rb
25
28
  - lib/sequel/plugins/pgvector.rb
26
29
  homepage: https://github.com/pgvector/pgvector-ruby
@@ -35,14 +38,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
35
38
  requirements:
36
39
  - - ">="
37
40
  - !ruby/object:Gem::Version
38
- version: '3'
41
+ version: '3.1'
39
42
  required_rubygems_version: !ruby/object:Gem::Requirement
40
43
  requirements:
41
44
  - - ">="
42
45
  - !ruby/object:Gem::Version
43
46
  version: '0'
44
47
  requirements: []
45
- rubygems_version: 3.4.10
48
+ rubygems_version: 3.5.11
46
49
  signing_key:
47
50
  specification_version: 4
48
51
  summary: pgvector support for Ruby