neighbor 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/LICENSE.txt +1 -1
- data/README.md +11 -16
- data/lib/generators/neighbor/cube_generator.rb +1 -0
- data/lib/generators/neighbor/vector_generator.rb +1 -0
- data/lib/neighbor/model.rb +66 -37
- data/lib/neighbor/railtie.rb +4 -4
- data/lib/neighbor/sparse_vector.rb +79 -0
- data/lib/neighbor/type/cube.rb +42 -0
- data/lib/neighbor/type/halfvec.rb +28 -0
- data/lib/neighbor/type/sparsevec.rb +30 -0
- data/lib/neighbor/type/vector.rb +28 -0
- data/lib/neighbor/utils.rb +42 -0
- data/lib/neighbor/version.rb +1 -1
- data/lib/neighbor.rb +19 -4
- metadata +10 -5
- data/lib/neighbor/vector.rb +0 -65
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '09edc5a7eebbf6b14f06cb51340c5def49117a318340b4d2265321a8ce6a0bec'
|
4
|
+
data.tar.gz: fc8c8319cf715612f195836c84861eb327765355a0430f2d58fb5ab57857844e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: caa86d17e8a3f710988486264434767c33f8b197f9a8721d6dc762235a0bc959d5c186670f7518b9d628a771454861df1beb603a175ec804aa67cf6eb9e14361
|
7
|
+
data.tar.gz: 3ac9d60c57cc3e82b617820f205282b42684517070de22af5d94878959ef00e3758fb88f821ba3d3f2369602919a41d1706314f77436c8b3e5ef95acc38e3c17
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,21 @@
|
|
1
|
+
## 0.4.0 (2024-06-25)
|
2
|
+
|
3
|
+
- Added support for `halfvec` and `sparsevec` types
|
4
|
+
- Added support for `taxicab`, `hamming`, and `jaccard` distances with `vector` extension
|
5
|
+
- Added deserialization for `cube` and `vector` columns without `has_neighbor`
|
6
|
+
- Added support for composite primary keys
|
7
|
+
- Changed `nearest_neighbors` to replace previous `order` scopes
|
8
|
+
- Changed `normalize` option to use `before_save` callback
|
9
|
+
- Changed dimensions and finite values checks to use Active Record validations
|
10
|
+
- Fixed issue with `nearest_neighbors` scope overriding `select` values
|
11
|
+
- Removed default attribute name
|
12
|
+
- Dropped support for Ruby < 3.1
|
13
|
+
|
14
|
+
## 0.3.2 (2023-12-12)
|
15
|
+
|
16
|
+
- Added deprecation warning for `has_neighbors` without an attribute name
|
17
|
+
- Added deprecation warning for `nearest_neighbors` without an attribute name
|
18
|
+
|
1
19
|
## 0.3.1 (2023-09-25)
|
2
20
|
|
3
21
|
- Added support for passing multiple attributes to `has_neighbors`
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Nearest neighbor search for Rails and Postgres
|
4
4
|
|
5
|
-
[](https://github.com/ankane/neighbor/actions)
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -14,7 +14,7 @@ gem "neighbor"
|
|
14
14
|
|
15
15
|
## Choose An Extension
|
16
16
|
|
17
|
-
Neighbor supports two extensions: [cube](https://www.postgresql.org/docs/current/cube.html) and [vector](https://github.com/pgvector/pgvector). cube ships with Postgres, while vector supports approximate nearest neighbor search.
|
17
|
+
Neighbor supports two extensions: [cube](https://www.postgresql.org/docs/current/cube.html) and [vector](https://github.com/pgvector/pgvector). cube ships with Postgres, while vector supports more dimensions and approximate nearest neighbor search.
|
18
18
|
|
19
19
|
For cube, run:
|
20
20
|
|
@@ -35,7 +35,7 @@ rails db:migrate
|
|
35
35
|
Create a migration
|
36
36
|
|
37
37
|
```ruby
|
38
|
-
class
|
38
|
+
class AddEmbeddingToItems < ActiveRecord::Migration[7.1]
|
39
39
|
def change
|
40
40
|
add_column :items, :embedding, :cube
|
41
41
|
# or
|
@@ -114,27 +114,27 @@ end
|
|
114
114
|
For vector, add an approximate index to speed up queries. Create a migration with:
|
115
115
|
|
116
116
|
```ruby
|
117
|
-
class
|
117
|
+
class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.1]
|
118
118
|
def change
|
119
|
-
add_index :items, :embedding, using: :ivfflat, opclass: :vector_l2_ops
|
120
|
-
# or with pgvector 0.5.0+
|
121
119
|
add_index :items, :embedding, using: :hnsw, opclass: :vector_l2_ops
|
120
|
+
# or
|
121
|
+
add_index :items, :embedding, using: :ivfflat, opclass: :vector_l2_ops
|
122
122
|
end
|
123
123
|
end
|
124
124
|
```
|
125
125
|
|
126
126
|
Use `:vector_cosine_ops` for cosine distance and `:vector_ip_ops` for inner product.
|
127
127
|
|
128
|
-
Set the
|
128
|
+
Set the size of the dynamic candidate list with HNSW
|
129
129
|
|
130
130
|
```ruby
|
131
|
-
Item.connection.execute("SET
|
131
|
+
Item.connection.execute("SET hnsw.ef_search = 100")
|
132
132
|
```
|
133
133
|
|
134
|
-
Or the
|
134
|
+
Or the number of probes with IVFFlat
|
135
135
|
|
136
136
|
```ruby
|
137
|
-
Item.connection.execute("SET
|
137
|
+
Item.connection.execute("SET ivfflat.probes = 3")
|
138
138
|
```
|
139
139
|
|
140
140
|
## Examples
|
@@ -242,7 +242,7 @@ movies = []
|
|
242
242
|
recommender.item_ids.each do |item_id|
|
243
243
|
movies << {name: item_id, factors: recommender.item_factors(item_id)}
|
244
244
|
end
|
245
|
-
Movie.insert_all!(movies)
|
245
|
+
Movie.insert_all!(movies)
|
246
246
|
```
|
247
247
|
|
248
248
|
And get similar movies
|
@@ -286,10 +286,5 @@ git clone https://github.com/ankane/neighbor.git
|
|
286
286
|
cd neighbor
|
287
287
|
bundle install
|
288
288
|
createdb neighbor_test
|
289
|
-
|
290
|
-
# cube
|
291
289
|
bundle exec rake test
|
292
|
-
|
293
|
-
# vector
|
294
|
-
EXT=vector bundle exec rake test
|
295
290
|
```
|
data/lib/neighbor/model.rb
CHANGED
@@ -2,10 +2,9 @@ module Neighbor
|
|
2
2
|
module Model
|
3
3
|
def has_neighbors(*attribute_names, dimensions: nil, normalize: nil)
|
4
4
|
if attribute_names.empty?
|
5
|
-
|
6
|
-
else
|
7
|
-
attribute_names.map!(&:to_sym)
|
5
|
+
raise ArgumentError, "has_neighbors requires an attribute name"
|
8
6
|
end
|
7
|
+
attribute_names.map!(&:to_sym)
|
9
8
|
|
10
9
|
class_eval do
|
11
10
|
@neighbor_attributes ||= {}
|
@@ -26,29 +25,45 @@ module Neighbor
|
|
26
25
|
attribute_names.each do |attribute_name|
|
27
26
|
raise Error, "has_neighbors already called for #{attribute_name.inspect}" if neighbor_attributes[attribute_name]
|
28
27
|
@neighbor_attributes[attribute_name] = {dimensions: dimensions, normalize: normalize}
|
29
|
-
|
30
|
-
attribute attribute_name, Neighbor::Vector.new(dimensions: dimensions, normalize: normalize, model: self, attribute_name: attribute_name)
|
31
28
|
end
|
32
29
|
|
33
30
|
return if @neighbor_attributes.size != attribute_names.size
|
34
31
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
32
|
+
validate do
|
33
|
+
self.class.neighbor_attributes.each do |k, v|
|
34
|
+
value = read_attribute(k)
|
35
|
+
next if value.nil?
|
36
|
+
|
37
|
+
column_info = self.class.columns_hash[k.to_s]
|
38
|
+
dimensions = v[:dimensions] || column_info&.limit
|
39
|
+
|
40
|
+
if !Neighbor::Utils.validate_dimensions(value, column_info&.type, dimensions).nil?
|
41
|
+
errors.add(k, "must have #{dimensions} dimensions")
|
42
|
+
end
|
43
|
+
if !Neighbor::Utils.validate_finite(value, column_info&.type)
|
44
|
+
errors.add(k, "must have finite values")
|
45
|
+
end
|
41
46
|
end
|
47
|
+
end
|
48
|
+
|
49
|
+
# TODO move to normalizes when Active Record < 7.1 no longer supported
|
50
|
+
before_save do
|
51
|
+
self.class.neighbor_attributes.each do |k, v|
|
52
|
+
next unless v[:normalize]
|
53
|
+
value = read_attribute(k)
|
54
|
+
next if value.nil?
|
55
|
+
self[k] = Neighbor::Utils.normalize(value, column_info: self.class.columns_hash[k.to_s])
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
# cannot use keyword arguments with scope with Ruby 3.2 and Active Record 6.1
|
60
|
+
# https://github.com/rails/rails/issues/46934
|
61
|
+
scope :nearest_neighbors, ->(attribute_name, vector, options = nil) {
|
42
62
|
raise ArgumentError, "missing keyword: :distance" unless options.is_a?(Hash) && options.key?(:distance)
|
43
63
|
distance = options.delete(:distance)
|
44
64
|
raise ArgumentError, "unknown keywords: #{options.keys.map(&:inspect).join(", ")}" if options.any?
|
45
65
|
|
46
|
-
if vector.nil? && !attribute_name.nil? && attribute_name.respond_to?(:to_a)
|
47
|
-
vector = attribute_name
|
48
|
-
attribute_name = :neighbor_vector
|
49
|
-
end
|
50
66
|
attribute_name = attribute_name.to_sym
|
51
|
-
|
52
67
|
options = neighbor_attributes[attribute_name]
|
53
68
|
raise ArgumentError, "Invalid attribute" unless options
|
54
69
|
normalize = options[:normalize]
|
@@ -60,10 +75,21 @@ module Neighbor
|
|
60
75
|
|
61
76
|
quoted_attribute = "#{connection.quote_table_name(table_name)}.#{connection.quote_column_name(attribute_name)}"
|
62
77
|
|
63
|
-
column_info =
|
78
|
+
column_info = columns_hash[attribute_name.to_s]
|
79
|
+
column_type = column_info&.type
|
64
80
|
|
65
81
|
operator =
|
66
|
-
|
82
|
+
case column_type
|
83
|
+
when :bit
|
84
|
+
case distance
|
85
|
+
when "hamming"
|
86
|
+
"<~>"
|
87
|
+
when "jaccard"
|
88
|
+
"<%>"
|
89
|
+
when "hamming2"
|
90
|
+
"#"
|
91
|
+
end
|
92
|
+
when :vector, :halfvec, :sparsevec
|
67
93
|
case distance
|
68
94
|
when "inner_product"
|
69
95
|
"<#>"
|
@@ -71,8 +97,10 @@ module Neighbor
|
|
71
97
|
"<=>"
|
72
98
|
when "euclidean"
|
73
99
|
"<->"
|
100
|
+
when "taxicab"
|
101
|
+
"<+>"
|
74
102
|
end
|
75
|
-
|
103
|
+
when :cube
|
76
104
|
case distance
|
77
105
|
when "taxicab"
|
78
106
|
"<#>"
|
@@ -81,27 +109,27 @@ module Neighbor
|
|
81
109
|
when "euclidean", "cosine"
|
82
110
|
"<->"
|
83
111
|
end
|
112
|
+
else
|
113
|
+
raise ArgumentError, "Unsupported type: #{column_type}"
|
84
114
|
end
|
85
115
|
|
86
116
|
raise ArgumentError, "Invalid distance: #{distance}" unless operator
|
87
117
|
|
88
118
|
# ensure normalize set (can be true or false)
|
89
|
-
if distance == "cosine" &&
|
119
|
+
if distance == "cosine" && column_type == :cube && normalize.nil?
|
90
120
|
raise Neighbor::Error, "Set normalize for cosine distance with cube"
|
91
121
|
end
|
92
122
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
query =
|
98
|
-
if column_info[:type] == :vector
|
99
|
-
connection.quote("[#{vector.map(&:to_f).join(", ")}]")
|
100
|
-
else
|
101
|
-
"cube(array[#{vector.map(&:to_f).join(", ")}])"
|
102
|
-
end
|
123
|
+
column_attribute = klass.type_for_attribute(attribute_name)
|
124
|
+
vector = column_attribute.cast(vector)
|
125
|
+
Neighbor::Utils.validate(vector, dimensions: dimensions, column_info: column_info)
|
126
|
+
vector = Neighbor::Utils.normalize(vector, column_info: column_info) if normalize
|
103
127
|
|
128
|
+
query = connection.quote(column_attribute.serialize(vector))
|
104
129
|
order = "#{quoted_attribute} #{operator} #{query}"
|
130
|
+
if operator == "#"
|
131
|
+
order = "bit_count(#{order})"
|
132
|
+
end
|
105
133
|
|
106
134
|
# https://stats.stackexchange.com/questions/146221/is-cosine-similarity-identical-to-l2-normalized-euclidean-distance
|
107
135
|
# with normalized vectors:
|
@@ -109,27 +137,28 @@ module Neighbor
|
|
109
137
|
# cosine distance = 1 - cosine similarity
|
110
138
|
# this transformation doesn't change the order, so only needed for select
|
111
139
|
neighbor_distance =
|
112
|
-
if
|
140
|
+
if column_type == :cube && distance == "cosine"
|
113
141
|
"POWER(#{order}, 2) / 2.0"
|
114
|
-
elsif
|
142
|
+
elsif [:vector, :halfvec, :sparsevec].include?(column_type) && distance == "inner_product"
|
115
143
|
"(#{order}) * -1"
|
116
144
|
else
|
117
145
|
order
|
118
146
|
end
|
119
147
|
|
120
148
|
# for select, use column_names instead of * to account for ignored columns
|
121
|
-
|
149
|
+
select_columns = select_values.any? ? [] : column_names
|
150
|
+
select(*select_columns, "#{neighbor_distance} AS neighbor_distance")
|
122
151
|
.where.not(attribute_name => nil)
|
123
|
-
.
|
152
|
+
.reorder(Arel.sql(order))
|
124
153
|
}
|
125
154
|
|
126
|
-
def nearest_neighbors(attribute_name
|
155
|
+
def nearest_neighbors(attribute_name, **options)
|
127
156
|
attribute_name = attribute_name.to_sym
|
128
|
-
# important! check if neighbor attribute before
|
157
|
+
# important! check if neighbor attribute before accessing
|
129
158
|
raise ArgumentError, "Invalid attribute" unless self.class.neighbor_attributes[attribute_name]
|
130
159
|
|
131
160
|
self.class
|
132
|
-
.where.not(self.class.primary_key
|
161
|
+
.where.not(Array(self.class.primary_key).to_h { |k| [k, self[k]] })
|
133
162
|
.nearest_neighbors(attribute_name, self[attribute_name], **options)
|
134
163
|
end
|
135
164
|
end
|
data/lib/neighbor/railtie.rb
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
module Neighbor
|
2
2
|
class Railtie < Rails::Railtie
|
3
3
|
generators do
|
4
|
+
require "rails/generators/generated_attribute"
|
5
|
+
|
4
6
|
# rails generate model Item embedding:vector{3}
|
5
|
-
|
6
|
-
Rails::Generators::GeneratedAttribute.singleton_class.prepend(Neighbor::GeneratedAttribute)
|
7
|
-
end
|
7
|
+
Rails::Generators::GeneratedAttribute.singleton_class.prepend(Neighbor::GeneratedAttribute)
|
8
8
|
end
|
9
9
|
end
|
10
10
|
|
11
11
|
module GeneratedAttribute
|
12
12
|
def parse_type_and_options(type, *, **)
|
13
|
-
if type =~ /\A(vector)\{(\d+)\}\z/
|
13
|
+
if type =~ /\A(vector|halfvec|sparsevec)\{(\d+)\}\z/
|
14
14
|
return $1, limit: $2.to_i
|
15
15
|
end
|
16
16
|
super
|
@@ -0,0 +1,79 @@
|
|
1
|
+
module Neighbor
  # In-memory representation of a sparse vector.
  #
  # Only non-zero entries are stored: +indices+ holds their zero-based
  # positions, +values+ holds the matching Floats, and +dimensions+ is the
  # total length of the vector.
  class SparseVector
    attr_reader :dimensions, :indices, :values

    # Sentinel that distinguishes "dimensions not passed" from any real value.
    NO_DEFAULT = Object.new

    # Builds a sparse vector.
    #
    # value      - a Hash of zero-based index => value (+dimensions+ is then
    #              required), or any object responding to +to_a+
    #              (+dimensions+ must then be omitted).
    # dimensions - total number of dimensions; only valid with a Hash.
    #
    # Raises ArgumentError when +dimensions+ is missing for a Hash or
    # supplied for a non-Hash value.
    def initialize(value, dimensions = NO_DEFAULT)
      if value.is_a?(Hash)
        if dimensions == NO_DEFAULT
          raise ArgumentError, "missing dimensions"
        end
        from_hash(value, dimensions)
      else
        unless dimensions == NO_DEFAULT
          raise ArgumentError, "extra argument"
        end
        from_array(value)
      end
    end

    # Text form "{index:value,...}/dimensions" using one-based indices
    # (the inverse of SparseVector.from_text).
    def to_s
      "{#{@indices.zip(@values).map { |i, v| "#{i.to_i + 1}:#{v.to_f}" }.join(",")}}/#{@dimensions.to_i}"
    end

    # Dense Array of length +dimensions+ with 0.0 in every unset position.
    def to_a
      arr = Array.new(dimensions, 0.0)
      @indices.zip(@values) do |i, v|
        arr[i] = v
      end
      arr
    end

    # Value equality: two sparse vectors are equal when they are of the same
    # class and have the same dimensions, indices and values. Without this,
    # comparison falls back to object identity, so identical vectors built
    # separately (e.g. one parsed from text, one built from an Array) would
    # never compare equal.
    def ==(other)
      other.instance_of?(self.class) &&
        dimensions == other.dimensions &&
        indices == other.indices &&
        values == other.values
    end
    alias_method :eql?, :==

    # Hash code consistent with #== so instances work as Hash keys and with
    # Array#uniq.
    def hash
      [self.class, dimensions, indices, values].hash
    end

    private

    # Populates the instance from a Hash of index => value, dropping zero
    # entries and ordering the remainder by index.
    def from_hash(data, dimensions)
      elements = data.select { |_, v| v != 0 }.sort
      @dimensions = dimensions.to_i
      @indices = elements.map { |v| v[0].to_i }
      @values = elements.map { |v| v[1].to_f }
    end

    # Populates the instance from a dense array-like, keeping only the
    # non-zero entries.
    def from_array(arr)
      arr = arr.to_a
      @dimensions = arr.size
      @indices = []
      @values = []
      arr.each_with_index do |v, i|
        if v != 0
          @indices << i
          @values << v.to_f
        end
      end
    end

    class << self
      # Parses the "{index:value,...}/dimensions" text form (one-based
      # indices) into a SparseVector.
      def from_text(string)
        elements, dimensions = string.split("/", 2)
        indices = []
        values = []
        # strip the surrounding braces before splitting the entries
        elements[1..-2].split(",").each do |e|
          index, value = e.split(":", 2)
          indices << index.to_i - 1
          values << value.to_f
        end
        from_parts(dimensions.to_i, indices, values)
      end

      private

      # Builds an instance directly from already-parsed parts, bypassing
      # #initialize.
      def from_parts(dimensions, indices, values)
        vec = allocate
        vec.instance_variable_set(:@dimensions, dimensions)
        vec.instance_variable_set(:@indices, indices)
        vec.instance_variable_set(:@values, values)
        vec
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module Neighbor
  module Type
    # Attribute type reporting itself as :cube. Values are Arrays of numbers
    # (a single point) or Arrays of such Arrays (several points).
    class Cube < ActiveRecord::Type::Value
      def type
        :cube
      end

      # Arrays are rendered as "(x, y, ...)" point text (comma-joined when
      # nested); anything else is handed to the parent unchanged.
      def serialize(value)
        return super(value) unless value.is_a?(Array)

        text =
          if value.first.is_a?(Array)
            value.map { |point| serialize_point(point) }.join(", ")
          else
            serialize_point(value)
          end
        super(text)
      end

      private

      # Arrays pass through, a bare number becomes a one-element Array, and
      # text is parsed into Floats (nested when it holds several points).
      def cast_value(value)
        case value
        when Array
          value
        when Numeric
          [value]
        when String
          # drop the leading "("; a trailing ")" is ignored by to_f
          body = value[1..-1]
          if value.include?("),(")
            body.split("),(").map { |point| point.split(",").map(&:to_f) }
          else
            body.split(",").map(&:to_f)
          end
        else
          raise "can't cast #{value.class.name} to cube"
        end
      end

      # Renders one point as "(x, y, ...)" with every coordinate as a Float.
      def serialize_point(value)
        coordinates = value.map(&:to_f)
        "(#{coordinates.join(", ")})"
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Neighbor
  module Type
    # Attribute type reporting itself as :halfvec. Values are Arrays of
    # numbers; the text form is "[x,y,...]".
    class Halfvec < ActiveRecord::Type::Value
      def type
        :halfvec
      end

      # Arrays are rendered as "[x,y,...]" with Float elements; anything
      # else is handed to the parent unchanged.
      def serialize(value)
        return super(value) unless value.is_a?(Array)

        floats = value.map(&:to_f)
        super("[#{floats.join(",")}]")
      end

      private

      # Text is parsed into an Array of Floats; Arrays pass through as-is.
      def cast_value(value)
        case value
        when String
          # drop the leading "["; a trailing "]" is ignored by to_f
          value[1..-1].split(",").map(&:to_f)
        when Array
          value
        else
          raise "can't cast #{value.class.name} to halfvec"
        end
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Neighbor
  module Type
    # Attribute type reporting itself as :sparsevec. Cast values are
    # SparseVector instances.
    class Sparsevec < ActiveRecord::Type::Value
      def type
        :sparsevec
      end

      # A SparseVector is rendered through SparseVector#to_s, which produces
      # the "{index:value,...}/dimensions" text; anything else is handed to
      # the parent unchanged.
      def serialize(value)
        value = value.to_s if value.is_a?(SparseVector)
        super(value)
      end

      private

      # Returns a SparseVector: passed through as-is, parsed from text, or
      # built from a dense Array. Raises for any other input.
      def cast_value(value)
        case value
        when SparseVector
          value
        when String
          SparseVector.from_text(value)
        when Array
          SparseVector.new(value)
        else
          raise "can't cast #{value.class.name} to sparsevec"
        end
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Neighbor
  module Type
    # Attribute type reporting itself as :vector. Values are Arrays of
    # numbers; the text form is "[x,y,...]".
    class Vector < ActiveRecord::Type::Value
      def type
        :vector
      end

      # Arrays are rendered as "[x,y,...]" with Float elements; anything
      # else is handed to the parent unchanged.
      def serialize(value)
        return super(value) unless value.is_a?(Array)

        floats = value.map(&:to_f)
        super("[#{floats.join(",")}]")
      end

      private

      # Text is parsed into an Array of Floats; Arrays pass through as-is.
      def cast_value(value)
        case value
        when String
          # drop the leading "["; a trailing "]" is ignored by to_f
          value[1..-1].split(",").map(&:to_f)
        when Array
          value
        else
          raise "can't cast #{value.class.name} to vector"
        end
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module Neighbor
  # Helpers for checking and normalizing vector values.
  module Utils
    class << self
      # Returns an error message when +value+ does not have +expected+
      # dimensions, or nil when it matches or no expectation is given.
      # For :sparsevec the size is read from value.dimensions, otherwise
      # from value.size.
      def validate_dimensions(value, type, expected)
        actual = type == :sparsevec ? value.dimensions : value.size
        return if !expected || actual == expected

        "Expected #{expected} dimensions, not #{actual}"
      end

      # True when every element is finite. :bit values are always accepted;
      # for :sparsevec only the stored values are inspected.
      def validate_finite(value, type)
        return true if type == :bit

        elements = type == :sparsevec ? value.values : value
        elements.all?(&:finite?)
      end

      # Raises Error when +value+ has the wrong number of dimensions
      # (explicit +dimensions+, else the column limit) or contains
      # non-finite elements.
      def validate(value, dimensions:, column_info:)
        type = column_info&.type
        message = validate_dimensions(value, type, dimensions || column_info&.limit)
        raise Error, message if message

        raise Error, "Values must be finite" unless validate_finite(value, type)
      end

      # Scales +value+ to unit Euclidean length. Only :cube, :vector and
      # :halfvec columns are supported; other types raise Error.
      def normalize(value, column_info:)
        supported = [:cube, :vector, :halfvec].include?(column_info&.type)
        raise Error, "Normalize not supported for type" unless supported

        norm = Math.sqrt(value.sum { |v| v * v })

        # a zero vector is returned untouched (all zeros): dividing by a
        # zero norm would produce NaN, which makes the distance always 0
        return value unless norm > 0

        value.map { |v| v / norm }
      end
    end
  end
end
|
data/lib/neighbor/version.rb
CHANGED
data/lib/neighbor.rb
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
require "active_support"
|
3
3
|
|
4
4
|
# modules
|
5
|
+
require_relative "neighbor/sparse_vector"
|
6
|
+
require_relative "neighbor/utils"
|
5
7
|
require_relative "neighbor/version"
|
6
8
|
|
7
9
|
module Neighbor
|
@@ -10,10 +12,18 @@ module Neighbor
|
|
10
12
|
module RegisterTypes
|
11
13
|
def initialize_type_map(m = type_map)
|
12
14
|
super
|
13
|
-
m.register_type "cube",
|
15
|
+
m.register_type "cube", Type::Cube.new
|
16
|
+
m.register_type "halfvec" do |_, _, sql_type|
|
17
|
+
limit = extract_limit(sql_type)
|
18
|
+
Type::Halfvec.new(limit: limit)
|
19
|
+
end
|
20
|
+
m.register_type "sparsevec" do |_, _, sql_type|
|
21
|
+
limit = extract_limit(sql_type)
|
22
|
+
Type::Sparsevec.new(limit: limit)
|
23
|
+
end
|
14
24
|
m.register_type "vector" do |_, _, sql_type|
|
15
25
|
limit = extract_limit(sql_type)
|
16
|
-
|
26
|
+
Type::Vector.new(limit: limit)
|
17
27
|
end
|
18
28
|
end
|
19
29
|
end
|
@@ -21,7 +31,10 @@ end
|
|
21
31
|
|
22
32
|
ActiveSupport.on_load(:active_record) do
|
23
33
|
require_relative "neighbor/model"
|
24
|
-
require_relative "neighbor/
|
34
|
+
require_relative "neighbor/type/cube"
|
35
|
+
require_relative "neighbor/type/halfvec"
|
36
|
+
require_relative "neighbor/type/sparsevec"
|
37
|
+
require_relative "neighbor/type/vector"
|
25
38
|
|
26
39
|
extend Neighbor::Model
|
27
40
|
|
@@ -29,10 +42,12 @@ ActiveSupport.on_load(:active_record) do
|
|
29
42
|
|
30
43
|
# ensure schema can be dumped
|
31
44
|
ActiveRecord::ConnectionAdapters::PostgreSQLAdapter::NATIVE_DATABASE_TYPES[:cube] = {name: "cube"}
|
45
|
+
ActiveRecord::ConnectionAdapters::PostgreSQLAdapter::NATIVE_DATABASE_TYPES[:halfvec] = {name: "halfvec"}
|
46
|
+
ActiveRecord::ConnectionAdapters::PostgreSQLAdapter::NATIVE_DATABASE_TYPES[:sparsevec] = {name: "sparsevec"}
|
32
47
|
ActiveRecord::ConnectionAdapters::PostgreSQLAdapter::NATIVE_DATABASE_TYPES[:vector] = {name: "vector"}
|
33
48
|
|
34
49
|
# ensure schema can be loaded
|
35
|
-
ActiveRecord::ConnectionAdapters::TableDefinition.send(:define_column_methods, :cube, :vector)
|
50
|
+
ActiveRecord::ConnectionAdapters::TableDefinition.send(:define_column_methods, :cube, :halfvec, :sparsevec, :vector)
|
36
51
|
|
37
52
|
# prevent unknown OID warning
|
38
53
|
if ActiveRecord::VERSION::MAJOR >= 7
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: neighbor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-06-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|
@@ -40,7 +40,12 @@ files:
|
|
40
40
|
- lib/neighbor.rb
|
41
41
|
- lib/neighbor/model.rb
|
42
42
|
- lib/neighbor/railtie.rb
|
43
|
-
- lib/neighbor/
|
43
|
+
- lib/neighbor/sparse_vector.rb
|
44
|
+
- lib/neighbor/type/cube.rb
|
45
|
+
- lib/neighbor/type/halfvec.rb
|
46
|
+
- lib/neighbor/type/sparsevec.rb
|
47
|
+
- lib/neighbor/type/vector.rb
|
48
|
+
- lib/neighbor/utils.rb
|
44
49
|
- lib/neighbor/version.rb
|
45
50
|
homepage: https://github.com/ankane/neighbor
|
46
51
|
licenses:
|
@@ -54,14 +59,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
54
59
|
requirements:
|
55
60
|
- - ">="
|
56
61
|
- !ruby/object:Gem::Version
|
57
|
-
version: '3'
|
62
|
+
version: '3.1'
|
58
63
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
59
64
|
requirements:
|
60
65
|
- - ">="
|
61
66
|
- !ruby/object:Gem::Version
|
62
67
|
version: '0'
|
63
68
|
requirements: []
|
64
|
-
rubygems_version: 3.
|
69
|
+
rubygems_version: 3.5.11
|
65
70
|
signing_key:
|
66
71
|
specification_version: 4
|
67
72
|
summary: Nearest neighbor search for Rails and Postgres
|
data/lib/neighbor/vector.rb
DELETED
@@ -1,65 +0,0 @@
|
|
1
|
-
module Neighbor
|
2
|
-
class Vector < ActiveRecord::Type::Value
|
3
|
-
def initialize(dimensions:, normalize:, model:, attribute_name:)
|
4
|
-
super()
|
5
|
-
@dimensions = dimensions
|
6
|
-
@normalize = normalize
|
7
|
-
@model = model
|
8
|
-
@attribute_name = attribute_name
|
9
|
-
end
|
10
|
-
|
11
|
-
def self.cast(value, dimensions:, normalize:, column_info:)
|
12
|
-
value = value.to_a.map(&:to_f)
|
13
|
-
|
14
|
-
dimensions ||= column_info[:dimensions]
|
15
|
-
raise Error, "Expected #{dimensions} dimensions, not #{value.size}" if dimensions && value.size != dimensions
|
16
|
-
|
17
|
-
raise Error, "Values must be finite" unless value.all?(&:finite?)
|
18
|
-
|
19
|
-
if normalize
|
20
|
-
norm = Math.sqrt(value.sum { |v| v * v })
|
21
|
-
|
22
|
-
# store zero vector as all zeros
|
23
|
-
# since NaN makes the distance always 0
|
24
|
-
# could also throw error
|
25
|
-
|
26
|
-
# safe to update in-place since earlier map dups
|
27
|
-
value.map! { |v| v / norm } if norm > 0
|
28
|
-
end
|
29
|
-
|
30
|
-
value
|
31
|
-
end
|
32
|
-
|
33
|
-
def self.column_info(model, attribute_name)
|
34
|
-
attribute_name = attribute_name.to_s
|
35
|
-
column = model.columns.detect { |c| c.name == attribute_name }
|
36
|
-
{
|
37
|
-
type: column.try(:type),
|
38
|
-
dimensions: column.try(:limit)
|
39
|
-
}
|
40
|
-
end
|
41
|
-
|
42
|
-
# need to be careful to avoid loading column info before needed
|
43
|
-
def column_info
|
44
|
-
@column_info ||= self.class.column_info(@model, @attribute_name)
|
45
|
-
end
|
46
|
-
|
47
|
-
def cast(value)
|
48
|
-
self.class.cast(value, dimensions: @dimensions, normalize: @normalize, column_info: column_info) unless value.nil?
|
49
|
-
end
|
50
|
-
|
51
|
-
def serialize(value)
|
52
|
-
unless value.nil?
|
53
|
-
if column_info[:type] == :vector
|
54
|
-
"[#{cast(value).join(", ")}]"
|
55
|
-
else
|
56
|
-
"(#{cast(value).join(", ")})"
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
def deserialize(value)
|
62
|
-
value[1..-1].split(",").map(&:to_f) unless value.nil?
|
63
|
-
end
|
64
|
-
end
|
65
|
-
end
|