neighbor 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +299 -20
- data/lib/neighbor/model.rb +14 -1
- data/lib/neighbor/railtie.rb +1 -1
- data/lib/neighbor/type/cube.rb +4 -3
- data/lib/neighbor/type/halfvec.rb +4 -4
- data/lib/neighbor/type/sparsevec.rb +2 -2
- data/lib/neighbor/type/vector.rb +4 -4
- data/lib/neighbor/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8aa6de2790d94de9411b0142836b2ad181a411e299fce4b98357b96ac4161183
|
4
|
+
data.tar.gz: 2924d7f15f5b36bc89ee72372c1bfeb373d99481269696a9a9dcc41f90201f38
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2bc1b3ee6d5b1ee0ab175b017e753cf958bd8ceb1ef2a23ba769770dfebf54eec251ac59c8f5f3b6ca56efcbad1763c34622b94924a017622c2f78fc8740f762
|
7
|
+
data.tar.gz: d946dda99833964582f63863b2d898fea6bf065312cf60aec873631df96195e1a54375606ad9c9cc0f767937cdb7ea38b0d9990efcbbeab15ccbb11f8a2020ef
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -35,7 +35,7 @@ rails db:migrate
|
|
35
35
|
Create a migration
|
36
36
|
|
37
37
|
```ruby
|
38
|
-
class AddEmbeddingToItems < ActiveRecord::Migration[7.
|
38
|
+
class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
|
39
39
|
def change
|
40
40
|
add_column :items, :embedding, :cube
|
41
41
|
# or
|
@@ -76,9 +76,11 @@ Supported values are:
|
|
76
76
|
|
77
77
|
- `euclidean`
|
78
78
|
- `cosine`
|
79
|
-
- `taxicab`
|
79
|
+
- `taxicab`
|
80
80
|
- `chebyshev` (cube only)
|
81
81
|
- `inner_product` (vector only)
|
82
|
+
- `hamming` (vector only)
|
83
|
+
- `jaccard` (vector only)
|
82
84
|
|
83
85
|
For cosine distance with cube, vectors must be normalized before being stored.
|
84
86
|
|
@@ -114,7 +116,7 @@ end
|
|
114
116
|
For vector, add an approximate index to speed up queries. Create a migration with:
|
115
117
|
|
116
118
|
```ruby
|
117
|
-
class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.
|
119
|
+
class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.2]
|
118
120
|
def change
|
119
121
|
add_index :items, :embedding, using: :hnsw, opclass: :vector_l2_ops
|
120
122
|
# or
|
@@ -137,9 +139,91 @@ Or the number of probes with IVFFlat
|
|
137
139
|
Item.connection.execute("SET ivfflat.probes = 3")
|
138
140
|
```
|
139
141
|
|
142
|
+
## Half-Precision Vectors
|
143
|
+
|
144
|
+
Use the `halfvec` type to store half-precision vectors
|
145
|
+
|
146
|
+
```ruby
|
147
|
+
class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
|
148
|
+
def change
|
149
|
+
add_column :items, :embedding, :halfvec, limit: 3 # dimensions
|
150
|
+
end
|
151
|
+
end
|
152
|
+
```
|
153
|
+
|
154
|
+
## Half-Precision Indexing
|
155
|
+
|
156
|
+
Index vectors at half precision for smaller indexes
|
157
|
+
|
158
|
+
```ruby
|
159
|
+
class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.2]
|
160
|
+
def change
|
161
|
+
add_index :items, "(embedding::halfvec(3)) vector_l2_ops", using: :hnsw
|
162
|
+
end
|
163
|
+
end
|
164
|
+
```
|
165
|
+
|
166
|
+
Get the nearest neighbors
|
167
|
+
|
168
|
+
```ruby
|
169
|
+
Item.nearest_neighbors(:embedding, [0.9, 1.3, 1.1], distance: "euclidean", precision: "half").first(5)
|
170
|
+
```
|
171
|
+
|
172
|
+
## Binary Vectors
|
173
|
+
|
174
|
+
Use the `bit` type to store binary vectors
|
175
|
+
|
176
|
+
```ruby
|
177
|
+
class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
|
178
|
+
def change
|
179
|
+
add_column :items, :embedding, :bit, limit: 3 # dimensions
|
180
|
+
end
|
181
|
+
end
|
182
|
+
```
|
183
|
+
|
184
|
+
Get the nearest neighbors by Hamming distance
|
185
|
+
|
186
|
+
```ruby
|
187
|
+
Item.nearest_neighbors(:embedding, "101", distance: "hamming").first(5)
|
188
|
+
```
|
189
|
+
|
190
|
+
## Binary Quantization
|
191
|
+
|
192
|
+
Use expression indexing for binary quantization
|
193
|
+
|
194
|
+
```ruby
|
195
|
+
class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.2]
|
196
|
+
def change
|
197
|
+
add_index :items, "(binary_quantize(embedding)::bit(3)) bit_hamming_ops", using: :hnsw
|
198
|
+
end
|
199
|
+
end
|
200
|
+
```
|
201
|
+
|
202
|
+
## Sparse Vectors
|
203
|
+
|
204
|
+
Use the `sparsevec` type to store sparse vectors
|
205
|
+
|
206
|
+
```ruby
|
207
|
+
class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
|
208
|
+
def change
|
209
|
+
add_column :items, :embedding, :sparsevec, limit: 3 # dimensions
|
210
|
+
end
|
211
|
+
end
|
212
|
+
```
|
213
|
+
|
214
|
+
Get the nearest neighbors
|
215
|
+
|
216
|
+
```ruby
|
217
|
+
embedding = Neighbor::SparseVector.new({0 => 0.9, 1 => 1.3, 2 => 1.1}, 3)
|
218
|
+
Item.nearest_neighbors(:embedding, embedding, distance: "euclidean").first(5)
|
219
|
+
```
|
220
|
+
|
140
221
|
## Examples
|
141
222
|
|
142
223
|
- [OpenAI Embeddings](#openai-embeddings)
|
224
|
+
- [Cohere Embeddings](#cohere-embeddings)
|
225
|
+
- [Sentence Embeddings](#sentence-embeddings)
|
226
|
+
- [Sparse Embeddings](#sparse-embeddings)
|
143
227
|
- [Disco Recommendations](#disco-recommendations)
|
144
228
|
|
145
229
|
### OpenAI Embeddings
|
@@ -170,10 +254,10 @@ def fetch_embeddings(input)
|
|
170
254
|
}
|
171
255
|
data = {
|
172
256
|
input: input,
|
173
|
-
model: "text-embedding-
|
257
|
+
model: "text-embedding-3-small"
|
174
258
|
}
|
175
259
|
|
176
|
-
response = Net::HTTP.post(URI(url), data.to_json, headers)
|
260
|
+
response = Net::HTTP.post(URI(url), data.to_json, headers).tap(&:value)
|
177
261
|
JSON.parse(response.body)["data"].map { |v| v["embedding"] }
|
178
262
|
end
|
179
263
|
```
|
@@ -199,14 +283,221 @@ end
|
|
199
283
|
Document.insert_all!(documents)
|
200
284
|
```
|
201
285
|
|
202
|
-
And get similar
|
286
|
+
And get similar documents
|
203
287
|
|
204
288
|
```ruby
|
205
289
|
document = Document.first
|
206
290
|
document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:content)
|
207
291
|
```
|
208
292
|
|
209
|
-
See the [complete code](examples/
|
293
|
+
See the [complete code](examples/openai/example.rb)
|
294
|
+
|
295
|
+
### Cohere Embeddings
|
296
|
+
|
297
|
+
Generate a model
|
298
|
+
|
299
|
+
```sh
|
300
|
+
rails generate model Document content:text embedding:bit{1024}
|
301
|
+
rails db:migrate
|
302
|
+
```
|
303
|
+
|
304
|
+
And add `has_neighbors`
|
305
|
+
|
306
|
+
```ruby
|
307
|
+
class Document < ApplicationRecord
|
308
|
+
has_neighbors :embedding
|
309
|
+
end
|
310
|
+
```
|
311
|
+
|
312
|
+
Create a method to call the [embed API](https://docs.cohere.com/reference/embed)
|
313
|
+
|
314
|
+
```ruby
|
315
|
+
def fetch_embeddings(input, input_type)
|
316
|
+
url = "https://api.cohere.com/v1/embed"
|
317
|
+
headers = {
|
318
|
+
"Authorization" => "Bearer #{ENV.fetch("CO_API_KEY")}",
|
319
|
+
"Content-Type" => "application/json"
|
320
|
+
}
|
321
|
+
data = {
|
322
|
+
texts: input,
|
323
|
+
model: "embed-english-v3.0",
|
324
|
+
input_type: input_type,
|
325
|
+
embedding_types: ["ubinary"]
|
326
|
+
}
|
327
|
+
|
328
|
+
response = Net::HTTP.post(URI(url), data.to_json, headers).tap(&:value)
|
329
|
+
JSON.parse(response.body)["embeddings"]["ubinary"].map { |e| e.map { |v| v.chr.unpack1("B*") }.join }
|
330
|
+
end
|
331
|
+
```
|
332
|
+
|
333
|
+
Pass your input
|
334
|
+
|
335
|
+
```ruby
|
336
|
+
input = [
|
337
|
+
"The dog is barking",
|
338
|
+
"The cat is purring",
|
339
|
+
"The bear is growling"
|
340
|
+
]
|
341
|
+
embeddings = fetch_embeddings(input, "search_document")
|
342
|
+
```
|
343
|
+
|
344
|
+
Store the embeddings
|
345
|
+
|
346
|
+
```ruby
|
347
|
+
documents = []
|
348
|
+
input.zip(embeddings) do |content, embedding|
|
349
|
+
documents << {content: content, embedding: embedding}
|
350
|
+
end
|
351
|
+
Document.insert_all!(documents)
|
352
|
+
```
|
353
|
+
|
354
|
+
Embed the search query
|
355
|
+
|
356
|
+
```ruby
|
357
|
+
query = "forest"
|
358
|
+
query_embedding = fetch_embeddings([query], "search_query")[0]
|
359
|
+
```
|
360
|
+
|
361
|
+
And search the documents
|
362
|
+
|
363
|
+
```ruby
|
364
|
+
Document.nearest_neighbors(:embedding, query_embedding, distance: "hamming").first(5).map(&:content)
|
365
|
+
```
|
366
|
+
|
367
|
+
See the [complete code](examples/cohere/example.rb)
|
368
|
+
|
369
|
+
### Sentence Embeddings
|
370
|
+
|
371
|
+
You can generate embeddings locally with [Informers](https://github.com/ankane/informers).
|
372
|
+
|
373
|
+
Generate a model
|
374
|
+
|
375
|
+
```sh
|
376
|
+
rails generate model Document content:text embedding:vector{384}
|
377
|
+
rails db:migrate
|
378
|
+
```
|
379
|
+
|
380
|
+
And add `has_neighbors`
|
381
|
+
|
382
|
+
```ruby
|
383
|
+
class Document < ApplicationRecord
|
384
|
+
has_neighbors :embedding
|
385
|
+
end
|
386
|
+
```
|
387
|
+
|
388
|
+
Load a [model](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
|
389
|
+
|
390
|
+
```ruby
|
391
|
+
model = Informers::Model.new("sentence-transformers/all-MiniLM-L6-v2")
|
392
|
+
```
|
393
|
+
|
394
|
+
Pass your input
|
395
|
+
|
396
|
+
```ruby
|
397
|
+
input = [
|
398
|
+
"The dog is barking",
|
399
|
+
"The cat is purring",
|
400
|
+
"The bear is growling"
|
401
|
+
]
|
402
|
+
embeddings = model.embed(input)
|
403
|
+
```
|
404
|
+
|
405
|
+
Store the embeddings
|
406
|
+
|
407
|
+
```ruby
|
408
|
+
documents = []
|
409
|
+
input.zip(embeddings) do |content, embedding|
|
410
|
+
documents << {content: content, embedding: embedding}
|
411
|
+
end
|
412
|
+
Document.insert_all!(documents)
|
413
|
+
```
|
414
|
+
|
415
|
+
And get similar documents
|
416
|
+
|
417
|
+
```ruby
|
418
|
+
document = Document.first
|
419
|
+
document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:content)
|
420
|
+
```
|
421
|
+
|
422
|
+
See the [complete code](examples/informers/example.rb)
|
423
|
+
|
424
|
+
### Sparse Embeddings
|
425
|
+
|
426
|
+
You can generate sparse embeddings locally with [Transformers.rb](https://github.com/ankane/transformers-ruby).
|
427
|
+
|
428
|
+
Generate a model
|
429
|
+
|
430
|
+
```sh
|
431
|
+
rails generate model Document content:text embedding:sparsevec{30522}
|
432
|
+
rails db:migrate
|
433
|
+
```
|
434
|
+
|
435
|
+
And add `has_neighbors`
|
436
|
+
|
437
|
+
```ruby
|
438
|
+
class Document < ApplicationRecord
|
439
|
+
has_neighbors :embedding
|
440
|
+
end
|
441
|
+
```
|
442
|
+
|
443
|
+
Load a [model](https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1) to generate embeddings
|
444
|
+
|
445
|
+
```ruby
|
446
|
+
class EmbeddingModel
|
447
|
+
def initialize(model_id)
|
448
|
+
@model = Transformers::AutoModelForMaskedLM.from_pretrained(model_id)
|
449
|
+
@tokenizer = Transformers::AutoTokenizer.from_pretrained(model_id)
|
450
|
+
@special_token_ids = @tokenizer.special_tokens_map.map { |_, token| @tokenizer.vocab[token] }
|
451
|
+
end
|
452
|
+
|
453
|
+
def embed(input)
|
454
|
+
feature = @tokenizer.(input, padding: true, truncation: true, return_tensors: "pt", return_token_type_ids: false)
|
455
|
+
output = @model.(**feature)[0]
|
456
|
+
values = Torch.max(output * feature[:attention_mask].unsqueeze(-1), dim: 1)[0]
|
457
|
+
values = Torch.log(1 + Torch.relu(values))
|
458
|
+
values[0.., @special_token_ids] = 0
|
459
|
+
values.to_a
|
460
|
+
end
|
461
|
+
end
|
462
|
+
|
463
|
+
model = EmbeddingModel.new("opensearch-project/opensearch-neural-sparse-encoding-v1")
|
464
|
+
```
|
465
|
+
|
466
|
+
Pass your input
|
467
|
+
|
468
|
+
```ruby
|
469
|
+
input = [
|
470
|
+
"The dog is barking",
|
471
|
+
"The cat is purring",
|
472
|
+
"The bear is growling"
|
473
|
+
]
|
474
|
+
embeddings = model.embed(input)
|
475
|
+
```
|
476
|
+
|
477
|
+
Store the embeddings
|
478
|
+
|
479
|
+
```ruby
|
480
|
+
documents = []
|
481
|
+
input.zip(embeddings) do |content, embedding|
|
482
|
+
documents << {content: content, embedding: Neighbor::SparseVector.new(embedding)}
|
483
|
+
end
|
484
|
+
Document.insert_all!(documents)
|
485
|
+
```
|
486
|
+
|
487
|
+
Embed the search query
|
488
|
+
|
489
|
+
```ruby
|
490
|
+
query = "forest"
|
491
|
+
query_embedding = model.embed([query])[0]
|
492
|
+
```
|
493
|
+
|
494
|
+
And search the documents
|
495
|
+
|
496
|
+
```ruby
|
497
|
+
Document.nearest_neighbors(:embedding, Neighbor::SparseVector.new(query_embedding), distance: "inner_product").first(5).map(&:content)
|
498
|
+
```
|
499
|
+
|
500
|
+
See the [complete code](examples/sparse/example.rb)
|
210
501
|
|
211
502
|
### Disco Recommendations
|
212
503
|
|
@@ -252,19 +543,7 @@ movie = Movie.find_by(name: "Star Wars (1977)")
|
|
252
543
|
movie.nearest_neighbors(:factors, distance: "cosine").first(5).map(&:name)
|
253
544
|
```
|
254
545
|
|
255
|
-
See the complete code for [cube](examples/
|
256
|
-
|
257
|
-
## Upgrading
|
258
|
-
|
259
|
-
### 0.2.0
|
260
|
-
|
261
|
-
The `distance` option has been moved from `has_neighbors` to `nearest_neighbors`, and there is no longer a default. If you use cosine distance, set:
|
262
|
-
|
263
|
-
```ruby
|
264
|
-
class Item < ApplicationRecord
|
265
|
-
has_neighbors normalize: true
|
266
|
-
end
|
267
|
-
```
|
546
|
+
See the complete code for [cube](examples/disco/item_recs_cube.rb) and [vector](examples/disco/item_recs_vector.rb)
|
268
547
|
|
269
548
|
## History
|
270
549
|
|
data/lib/neighbor/model.rb
CHANGED
@@ -49,7 +49,7 @@ module Neighbor
|
|
49
49
|
# TODO move to normalizes when Active Record < 7.1 no longer supported
|
50
50
|
before_save do
|
51
51
|
self.class.neighbor_attributes.each do |k, v|
|
52
|
-
next unless v[:normalize]
|
52
|
+
next unless v[:normalize] && attribute_changed?(k)
|
53
53
|
value = read_attribute(k)
|
54
54
|
next if value.nil?
|
55
55
|
self[k] = Neighbor::Utils.normalize(value, column_info: self.class.columns_hash[k.to_s])
|
@@ -61,6 +61,7 @@ module Neighbor
|
|
61
61
|
scope :nearest_neighbors, ->(attribute_name, vector, options = nil) {
|
62
62
|
raise ArgumentError, "missing keyword: :distance" unless options.is_a?(Hash) && options.key?(:distance)
|
63
63
|
distance = options.delete(:distance)
|
64
|
+
precision = options.delete(:precision)
|
64
65
|
raise ArgumentError, "unknown keywords: #{options.keys.map(&:inspect).join(", ")}" if options.any?
|
65
66
|
|
66
67
|
attribute_name = attribute_name.to_sym
|
@@ -126,6 +127,18 @@ module Neighbor
|
|
126
127
|
vector = Neighbor::Utils.normalize(vector, column_info: column_info) if normalize
|
127
128
|
|
128
129
|
query = connection.quote(column_attribute.serialize(vector))
|
130
|
+
|
131
|
+
if !precision.nil?
|
132
|
+
case precision.to_s
|
133
|
+
when "half"
|
134
|
+
cast_dimensions = dimensions || column_info&.limit
|
135
|
+
raise ArgumentError, "Unknown dimensions" unless cast_dimensions
|
136
|
+
quoted_attribute += "::halfvec(#{connection.quote(cast_dimensions.to_i)})"
|
137
|
+
else
|
138
|
+
raise ArgumentError, "Invalid precision"
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
129
142
|
order = "#{quoted_attribute} #{operator} #{query}"
|
130
143
|
if operator == "#"
|
131
144
|
order = "bit_count(#{order})"
|
data/lib/neighbor/railtie.rb
CHANGED
data/lib/neighbor/type/cube.rb
CHANGED
@@ -6,7 +6,8 @@ module Neighbor
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def serialize(value)
|
9
|
-
if value.
|
9
|
+
if value.respond_to?(:to_a)
|
10
|
+
value = value.to_a
|
10
11
|
if value.first.is_a?(Array)
|
11
12
|
value = value.map { |v| serialize_point(v) }.join(", ")
|
12
13
|
else
|
@@ -19,8 +20,8 @@ module Neighbor
|
|
19
20
|
private
|
20
21
|
|
21
22
|
def cast_value(value)
|
22
|
-
if value.
|
23
|
-
value
|
23
|
+
if value.respond_to?(:to_a)
|
24
|
+
value.to_a
|
24
25
|
elsif value.is_a?(Numeric)
|
25
26
|
[value]
|
26
27
|
elsif value.is_a?(String)
|
@@ -6,8 +6,8 @@ module Neighbor
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def serialize(value)
|
9
|
-
if value.
|
10
|
-
value = "[#{value.map(&:to_f).join(",")}]"
|
9
|
+
if value.respond_to?(:to_a)
|
10
|
+
value = "[#{value.to_a.map(&:to_f).join(",")}]"
|
11
11
|
end
|
12
12
|
super(value)
|
13
13
|
end
|
@@ -17,8 +17,8 @@ module Neighbor
|
|
17
17
|
def cast_value(value)
|
18
18
|
if value.is_a?(String)
|
19
19
|
value[1..-1].split(",").map(&:to_f)
|
20
|
-
elsif value.
|
21
|
-
value
|
20
|
+
elsif value.respond_to?(:to_a)
|
21
|
+
value.to_a
|
22
22
|
else
|
23
23
|
raise "can't cast #{value.class.name} to halfvec"
|
24
24
|
end
|
@@ -19,8 +19,8 @@ module Neighbor
|
|
19
19
|
value
|
20
20
|
elsif value.is_a?(String)
|
21
21
|
SparseVector.from_text(value)
|
22
|
-
elsif value.
|
23
|
-
value = SparseVector.new(value)
|
22
|
+
elsif value.respond_to?(:to_a)
|
23
|
+
value = SparseVector.new(value.to_a)
|
24
24
|
else
|
25
25
|
raise "can't cast #{value.class.name} to sparsevec"
|
26
26
|
end
|
data/lib/neighbor/type/vector.rb
CHANGED
@@ -6,8 +6,8 @@ module Neighbor
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def serialize(value)
|
9
|
-
if value.
|
10
|
-
value = "[#{value.map(&:to_f).join(",")}]"
|
9
|
+
if value.respond_to?(:to_a)
|
10
|
+
value = "[#{value.to_a.map(&:to_f).join(",")}]"
|
11
11
|
end
|
12
12
|
super(value)
|
13
13
|
end
|
@@ -17,8 +17,8 @@ module Neighbor
|
|
17
17
|
def cast_value(value)
|
18
18
|
if value.is_a?(String)
|
19
19
|
value[1..-1].split(",").map(&:to_f)
|
20
|
-
elsif value.
|
21
|
-
value
|
20
|
+
elsif value.respond_to?(:to_a)
|
21
|
+
value.to_a
|
22
22
|
else
|
23
23
|
raise "can't cast #{value.class.name} to vector"
|
24
24
|
end
|
data/lib/neighbor/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: neighbor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-08-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|