neighbor 0.4.0 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +299 -20
- data/lib/neighbor/model.rb +14 -1
- data/lib/neighbor/railtie.rb +1 -1
- data/lib/neighbor/type/cube.rb +4 -3
- data/lib/neighbor/type/halfvec.rb +4 -4
- data/lib/neighbor/type/sparsevec.rb +2 -2
- data/lib/neighbor/type/vector.rb +4 -4
- data/lib/neighbor/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8aa6de2790d94de9411b0142836b2ad181a411e299fce4b98357b96ac4161183
|
4
|
+
data.tar.gz: 2924d7f15f5b36bc89ee72372c1bfeb373d99481269696a9a9dcc41f90201f38
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2bc1b3ee6d5b1ee0ab175b017e753cf958bd8ceb1ef2a23ba769770dfebf54eec251ac59c8f5f3b6ca56efcbad1763c34622b94924a017622c2f78fc8740f762
|
7
|
+
data.tar.gz: d946dda99833964582f63863b2d898fea6bf065312cf60aec873631df96195e1a54375606ad9c9cc0f767937cdb7ea38b0d9990efcbbeab15ccbb11f8a2020ef
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -35,7 +35,7 @@ rails db:migrate
|
|
35
35
|
Create a migration
|
36
36
|
|
37
37
|
```ruby
|
38
|
-
class AddEmbeddingToItems < ActiveRecord::Migration[7.
|
38
|
+
class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
|
39
39
|
def change
|
40
40
|
add_column :items, :embedding, :cube
|
41
41
|
# or
|
@@ -76,9 +76,11 @@ Supported values are:
|
|
76
76
|
|
77
77
|
- `euclidean`
|
78
78
|
- `cosine`
|
79
|
-
- `taxicab`
|
79
|
+
- `taxicab`
|
80
80
|
- `chebyshev` (cube only)
|
81
81
|
- `inner_product` (vector only)
|
82
|
+
- `hamming` (vector only)
|
83
|
+
- `jaccard` (vector only)
|
82
84
|
|
83
85
|
For cosine distance with cube, vectors must be normalized before being stored.
|
84
86
|
|
@@ -114,7 +116,7 @@ end
|
|
114
116
|
For vector, add an approximate index to speed up queries. Create a migration with:
|
115
117
|
|
116
118
|
```ruby
|
117
|
-
class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.
|
119
|
+
class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.2]
|
118
120
|
def change
|
119
121
|
add_index :items, :embedding, using: :hnsw, opclass: :vector_l2_ops
|
120
122
|
# or
|
@@ -137,9 +139,91 @@ Or the number of probes with IVFFlat
|
|
137
139
|
Item.connection.execute("SET ivfflat.probes = 3")
|
138
140
|
```
|
139
141
|
|
142
|
+
## Half-Precision Vectors
|
143
|
+
|
144
|
+
Use the `halfvec` type to store half-precision vectors
|
145
|
+
|
146
|
+
```ruby
|
147
|
+
class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
|
148
|
+
def change
|
149
|
+
add_column :items, :embedding, :halfvec, limit: 3 # dimensions
|
150
|
+
end
|
151
|
+
end
|
152
|
+
```
|
153
|
+
|
154
|
+
## Half-Precision Indexing
|
155
|
+
|
156
|
+
Index vectors at half precision for smaller indexes
|
157
|
+
|
158
|
+
```ruby
|
159
|
+
class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.2]
|
160
|
+
def change
|
161
|
+
add_index :items, "(embedding::halfvec(3)) vector_l2_ops", using: :hnsw
|
162
|
+
end
|
163
|
+
end
|
164
|
+
```
|
165
|
+
|
166
|
+
Get the nearest neighbors
|
167
|
+
|
168
|
+
```ruby
|
169
|
+
Item.nearest_neighbors(:embedding, [0.9, 1.3, 1.1], distance: "euclidean", precision: "half").first(5)
|
170
|
+
```
|
171
|
+
|
172
|
+
## Binary Vectors
|
173
|
+
|
174
|
+
Use the `bit` type to store binary vectors
|
175
|
+
|
176
|
+
```ruby
|
177
|
+
class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
|
178
|
+
def change
|
179
|
+
add_column :items, :embedding, :bit, limit: 3 # dimensions
|
180
|
+
end
|
181
|
+
end
|
182
|
+
```
|
183
|
+
|
184
|
+
Get the nearest neighbors by Hamming distance
|
185
|
+
|
186
|
+
```ruby
|
187
|
+
Item.nearest_neighbors(:embedding, "101", distance: "hamming").first(5)
|
188
|
+
```
|
189
|
+
|
190
|
+
## Binary Quantization
|
191
|
+
|
192
|
+
Use expression indexing for binary quantization
|
193
|
+
|
194
|
+
```ruby
|
195
|
+
class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.2]
|
196
|
+
def change
|
197
|
+
add_index :items, "(binary_quantize(embedding)::bit(3)) bit_hamming_ops", using: :hnsw
|
198
|
+
end
|
199
|
+
end
|
200
|
+
```
|
201
|
+
|
202
|
+
## Sparse Vectors
|
203
|
+
|
204
|
+
Use the `sparsevec` type to store sparse vectors
|
205
|
+
|
206
|
+
```ruby
|
207
|
+
class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
|
208
|
+
def change
|
209
|
+
add_column :items, :embedding, :sparsevec, limit: 3 # dimensions
|
210
|
+
end
|
211
|
+
end
|
212
|
+
```
|
213
|
+
|
214
|
+
Get the nearest neighbors
|
215
|
+
|
216
|
+
```ruby
|
217
|
+
embedding = Neighbor::SparseVector.new({0 => 0.9, 1 => 1.3, 2 => 1.1}, 3)
|
218
|
+
Item.nearest_neighbors(:embedding, embedding, distance: "euclidean").first(5)
|
219
|
+
```
|
220
|
+
|
140
221
|
## Examples
|
141
222
|
|
142
223
|
- [OpenAI Embeddings](#openai-embeddings)
|
224
|
+
- [Cohere Embeddings](#cohere-embeddings)
|
225
|
+
- [Sentence Embeddings](#sentence-embeddings)
|
226
|
+
- [Sparse Embeddings](#sparse-embeddings)
|
143
227
|
- [Disco Recommendations](#disco-recommendations)
|
144
228
|
|
145
229
|
### OpenAI Embeddings
|
@@ -170,10 +254,10 @@ def fetch_embeddings(input)
|
|
170
254
|
}
|
171
255
|
data = {
|
172
256
|
input: input,
|
173
|
-
model: "text-embedding-
|
257
|
+
model: "text-embedding-3-small"
|
174
258
|
}
|
175
259
|
|
176
|
-
response = Net::HTTP.post(URI(url), data.to_json, headers)
|
260
|
+
response = Net::HTTP.post(URI(url), data.to_json, headers).tap(&:value)
|
177
261
|
JSON.parse(response.body)["data"].map { |v| v["embedding"] }
|
178
262
|
end
|
179
263
|
```
|
@@ -199,14 +283,221 @@ end
|
|
199
283
|
Document.insert_all!(documents)
|
200
284
|
```
|
201
285
|
|
202
|
-
And get similar
|
286
|
+
And get similar documents
|
203
287
|
|
204
288
|
```ruby
|
205
289
|
document = Document.first
|
206
290
|
document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:content)
|
207
291
|
```
|
208
292
|
|
209
|
-
See the [complete code](examples/
|
293
|
+
See the [complete code](examples/openai/example.rb)
|
294
|
+
|
295
|
+
### Cohere Embeddings
|
296
|
+
|
297
|
+
Generate a model
|
298
|
+
|
299
|
+
```sh
|
300
|
+
rails generate model Document content:text embedding:bit{1024}
|
301
|
+
rails db:migrate
|
302
|
+
```
|
303
|
+
|
304
|
+
And add `has_neighbors`
|
305
|
+
|
306
|
+
```ruby
|
307
|
+
class Document < ApplicationRecord
|
308
|
+
has_neighbors :embedding
|
309
|
+
end
|
310
|
+
```
|
311
|
+
|
312
|
+
Create a method to call the [embed API](https://docs.cohere.com/reference/embed)
|
313
|
+
|
314
|
+
```ruby
|
315
|
+
def fetch_embeddings(input, input_type)
|
316
|
+
url = "https://api.cohere.com/v1/embed"
|
317
|
+
headers = {
|
318
|
+
"Authorization" => "Bearer #{ENV.fetch("CO_API_KEY")}",
|
319
|
+
"Content-Type" => "application/json"
|
320
|
+
}
|
321
|
+
data = {
|
322
|
+
texts: input,
|
323
|
+
model: "embed-english-v3.0",
|
324
|
+
input_type: input_type,
|
325
|
+
embedding_types: ["ubinary"]
|
326
|
+
}
|
327
|
+
|
328
|
+
response = Net::HTTP.post(URI(url), data.to_json, headers).tap(&:value)
|
329
|
+
JSON.parse(response.body)["embeddings"]["ubinary"].map { |e| e.map { |v| v.chr.unpack1("B*") }.join }
|
330
|
+
end
|
331
|
+
```
|
332
|
+
|
333
|
+
Pass your input
|
334
|
+
|
335
|
+
```ruby
|
336
|
+
input = [
|
337
|
+
"The dog is barking",
|
338
|
+
"The cat is purring",
|
339
|
+
"The bear is growling"
|
340
|
+
]
|
341
|
+
embeddings = fetch_embeddings(input, "search_document")
|
342
|
+
```
|
343
|
+
|
344
|
+
Store the embeddings
|
345
|
+
|
346
|
+
```ruby
|
347
|
+
documents = []
|
348
|
+
input.zip(embeddings) do |content, embedding|
|
349
|
+
documents << {content: content, embedding: embedding}
|
350
|
+
end
|
351
|
+
Document.insert_all!(documents)
|
352
|
+
```
|
353
|
+
|
354
|
+
Embed the search query
|
355
|
+
|
356
|
+
```ruby
|
357
|
+
query = "forest"
|
358
|
+
query_embedding = fetch_embeddings([query], "search_query")[0]
|
359
|
+
```
|
360
|
+
|
361
|
+
And search the documents
|
362
|
+
|
363
|
+
```ruby
|
364
|
+
Document.nearest_neighbors(:embedding, query_embedding, distance: "hamming").first(5).map(&:content)
|
365
|
+
```
|
366
|
+
|
367
|
+
See the [complete code](examples/cohere/example.rb)
|
368
|
+
|
369
|
+
### Sentence Embeddings
|
370
|
+
|
371
|
+
You can generate embeddings locally with [Informers](https://github.com/ankane/informers).
|
372
|
+
|
373
|
+
Generate a model
|
374
|
+
|
375
|
+
```sh
|
376
|
+
rails generate model Document content:text embedding:vector{384}
|
377
|
+
rails db:migrate
|
378
|
+
```
|
379
|
+
|
380
|
+
And add `has_neighbors`
|
381
|
+
|
382
|
+
```ruby
|
383
|
+
class Document < ApplicationRecord
|
384
|
+
has_neighbors :embedding
|
385
|
+
end
|
386
|
+
```
|
387
|
+
|
388
|
+
Load a [model](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
|
389
|
+
|
390
|
+
```ruby
|
391
|
+
model = Informers::Model.new("sentence-transformers/all-MiniLM-L6-v2")
|
392
|
+
```
|
393
|
+
|
394
|
+
Pass your input
|
395
|
+
|
396
|
+
```ruby
|
397
|
+
input = [
|
398
|
+
"The dog is barking",
|
399
|
+
"The cat is purring",
|
400
|
+
"The bear is growling"
|
401
|
+
]
|
402
|
+
embeddings = model.embed(input)
|
403
|
+
```
|
404
|
+
|
405
|
+
Store the embeddings
|
406
|
+
|
407
|
+
```ruby
|
408
|
+
documents = []
|
409
|
+
input.zip(embeddings) do |content, embedding|
|
410
|
+
documents << {content: content, embedding: embedding}
|
411
|
+
end
|
412
|
+
Document.insert_all!(documents)
|
413
|
+
```
|
414
|
+
|
415
|
+
And get similar documents
|
416
|
+
|
417
|
+
```ruby
|
418
|
+
document = Document.first
|
419
|
+
document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:content)
|
420
|
+
```
|
421
|
+
|
422
|
+
See the [complete code](examples/informers/example.rb)
|
423
|
+
|
424
|
+
### Sparse Embeddings
|
425
|
+
|
426
|
+
You can generate sparse embeddings locally with [Transformers.rb](https://github.com/ankane/transformers-ruby).
|
427
|
+
|
428
|
+
Generate a model
|
429
|
+
|
430
|
+
```sh
|
431
|
+
rails generate model Document content:text embedding:sparsevec{30522}
|
432
|
+
rails db:migrate
|
433
|
+
```
|
434
|
+
|
435
|
+
And add `has_neighbors`
|
436
|
+
|
437
|
+
```ruby
|
438
|
+
class Document < ApplicationRecord
|
439
|
+
has_neighbors :embedding
|
440
|
+
end
|
441
|
+
```
|
442
|
+
|
443
|
+
Load a [model](https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1) to generate embeddings
|
444
|
+
|
445
|
+
```ruby
|
446
|
+
class EmbeddingModel
|
447
|
+
def initialize(model_id)
|
448
|
+
@model = Transformers::AutoModelForMaskedLM.from_pretrained(model_id)
|
449
|
+
@tokenizer = Transformers::AutoTokenizer.from_pretrained(model_id)
|
450
|
+
@special_token_ids = @tokenizer.special_tokens_map.map { |_, token| @tokenizer.vocab[token] }
|
451
|
+
end
|
452
|
+
|
453
|
+
def embed(input)
|
454
|
+
feature = @tokenizer.(input, padding: true, truncation: true, return_tensors: "pt", return_token_type_ids: false)
|
455
|
+
output = @model.(**feature)[0]
|
456
|
+
values = Torch.max(output * feature[:attention_mask].unsqueeze(-1), dim: 1)[0]
|
457
|
+
values = Torch.log(1 + Torch.relu(values))
|
458
|
+
values[0.., @special_token_ids] = 0
|
459
|
+
values.to_a
|
460
|
+
end
|
461
|
+
end
|
462
|
+
|
463
|
+
model = EmbeddingModel.new("opensearch-project/opensearch-neural-sparse-encoding-v1")
|
464
|
+
```
|
465
|
+
|
466
|
+
Pass your input
|
467
|
+
|
468
|
+
```ruby
|
469
|
+
input = [
|
470
|
+
"The dog is barking",
|
471
|
+
"The cat is purring",
|
472
|
+
"The bear is growling"
|
473
|
+
]
|
474
|
+
embeddings = model.embed(input)
|
475
|
+
```
|
476
|
+
|
477
|
+
Store the embeddings
|
478
|
+
|
479
|
+
```ruby
|
480
|
+
documents = []
|
481
|
+
input.zip(embeddings) do |content, embedding|
|
482
|
+
documents << {content: content, embedding: Neighbor::SparseVector.new(embedding)}
|
483
|
+
end
|
484
|
+
Document.insert_all!(documents)
|
485
|
+
```
|
486
|
+
|
487
|
+
Embed the search query
|
488
|
+
|
489
|
+
```ruby
|
490
|
+
query = "forest"
|
491
|
+
query_embedding = model.embed([query])[0]
|
492
|
+
```
|
493
|
+
|
494
|
+
And search the documents
|
495
|
+
|
496
|
+
```ruby
|
497
|
+
Document.nearest_neighbors(:embedding, Neighbor::SparseVector.new(query_embedding), distance: "inner_product").first(5).map(&:content)
|
498
|
+
```
|
499
|
+
|
500
|
+
See the [complete code](examples/sparse/example.rb)
|
210
501
|
|
211
502
|
### Disco Recommendations
|
212
503
|
|
@@ -252,19 +543,7 @@ movie = Movie.find_by(name: "Star Wars (1977)")
|
|
252
543
|
movie.nearest_neighbors(:factors, distance: "cosine").first(5).map(&:name)
|
253
544
|
```
|
254
545
|
|
255
|
-
See the complete code for [cube](examples/
|
256
|
-
|
257
|
-
## Upgrading
|
258
|
-
|
259
|
-
### 0.2.0
|
260
|
-
|
261
|
-
The `distance` option has been moved from `has_neighbors` to `nearest_neighbors`, and there is no longer a default. If you use cosine distance, set:
|
262
|
-
|
263
|
-
```ruby
|
264
|
-
class Item < ApplicationRecord
|
265
|
-
has_neighbors normalize: true
|
266
|
-
end
|
267
|
-
```
|
546
|
+
See the complete code for [cube](examples/disco/item_recs_cube.rb) and [vector](examples/disco/item_recs_vector.rb)
|
268
547
|
|
269
548
|
## History
|
270
549
|
|
data/lib/neighbor/model.rb
CHANGED
@@ -49,7 +49,7 @@ module Neighbor
|
|
49
49
|
# TODO move to normalizes when Active Record < 7.1 no longer supported
|
50
50
|
before_save do
|
51
51
|
self.class.neighbor_attributes.each do |k, v|
|
52
|
-
next unless v[:normalize]
|
52
|
+
next unless v[:normalize] && attribute_changed?(k)
|
53
53
|
value = read_attribute(k)
|
54
54
|
next if value.nil?
|
55
55
|
self[k] = Neighbor::Utils.normalize(value, column_info: self.class.columns_hash[k.to_s])
|
@@ -61,6 +61,7 @@ module Neighbor
|
|
61
61
|
scope :nearest_neighbors, ->(attribute_name, vector, options = nil) {
|
62
62
|
raise ArgumentError, "missing keyword: :distance" unless options.is_a?(Hash) && options.key?(:distance)
|
63
63
|
distance = options.delete(:distance)
|
64
|
+
precision = options.delete(:precision)
|
64
65
|
raise ArgumentError, "unknown keywords: #{options.keys.map(&:inspect).join(", ")}" if options.any?
|
65
66
|
|
66
67
|
attribute_name = attribute_name.to_sym
|
@@ -126,6 +127,18 @@ module Neighbor
|
|
126
127
|
vector = Neighbor::Utils.normalize(vector, column_info: column_info) if normalize
|
127
128
|
|
128
129
|
query = connection.quote(column_attribute.serialize(vector))
|
130
|
+
|
131
|
+
if !precision.nil?
|
132
|
+
case precision.to_s
|
133
|
+
when "half"
|
134
|
+
cast_dimensions = dimensions || column_info&.limit
|
135
|
+
raise ArgumentError, "Unknown dimensions" unless cast_dimensions
|
136
|
+
quoted_attribute += "::halfvec(#{connection.quote(cast_dimensions.to_i)})"
|
137
|
+
else
|
138
|
+
raise ArgumentError, "Invalid precision"
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
129
142
|
order = "#{quoted_attribute} #{operator} #{query}"
|
130
143
|
if operator == "#"
|
131
144
|
order = "bit_count(#{order})"
|
data/lib/neighbor/railtie.rb
CHANGED
data/lib/neighbor/type/cube.rb
CHANGED
@@ -6,7 +6,8 @@ module Neighbor
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def serialize(value)
|
9
|
-
if value.
|
9
|
+
if value.respond_to?(:to_a)
|
10
|
+
value = value.to_a
|
10
11
|
if value.first.is_a?(Array)
|
11
12
|
value = value.map { |v| serialize_point(v) }.join(", ")
|
12
13
|
else
|
@@ -19,8 +20,8 @@ module Neighbor
|
|
19
20
|
private
|
20
21
|
|
21
22
|
def cast_value(value)
|
22
|
-
if value.
|
23
|
-
value
|
23
|
+
if value.respond_to?(:to_a)
|
24
|
+
value.to_a
|
24
25
|
elsif value.is_a?(Numeric)
|
25
26
|
[value]
|
26
27
|
elsif value.is_a?(String)
|
@@ -6,8 +6,8 @@ module Neighbor
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def serialize(value)
|
9
|
-
if value.
|
10
|
-
value = "[#{value.map(&:to_f).join(",")}]"
|
9
|
+
if value.respond_to?(:to_a)
|
10
|
+
value = "[#{value.to_a.map(&:to_f).join(",")}]"
|
11
11
|
end
|
12
12
|
super(value)
|
13
13
|
end
|
@@ -17,8 +17,8 @@ module Neighbor
|
|
17
17
|
def cast_value(value)
|
18
18
|
if value.is_a?(String)
|
19
19
|
value[1..-1].split(",").map(&:to_f)
|
20
|
-
elsif value.
|
21
|
-
value
|
20
|
+
elsif value.respond_to?(:to_a)
|
21
|
+
value.to_a
|
22
22
|
else
|
23
23
|
raise "can't cast #{value.class.name} to halfvec"
|
24
24
|
end
|
@@ -19,8 +19,8 @@ module Neighbor
|
|
19
19
|
value
|
20
20
|
elsif value.is_a?(String)
|
21
21
|
SparseVector.from_text(value)
|
22
|
-
elsif value.
|
23
|
-
value = SparseVector.new(value)
|
22
|
+
elsif value.respond_to?(:to_a)
|
23
|
+
value = SparseVector.new(value.to_a)
|
24
24
|
else
|
25
25
|
raise "can't cast #{value.class.name} to sparsevec"
|
26
26
|
end
|
data/lib/neighbor/type/vector.rb
CHANGED
@@ -6,8 +6,8 @@ module Neighbor
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def serialize(value)
|
9
|
-
if value.
|
10
|
-
value = "[#{value.map(&:to_f).join(",")}]"
|
9
|
+
if value.respond_to?(:to_a)
|
10
|
+
value = "[#{value.to_a.map(&:to_f).join(",")}]"
|
11
11
|
end
|
12
12
|
super(value)
|
13
13
|
end
|
@@ -17,8 +17,8 @@ module Neighbor
|
|
17
17
|
def cast_value(value)
|
18
18
|
if value.is_a?(String)
|
19
19
|
value[1..-1].split(",").map(&:to_f)
|
20
|
-
elsif value.
|
21
|
-
value
|
20
|
+
elsif value.respond_to?(:to_a)
|
21
|
+
value.to_a
|
22
22
|
else
|
23
23
|
raise "can't cast #{value.class.name} to vector"
|
24
24
|
end
|
data/lib/neighbor/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: neighbor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-08-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|