neighbor 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +89 -9
- data/lib/neighbor/reranking.rb +27 -0
- data/lib/neighbor/version.rb +1 -1
- data/lib/neighbor.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e8d611fd277cd48d309b2a087fdeb22f39f43d8ef81fcab57763bd5b4b2e48b3
|
4
|
+
data.tar.gz: fe0a5f7e4aa1ebd81f8c5849be67dc0f3c948d53d313b80259def04b9b7e9e84
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0d9d9d0be9f2929f1eab7e5df52a0606ca8c220ea74e6ef349c2ab77e5da44bd4b98ed4d2271345b2d679882f1790ac54b8373c13a69f75607935322bdb68754
|
7
|
+
data.tar.gz: 834c7d6e26be9b6d8fc262280048e5104cd022587fed30d030f6c0edd849966a49a08bc793f1b49764b1fc5afc014c1463dd8acacfb5739bc507f4a77d3281a1
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -243,11 +243,12 @@ Item.nearest_neighbors(:embedding, embedding, distance: "euclidean").first(5)
|
|
243
243
|
|
244
244
|
## Examples
|
245
245
|
|
246
|
-
- [
|
247
|
-
- [
|
248
|
-
- [Sentence
|
249
|
-
- [
|
250
|
-
- [
|
246
|
+
- [Embeddings](#openai-embeddings) with OpenAI
|
247
|
+
- [Binary embeddings](#cohere-embeddings) with Cohere
|
248
|
+
- [Sentence embeddings](#sentence-embeddings) with Informers
|
249
|
+
- [Hybrid search](#hybrid-search) with Informers
|
250
|
+
- [Sparse search](#sparse-search) with Transformers.rb
|
251
|
+
- [Recommendations](#disco-recommendations) with Disco
|
251
252
|
|
252
253
|
### OpenAI Embeddings
|
253
254
|
|
@@ -411,7 +412,7 @@ end
|
|
411
412
|
Load a [model](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
|
412
413
|
|
413
414
|
```ruby
|
414
|
-
model = Informers
|
415
|
+
model = Informers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")
|
415
416
|
```
|
416
417
|
|
417
418
|
Pass your input
|
@@ -422,7 +423,7 @@ input = [
|
|
422
423
|
"The cat is purring",
|
423
424
|
"The bear is growling"
|
424
425
|
]
|
425
|
-
embeddings = model.
|
426
|
+
embeddings = model.(input)
|
426
427
|
```
|
427
428
|
|
428
429
|
Store the embeddings
|
@@ -444,7 +445,86 @@ document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:conten
|
|
444
445
|
|
445
446
|
See the [complete code](examples/informers/example.rb)
|
446
447
|
|
447
|
-
###
|
448
|
+
### Hybrid Search
|
449
|
+
|
450
|
+
You can use Neighbor for hybrid search with [Informers](https://github.com/ankane/informers).
|
451
|
+
|
452
|
+
Generate a model
|
453
|
+
|
454
|
+
```sh
|
455
|
+
rails generate model Document content:text embedding:vector{768}
|
456
|
+
rails db:migrate
|
457
|
+
```
|
458
|
+
|
459
|
+
And add `has_neighbors` and a scope for keyword search
|
460
|
+
|
461
|
+
```ruby
|
462
|
+
class Document < ApplicationRecord
|
463
|
+
has_neighbors :embedding
|
464
|
+
|
465
|
+
scope :search, ->(query) {
|
466
|
+
where("to_tsvector(content) @@ plainto_tsquery(?)", query)
|
467
|
+
.order(Arel.sql("ts_rank_cd(to_tsvector(content), plainto_tsquery(?)) DESC", query))
|
468
|
+
}
|
469
|
+
end
|
470
|
+
```
|
471
|
+
|
472
|
+
Create some documents
|
473
|
+
|
474
|
+
```ruby
|
475
|
+
texts = [
|
476
|
+
"The dog is barking",
|
477
|
+
"The cat is purring",
|
478
|
+
"The bear is growling"
|
479
|
+
]
|
480
|
+
documents = Document.create!(texts.map { |v| {content: v} })
|
481
|
+
```
|
482
|
+
|
483
|
+
Generate an embedding for each document
|
484
|
+
|
485
|
+
```ruby
|
486
|
+
embed = Informers.pipeline("embedding", "Snowflake/snowflake-arctic-embed-m-v1.5")
|
487
|
+
embed_options = {model_output: "sentence_embedding", pooling: "none"} # specific to embedding model
|
488
|
+
embeddings = embed.(documents.map(&:content), **embed_options)
|
489
|
+
|
490
|
+
documents.zip(embeddings) do |document, embedding|
|
491
|
+
document.update!(embedding: embedding)
|
492
|
+
end
|
493
|
+
```
|
494
|
+
|
495
|
+
Perform keyword search
|
496
|
+
|
497
|
+
```ruby
|
498
|
+
query = "growling bear"
|
499
|
+
keyword_results = Document.search(query).limit(20).load_async
|
500
|
+
```
|
501
|
+
|
502
|
+
And semantic search in parallel (the query prefix is specific to the [embedding model](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5))
|
503
|
+
|
504
|
+
```ruby
|
505
|
+
query_prefix = "Represent this sentence for searching relevant passages: "
|
506
|
+
query_embedding = embed.(query_prefix + query, **embed_options)
|
507
|
+
semantic_results =
|
508
|
+
Document.nearest_neighbors(:embedding, query_embedding, distance: "cosine").limit(20).load_async
|
509
|
+
```
|
510
|
+
|
511
|
+
To combine the results, use Reciprocal Rank Fusion (RRF)
|
512
|
+
|
513
|
+
```ruby
|
514
|
+
Neighbor::Reranking.rrf(keyword_results, semantic_results)
|
515
|
+
```
|
516
|
+
|
517
|
+
Or a reranking model
|
518
|
+
|
519
|
+
```ruby
|
520
|
+
rerank = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-xsmall-v1")
|
521
|
+
results = (keyword_results + semantic_results).uniq
|
522
|
+
rerank.(query, results.map(&:content), top_k: 5).map { |v| results[v[:doc_id]] }
|
523
|
+
```
|
524
|
+
|
525
|
+
See the [complete code](examples/hybrid/example.rb)
|
526
|
+
|
527
|
+
### Sparse Search
|
448
528
|
|
449
529
|
You can generate sparse embeddings locally with [Transformers.rb](https://github.com/ankane/transformers-ruby).
|
450
530
|
|
@@ -556,7 +636,7 @@ movies = []
|
|
556
636
|
recommender.item_ids.each do |item_id|
|
557
637
|
movies << {name: item_id, factors: recommender.item_factors(item_id)}
|
558
638
|
end
|
559
|
-
Movie.
|
639
|
+
Movie.create!(movies)
|
560
640
|
```
|
561
641
|
|
562
642
|
And get similar movies
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Neighbor
  # Helpers for merging several ranked result lists into a single ranking.
  module Reranking
    # Fuses rankings with Reciprocal Rank Fusion (RRF).
    #
    # Every ranking contributes 1.0 / (k + rank) to an item's score, where
    # rank is the item's 1-based position in that ranking. A ranking that does
    # not contain the item contributes 0.0. Items are matched across rankings
    # by Hash key equality (eql?/hash).
    #
    # @param first_ranking [Enumerable] the first ranked list, best match first
    # @param rankings [Array<Enumerable>] any number of additional ranked lists
    # @param k [Numeric] constant added to each rank before taking the reciprocal
    # @return [Array<Hash>] one {result:, score:} hash per distinct item, sorted
    #   by descending score; items with equal scores keep the order in which
    #   they first appear across the rankings
    def self.rrf(first_ranking, *rankings, k: 60)
      # Build a new array instead of mutating the splat in place.
      rankings = [first_ranking, *rankings]

      # One {item => 1-based rank} lookup per ranking. If an item is repeated
      # within a ranking, its first (best) position is the one that counts.
      rank_tables =
        rankings.map do |ranking|
          ranking.each_with_index.with_object({}) do |(item, index), table|
            table[item] ||= index + 1
          end
        end

      # Distinct items in order of first appearance.
      candidates = rankings.flat_map(&:to_a).uniq

      scored =
        candidates.map do |item|
          score =
            rank_tables.sum do |table|
              rank = table[item]
              rank ? 1.0 / (k + rank) : 0.0
            end

          {result: item, score: score}
        end

      # sort_by is not guaranteed to be stable, so the original position is
      # used as a secondary key to make the order of tied items deterministic.
      scored.sort_by.with_index { |entry, index| [-entry[:score], index] }
    end
  end
end
|
data/lib/neighbor/version.rb
CHANGED
data/lib/neighbor.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: neighbor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.2
|
4
|
+
version: 0.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-09-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|
@@ -40,6 +40,7 @@ files:
|
|
40
40
|
- lib/neighbor.rb
|
41
41
|
- lib/neighbor/model.rb
|
42
42
|
- lib/neighbor/railtie.rb
|
43
|
+
- lib/neighbor/reranking.rb
|
43
44
|
- lib/neighbor/sparse_vector.rb
|
44
45
|
- lib/neighbor/type/cube.rb
|
45
46
|
- lib/neighbor/type/halfvec.rb
|