neighbor 0.4.2 → 0.4.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +89 -9
- data/lib/neighbor/reranking.rb +27 -0
- data/lib/neighbor/version.rb +1 -1
- data/lib/neighbor.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e8d611fd277cd48d309b2a087fdeb22f39f43d8ef81fcab57763bd5b4b2e48b3
|
4
|
+
data.tar.gz: fe0a5f7e4aa1ebd81f8c5849be67dc0f3c948d53d313b80259def04b9b7e9e84
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0d9d9d0be9f2929f1eab7e5df52a0606ca8c220ea74e6ef349c2ab77e5da44bd4b98ed4d2271345b2d679882f1790ac54b8373c13a69f75607935322bdb68754
|
7
|
+
data.tar.gz: 834c7d6e26be9b6d8fc262280048e5104cd022587fed30d030f6c0edd849966a49a08bc793f1b49764b1fc5afc014c1463dd8acacfb5739bc507f4a77d3281a1
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -243,11 +243,12 @@ Item.nearest_neighbors(:embedding, embedding, distance: "euclidean").first(5)
|
|
243
243
|
|
244
244
|
## Examples
|
245
245
|
|
246
|
-
- [
|
247
|
-
- [
|
248
|
-
- [Sentence
|
249
|
-
- [
|
250
|
-
- [
|
246
|
+
- [Embeddings](#openai-embeddings) with OpenAI
|
247
|
+
- [Binary embeddings](#cohere-embeddings) with Cohere
|
248
|
+
- [Sentence embeddings](#sentence-embeddings) with Informers
|
249
|
+
- [Hybrid search](#hybrid-search) with Informers
|
250
|
+
- [Sparse search](#sparse-search) with Transformers.rb
|
251
|
+
- [Recommendations](#disco-recommendations) with Disco
|
251
252
|
|
252
253
|
### OpenAI Embeddings
|
253
254
|
|
@@ -411,7 +412,7 @@ end
|
|
411
412
|
Load a [model](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
|
412
413
|
|
413
414
|
```ruby
|
414
|
-
model = Informers
|
415
|
+
model = Informers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")
|
415
416
|
```
|
416
417
|
|
417
418
|
Pass your input
|
@@ -422,7 +423,7 @@ input = [
|
|
422
423
|
"The cat is purring",
|
423
424
|
"The bear is growling"
|
424
425
|
]
|
425
|
-
embeddings = model.
|
426
|
+
embeddings = model.(input)
|
426
427
|
```
|
427
428
|
|
428
429
|
Store the embeddings
|
@@ -444,7 +445,86 @@ document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:conten
|
|
444
445
|
|
445
446
|
See the [complete code](examples/informers/example.rb)
|
446
447
|
|
447
|
-
###
|
448
|
+
### Hybrid Search
|
449
|
+
|
450
|
+
You can use Neighbor for hybrid search with [Informers](https://github.com/ankane/informers).
|
451
|
+
|
452
|
+
Generate a model
|
453
|
+
|
454
|
+
```sh
|
455
|
+
rails generate model Document content:text embedding:vector{768}
|
456
|
+
rails db:migrate
|
457
|
+
```
|
458
|
+
|
459
|
+
And add `has_neighbors` and a scope for keyword search
|
460
|
+
|
461
|
+
```ruby
|
462
|
+
class Document < ApplicationRecord
|
463
|
+
has_neighbors :embedding
|
464
|
+
|
465
|
+
scope :search, ->(query) {
|
466
|
+
where("to_tsvector(content) @@ plainto_tsquery(?)", query)
|
467
|
+
.order(Arel.sql("ts_rank_cd(to_tsvector(content), plainto_tsquery(?)) DESC", query))
|
468
|
+
}
|
469
|
+
end
|
470
|
+
```
|
471
|
+
|
472
|
+
Create some documents
|
473
|
+
|
474
|
+
```ruby
|
475
|
+
texts = [
|
476
|
+
"The dog is barking",
|
477
|
+
"The cat is purring",
|
478
|
+
"The bear is growling"
|
479
|
+
]
|
480
|
+
documents = Document.create!(texts.map { |v| {content: v} })
|
481
|
+
```
|
482
|
+
|
483
|
+
Generate an embedding for each document
|
484
|
+
|
485
|
+
```ruby
|
486
|
+
embed = Informers.pipeline("embedding", "Snowflake/snowflake-arctic-embed-m-v1.5")
|
487
|
+
embed_options = {model_output: "sentence_embedding", pooling: "none"} # specific to embedding model
|
488
|
+
embeddings = embed.(documents.map(&:content), **embed_options)
|
489
|
+
|
490
|
+
documents.zip(embeddings) do |document, embedding|
|
491
|
+
document.update!(embedding: embedding)
|
492
|
+
end
|
493
|
+
```
|
494
|
+
|
495
|
+
Perform keyword search
|
496
|
+
|
497
|
+
```ruby
|
498
|
+
query = "growling bear"
|
499
|
+
keyword_results = Document.search(query).limit(20).load_async
|
500
|
+
```
|
501
|
+
|
502
|
+
And semantic search in parallel (the query prefix is specific to the [embedding model](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5))
|
503
|
+
|
504
|
+
```ruby
|
505
|
+
query_prefix = "Represent this sentence for searching relevant passages: "
|
506
|
+
query_embedding = embed.(query_prefix + query, **embed_options)
|
507
|
+
semantic_results =
|
508
|
+
Document.nearest_neighbors(:embedding, query_embedding, distance: "cosine").limit(20).load_async
|
509
|
+
```
|
510
|
+
|
511
|
+
To combine the results, use Reciprocal Rank Fusion (RRF)
|
512
|
+
|
513
|
+
```ruby
|
514
|
+
Neighbor::Reranking.rrf(keyword_results, semantic_results)
|
515
|
+
```
|
516
|
+
|
517
|
+
Or a reranking model
|
518
|
+
|
519
|
+
```ruby
|
520
|
+
rerank = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-xsmall-v1")
|
521
|
+
results = (keyword_results + semantic_results).uniq
|
522
|
+
rerank.(query, results.map(&:content), top_k: 5).map { |v| results[v[:doc_id]] }
|
523
|
+
```
|
524
|
+
|
525
|
+
See the [complete code](examples/hybrid/example.rb)
|
526
|
+
|
527
|
+
### Sparse Search
|
448
528
|
|
449
529
|
You can generate sparse embeddings locally with [Transformers.rb](https://github.com/ankane/transformers-ruby).
|
450
530
|
|
@@ -556,7 +636,7 @@ movies = []
|
|
556
636
|
recommender.item_ids.each do |item_id|
|
557
637
|
movies << {name: item_id, factors: recommender.item_factors(item_id)}
|
558
638
|
end
|
559
|
-
Movie.
|
639
|
+
Movie.create!(movies)
|
560
640
|
```
|
561
641
|
|
562
642
|
And get similar movies
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Neighbor
  # Helpers for merging several ranked result lists into a single ranking.
  module Reranking
    # Combines ranked lists with Reciprocal Rank Fusion (RRF).
    #
    # Every distinct result receives the sum, over all rankings, of
    # 1.0 / (k + rank), where rank is the 1-based position of the result in
    # that ranking. A ranking that does not contain the result contributes 0.0.
    #
    # @param first_ranking [Array, #to_ary, #to_a] results ordered best-first
    # @param rankings [Array<Array, #to_ary, #to_a>] further best-first lists
    # @param k [Numeric] constant added to each rank before taking the reciprocal
    # @return [Array<Hash>] one {result:, score:} hash per distinct result,
    #   ordered by descending score; equal scores keep first-appearance order
    def self.rrf(first_ranking, *rankings, k: 60)
      lists = [first_ranking, *rankings].map { |ranking| Array(ranking) }

      # One lookup table per ranking: result => 1-based position.
      ranks =
        lists.map do |list|
          list.each_with_index.with_object({}) do |(result, index), positions|
            # A result repeated inside one ranking keeps its first (best)
            # position instead of being pushed down to its last occurrence.
            positions[result] = index + 1 unless positions.key?(result)
          end
        end

      scored =
        lists.flatten(1).uniq.map do |result|
          score =
            ranks.sum do |positions|
              rank = positions[result]
              rank ? 1.0 / (k + rank) : 0.0
            end

          {result: result, score: score}
        end

      # sort_by alone is not guaranteed to be stable, so the original position
      # is used as a secondary key to make the order of ties deterministic.
      scored.sort_by.with_index { |entry, index| [-entry[:score], index] }
    end
  end
end
|
data/lib/neighbor/version.rb
CHANGED
data/lib/neighbor.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: neighbor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.2
|
4
|
+
version: 0.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-09-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|
@@ -40,6 +40,7 @@ files:
|
|
40
40
|
- lib/neighbor.rb
|
41
41
|
- lib/neighbor/model.rb
|
42
42
|
- lib/neighbor/railtie.rb
|
43
|
+
- lib/neighbor/reranking.rb
|
43
44
|
- lib/neighbor/sparse_vector.rb
|
44
45
|
- lib/neighbor/type/cube.rb
|
45
46
|
- lib/neighbor/type/halfvec.rb
|