neighbor 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dfc4af6302c7098ea40f96e9d8a19706aff46a2506cad541ff18ee07fcd11019
4
- data.tar.gz: a79b59895ca3b99a7c048eddd20cb3602b2660425ab44463e7021ab763a26f62
3
+ metadata.gz: e8d611fd277cd48d309b2a087fdeb22f39f43d8ef81fcab57763bd5b4b2e48b3
4
+ data.tar.gz: fe0a5f7e4aa1ebd81f8c5849be67dc0f3c948d53d313b80259def04b9b7e9e84
5
5
  SHA512:
6
- metadata.gz: 11081e687de4c79428351095477137f9140bc6c0363d09c54ece8fd5f7bbe2df802d740332f4474357f9e9e57157bd1f1f4dd3671c106d24dc7e01e2f0d84e2a
7
- data.tar.gz: f18d787b22df7bbc00c69b1f9f6262e19c0214f6ef28814a5c05d9e8c3dae357f32596994bef60112d3f53dac0c1b51f6c99af6345c7e01b2f26cef1a7b42226
6
+ metadata.gz: 0d9d9d0be9f2929f1eab7e5df52a0606ca8c220ea74e6ef349c2ab77e5da44bd4b98ed4d2271345b2d679882f1790ac54b8373c13a69f75607935322bdb68754
7
+ data.tar.gz: 834c7d6e26be9b6d8fc262280048e5104cd022587fed30d030f6c0edd849966a49a08bc793f1b49764b1fc5afc014c1463dd8acacfb5739bc507f4a77d3281a1
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## 0.4.3 (2024-09-02)
2
+
3
+ - Added `rrf` method
4
+
1
5
  ## 0.4.2 (2024-08-27)
2
6
 
3
7
  - Fixed error with `nil` values
data/README.md CHANGED
@@ -243,11 +243,12 @@ Item.nearest_neighbors(:embedding, embedding, distance: "euclidean").first(5)
243
243
 
244
244
  ## Examples
245
245
 
246
- - [OpenAI Embeddings](#openai-embeddings)
247
- - [Cohere Embeddings](#cohere-embeddings)
248
- - [Sentence Embeddings](#sentence-embeddings)
249
- - [Sparse Embeddings](#sparse-embeddings)
250
- - [Disco Recommendations](#disco-recommendations)
246
+ - [Embeddings](#openai-embeddings) with OpenAI
247
+ - [Binary embeddings](#cohere-embeddings) with Cohere
248
+ - [Sentence embeddings](#sentence-embeddings) with Informers
249
+ - [Hybrid search](#hybrid-search) with Informers
250
+ - [Sparse search](#sparse-search) with Transformers.rb
251
+ - [Recommendations](#disco-recommendations) with Disco
251
252
 
252
253
  ### OpenAI Embeddings
253
254
 
@@ -411,7 +412,7 @@ end
411
412
  Load a [model](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
412
413
 
413
414
  ```ruby
414
- model = Informers::Model.new("sentence-transformers/all-MiniLM-L6-v2")
415
+ model = Informers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")
415
416
  ```
416
417
 
417
418
  Pass your input
@@ -422,7 +423,7 @@ input = [
422
423
  "The cat is purring",
423
424
  "The bear is growling"
424
425
  ]
425
- embeddings = model.embed(input)
426
+ embeddings = model.(input)
426
427
  ```
427
428
 
428
429
  Store the embeddings
@@ -444,7 +445,86 @@ document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:conten
444
445
 
445
446
  See the [complete code](examples/informers/example.rb)
446
447
 
447
- ### Sparse Embeddings
448
+ ### Hybrid Search
449
+
450
+ You can use Neighbor for hybrid search with [Informers](https://github.com/ankane/informers).
451
+
452
+ Generate a model
453
+
454
+ ```sh
455
+ rails generate model Document content:text embedding:vector{768}
456
+ rails db:migrate
457
+ ```
458
+
459
+ And add `has_neighbors` and a scope for keyword search
460
+
461
+ ```ruby
462
+ class Document < ApplicationRecord
463
+ has_neighbors :embedding
464
+
465
+ scope :search, ->(query) {
466
+ where("to_tsvector(content) @@ plainto_tsquery(?)", query)
467
+ .order(Arel.sql("ts_rank_cd(to_tsvector(content), plainto_tsquery(?)) DESC", query))
468
+ }
469
+ end
470
+ ```
471
+
472
+ Create some documents
473
+
474
+ ```ruby
475
+ texts = [
476
+ "The dog is barking",
477
+ "The cat is purring",
478
+ "The bear is growling"
479
+ ]
480
+ documents = Document.create!(texts.map { |v| {content: v} })
481
+ ```
482
+
483
+ Generate an embedding for each document
484
+
485
+ ```ruby
486
+ embed = Informers.pipeline("embedding", "Snowflake/snowflake-arctic-embed-m-v1.5")
487
+ embed_options = {model_output: "sentence_embedding", pooling: "none"} # specific to embedding model
488
+ embeddings = embed.(documents.map(&:content), **embed_options)
489
+
490
+ documents.zip(embeddings) do |document, embedding|
491
+ document.update!(embedding: embedding)
492
+ end
493
+ ```
494
+
495
+ Perform keyword search
496
+
497
+ ```ruby
498
+ query = "growling bear"
499
+ keyword_results = Document.search(query).limit(20).load_async
500
+ ```
501
+
502
+ And semantic search in parallel (the query prefix is specific to the [embedding model](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5))
503
+
504
+ ```ruby
505
+ query_prefix = "Represent this sentence for searching relevant passages: "
506
+ query_embedding = embed.(query_prefix + query, **embed_options)
507
+ semantic_results =
508
+ Document.nearest_neighbors(:embedding, query_embedding, distance: "cosine").limit(20).load_async
509
+ ```
510
+
511
+ To combine the results, use Reciprocal Rank Fusion (RRF)
512
+
513
+ ```ruby
514
+ Neighbor::Reranking.rrf(keyword_results, semantic_results)
515
+ ```
516
+
517
+ Or a reranking model
518
+
519
+ ```ruby
520
+ rerank = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-xsmall-v1")
521
+ results = (keyword_results + semantic_results).uniq
522
+ rerank.(query, results.map(&:content), top_k: 5).map { |v| results[v[:doc_id]] }
523
+ ```
524
+
525
+ See the [complete code](examples/hybrid/example.rb)
526
+
527
+ ### Sparse Search
448
528
 
449
529
  You can generate sparse embeddings locally with [Transformers.rb](https://github.com/ankane/transformers-ruby).
450
530
 
@@ -556,7 +636,7 @@ movies = []
556
636
  recommender.item_ids.each do |item_id|
557
637
  movies << {name: item_id, factors: recommender.item_factors(item_id)}
558
638
  end
559
- Movie.insert_all!(movies)
639
+ Movie.create!(movies)
560
640
  ```
561
641
 
562
642
  And get similar movies
@@ -0,0 +1,27 @@
1
module Neighbor
  # Helpers for merging several ranked result lists into a single ranking.
  module Reranking
    # Fuses ranked lists with Reciprocal Rank Fusion (RRF).
    #
    # An item's score is the sum, over every ranking, of 1.0 / (k + rank),
    # where rank is the item's 1-based position in that ranking. A ranking
    # that does not contain the item contributes 0.0.
    #
    # @param first_ranking [#each_with_index] the first ranked list (best first)
    # @param rankings [Array<#each_with_index>] any further ranked lists
    # @param k [Numeric] constant added to each rank before taking the reciprocal
    # @return [Array<Hash>] one {result:, score:} hash per distinct item,
    #   ordered by descending score; equal scores keep first-appearance order
    def self.rrf(first_ranking, *rankings, k: 60)
      rankings = [first_ranking, *rankings]

      # Insertion-ordered set of distinct items across all rankings.
      seen = {}

      ranks =
        rankings.map do |ranking|
          positions = {}
          ranking.each_with_index do |item, index|
            # Keep the first (best) position when an item repeats in a ranking.
            positions[item] ||= index + 1
            seen[item] = true
          end
          positions
        end

      scored =
        seen.keys.map do |item|
          score =
            ranks.sum do |positions|
              rank = positions[item]
              rank ? 1.0 / (k + rank) : 0.0
            end

          {result: item, score: score}
        end

      # sort_by alone is not stable, so the index is used as a tie-breaker
      # to make the order of equally scored items deterministic.
      scored.sort_by.with_index { |entry, index| [-entry[:score], index] }
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Neighbor
2
- VERSION = "0.4.2"
2
+ VERSION = "0.4.3"
3
3
  end
data/lib/neighbor.rb CHANGED
@@ -2,6 +2,7 @@
2
2
  require "active_support"
3
3
 
4
4
  # modules
5
+ require_relative "neighbor/reranking"
5
6
  require_relative "neighbor/sparse_vector"
6
7
  require_relative "neighbor/utils"
7
8
  require_relative "neighbor/version"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: neighbor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-08-27 00:00:00.000000000 Z
11
+ date: 2024-09-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activerecord
@@ -40,6 +40,7 @@ files:
40
40
  - lib/neighbor.rb
41
41
  - lib/neighbor/model.rb
42
42
  - lib/neighbor/railtie.rb
43
+ - lib/neighbor/reranking.rb
43
44
  - lib/neighbor/sparse_vector.rb
44
45
  - lib/neighbor/type/cube.rb
45
46
  - lib/neighbor/type/halfvec.rb