neighbor 0.4.2 → 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dfc4af6302c7098ea40f96e9d8a19706aff46a2506cad541ff18ee07fcd11019
4
- data.tar.gz: a79b59895ca3b99a7c048eddd20cb3602b2660425ab44463e7021ab763a26f62
3
+ metadata.gz: e8d611fd277cd48d309b2a087fdeb22f39f43d8ef81fcab57763bd5b4b2e48b3
4
+ data.tar.gz: fe0a5f7e4aa1ebd81f8c5849be67dc0f3c948d53d313b80259def04b9b7e9e84
5
5
  SHA512:
6
- metadata.gz: 11081e687de4c79428351095477137f9140bc6c0363d09c54ece8fd5f7bbe2df802d740332f4474357f9e9e57157bd1f1f4dd3671c106d24dc7e01e2f0d84e2a
7
- data.tar.gz: f18d787b22df7bbc00c69b1f9f6262e19c0214f6ef28814a5c05d9e8c3dae357f32596994bef60112d3f53dac0c1b51f6c99af6345c7e01b2f26cef1a7b42226
6
+ metadata.gz: 0d9d9d0be9f2929f1eab7e5df52a0606ca8c220ea74e6ef349c2ab77e5da44bd4b98ed4d2271345b2d679882f1790ac54b8373c13a69f75607935322bdb68754
7
+ data.tar.gz: 834c7d6e26be9b6d8fc262280048e5104cd022587fed30d030f6c0edd849966a49a08bc793f1b49764b1fc5afc014c1463dd8acacfb5739bc507f4a77d3281a1
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## 0.4.3 (2024-09-02)
2
+
3
+ - Added `rrf` method
4
+
1
5
  ## 0.4.2 (2024-08-27)
2
6
 
3
7
  - Fixed error with `nil` values
data/README.md CHANGED
@@ -243,11 +243,12 @@ Item.nearest_neighbors(:embedding, embedding, distance: "euclidean").first(5)
243
243
 
244
244
  ## Examples
245
245
 
246
- - [OpenAI Embeddings](#openai-embeddings)
247
- - [Cohere Embeddings](#cohere-embeddings)
248
- - [Sentence Embeddings](#sentence-embeddings)
249
- - [Sparse Embeddings](#sparse-embeddings)
250
- - [Disco Recommendations](#disco-recommendations)
246
+ - [Embeddings](#openai-embeddings) with OpenAI
247
+ - [Binary embeddings](#cohere-embeddings) with Cohere
248
+ - [Sentence embeddings](#sentence-embeddings) with Informers
249
+ - [Hybrid search](#hybrid-search) with Informers
250
+ - [Sparse search](#sparse-search) with Transformers.rb
251
+ - [Recommendations](#disco-recommendations) with Disco
251
252
 
252
253
  ### OpenAI Embeddings
253
254
 
@@ -411,7 +412,7 @@ end
411
412
  Load a [model](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
412
413
 
413
414
  ```ruby
414
- model = Informers::Model.new("sentence-transformers/all-MiniLM-L6-v2")
415
+ model = Informers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")
415
416
  ```
416
417
 
417
418
  Pass your input
@@ -422,7 +423,7 @@ input = [
422
423
  "The cat is purring",
423
424
  "The bear is growling"
424
425
  ]
425
- embeddings = model.embed(input)
426
+ embeddings = model.(input)
426
427
  ```
427
428
 
428
429
  Store the embeddings
@@ -444,7 +445,86 @@ document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:conten
444
445
 
445
446
  See the [complete code](examples/informers/example.rb)
446
447
 
447
- ### Sparse Embeddings
448
+ ### Hybrid Search
449
+
450
+ You can use Neighbor for hybrid search with [Informers](https://github.com/ankane/informers).
451
+
452
+ Generate a model
453
+
454
+ ```sh
455
+ rails generate model Document content:text embedding:vector{768}
456
+ rails db:migrate
457
+ ```
458
+
459
+ And add `has_neighbors` and a scope for keyword search
460
+
461
+ ```ruby
462
+ class Document < ApplicationRecord
463
+ has_neighbors :embedding
464
+
465
+ scope :search, ->(query) {
466
+ where("to_tsvector(content) @@ plainto_tsquery(?)", query)
467
+ .order(Arel.sql("ts_rank_cd(to_tsvector(content), plainto_tsquery(?)) DESC", query))
468
+ }
469
+ end
470
+ ```
471
+
472
+ Create some documents
473
+
474
+ ```ruby
475
+ texts = [
476
+ "The dog is barking",
477
+ "The cat is purring",
478
+ "The bear is growling"
479
+ ]
480
+ documents = Document.create!(texts.map { |v| {content: v} })
481
+ ```
482
+
483
+ Generate an embedding for each document
484
+
485
+ ```ruby
486
+ embed = Informers.pipeline("embedding", "Snowflake/snowflake-arctic-embed-m-v1.5")
487
+ embed_options = {model_output: "sentence_embedding", pooling: "none"} # specific to embedding model
488
+ embeddings = embed.(documents.map(&:content), **embed_options)
489
+
490
+ documents.zip(embeddings) do |document, embedding|
491
+ document.update!(embedding: embedding)
492
+ end
493
+ ```
494
+
495
+ Perform keyword search
496
+
497
+ ```ruby
498
+ query = "growling bear"
499
+ keyword_results = Document.search(query).limit(20).load_async
500
+ ```
501
+
502
+ And semantic search in parallel (the query prefix is specific to the [embedding model](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5))
503
+
504
+ ```ruby
505
+ query_prefix = "Represent this sentence for searching relevant passages: "
506
+ query_embedding = embed.(query_prefix + query, **embed_options)
507
+ semantic_results =
508
+ Document.nearest_neighbors(:embedding, query_embedding, distance: "cosine").limit(20).load_async
509
+ ```
510
+
511
+ To combine the results, use Reciprocal Rank Fusion (RRF)
512
+
513
+ ```ruby
514
+ Neighbor::Reranking.rrf(keyword_results, semantic_results)
515
+ ```
516
+
517
+ Or a reranking model
518
+
519
+ ```ruby
520
+ rerank = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-xsmall-v1")
521
+ results = (keyword_results + semantic_results).uniq
522
+ rerank.(query, results.map(&:content), top_k: 5).map { |v| results[v[:doc_id]] }
523
+ ```
524
+
525
+ See the [complete code](examples/hybrid/example.rb)
526
+
527
+ ### Sparse Search
448
528
 
449
529
  You can generate sparse embeddings locally with [Transformers.rb](https://github.com/ankane/transformers-ruby).
450
530
 
@@ -556,7 +636,7 @@ movies = []
556
636
  recommender.item_ids.each do |item_id|
557
637
  movies << {name: item_id, factors: recommender.item_factors(item_id)}
558
638
  end
559
- Movie.insert_all!(movies)
639
+ Movie.create!(movies)
560
640
  ```
561
641
 
562
642
  And get similar movies
@@ -0,0 +1,27 @@
1
module Neighbor
  # Helpers for merging several ranked result lists into a single ranking.
  module Reranking
    # Combines ranked lists with Reciprocal Rank Fusion (RRF).
    #
    # Every distinct result is scored as the sum, over all rankings, of
    # 1.0 / (k + rank), where rank is the result's 1-based position in that
    # ranking. A ranking that does not contain the result contributes 0.0.
    #
    # Results are used as hash keys, so two entries count as the same result
    # when they are equal under #hash / #eql?. If one ranking lists the same
    # result more than once, its last position is the one that is used.
    #
    # @param first_ranking [#map] a ranked list, best match first; it must also
    #   be accepted by Array#concat
    # @param rankings [Array<#map>] any further ranked lists of the same kind
    # @param k [Numeric] constant added to each rank before taking the reciprocal
    # @return [Array<Hash>] one +{result:, score:}+ hash per distinct result,
    #   highest score first; results with equal scores keep the order in which
    #   they were first seen across the rankings
    def self.rrf(first_ranking, *rankings, k: 60)
      # The splat array is local to this call, so this does not touch caller data.
      rankings.unshift(first_ranking)

      ranks = []
      results = []
      rankings.each do |ranking|
        ranks << ranking.map.with_index.to_h { |v, i| [v, i + 1] }
        results.concat(ranking)
      end

      scored =
        results.uniq.map do |result|
          score =
            ranks.sum do |rank|
              r = rank[result]
              r ? 1.0 / (k + r) : 0.0
            end

          {result: result, score: score}
        end

      # sort_by alone is not guaranteed to be stable, so the first-seen index is
      # used as a secondary key to make the order of tied scores deterministic.
      scored.sort_by.with_index { |v, i| [-v[:score], i] }
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Neighbor
2
- VERSION = "0.4.2"
2
+ VERSION = "0.4.3"
3
3
  end
data/lib/neighbor.rb CHANGED
@@ -2,6 +2,7 @@
2
2
  require "active_support"
3
3
 
4
4
  # modules
5
+ require_relative "neighbor/reranking"
5
6
  require_relative "neighbor/sparse_vector"
6
7
  require_relative "neighbor/utils"
7
8
  require_relative "neighbor/version"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: neighbor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-08-27 00:00:00.000000000 Z
11
+ date: 2024-09-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activerecord
@@ -40,6 +40,7 @@ files:
40
40
  - lib/neighbor.rb
41
41
  - lib/neighbor/model.rb
42
42
  - lib/neighbor/railtie.rb
43
+ - lib/neighbor/reranking.rb
43
44
  - lib/neighbor/sparse_vector.rb
44
45
  - lib/neighbor/type/cube.rb
45
46
  - lib/neighbor/type/halfvec.rb