neighbor 0.4.0 → 0.4.1

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '09edc5a7eebbf6b14f06cb51340c5def49117a318340b4d2265321a8ce6a0bec'
4
- data.tar.gz: fc8c8319cf715612f195836c84861eb327765355a0430f2d58fb5ab57857844e
3
+ metadata.gz: 8aa6de2790d94de9411b0142836b2ad181a411e299fce4b98357b96ac4161183
4
+ data.tar.gz: 2924d7f15f5b36bc89ee72372c1bfeb373d99481269696a9a9dcc41f90201f38
5
5
  SHA512:
6
- metadata.gz: caa86d17e8a3f710988486264434767c33f8b197f9a8721d6dc762235a0bc959d5c186670f7518b9d628a771454861df1beb603a175ec804aa67cf6eb9e14361
7
- data.tar.gz: 3ac9d60c57cc3e82b617820f205282b42684517070de22af5d94878959ef00e3758fb88f821ba3d3f2369602919a41d1706314f77436c8b3e5ef95acc38e3c17
6
+ metadata.gz: 2bc1b3ee6d5b1ee0ab175b017e753cf958bd8ceb1ef2a23ba769770dfebf54eec251ac59c8f5f3b6ca56efcbad1763c34622b94924a017622c2f78fc8740f762
7
+ data.tar.gz: d946dda99833964582f63863b2d898fea6bf065312cf60aec873631df96195e1a54375606ad9c9cc0f767937cdb7ea38b0d9990efcbbeab15ccbb11f8a2020ef
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.4.1 (2024-08-26)
2
+
3
+ - Added `precision` option
4
+ - Added support for `bit` dimensions to model generator
5
+ - Fixed error with Numo arrays
6
+
1
7
  ## 0.4.0 (2024-06-25)
2
8
 
3
9
  - Added support for `halfvec` and `sparsevec` types
data/README.md CHANGED
@@ -35,7 +35,7 @@ rails db:migrate
35
35
  Create a migration
36
36
 
37
37
  ```ruby
38
- class AddEmbeddingToItems < ActiveRecord::Migration[7.1]
38
+ class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
39
39
  def change
40
40
  add_column :items, :embedding, :cube
41
41
  # or
@@ -76,9 +76,11 @@ Supported values are:
76
76
 
77
77
  - `euclidean`
78
78
  - `cosine`
79
- - `taxicab` (cube only)
79
+ - `taxicab`
80
80
  - `chebyshev` (cube only)
81
81
  - `inner_product` (vector only)
82
+ - `hamming` (vector only)
83
+ - `jaccard` (vector only)
82
84
 
83
85
  For cosine distance with cube, vectors must be normalized before being stored.
84
86
 
@@ -114,7 +116,7 @@ end
114
116
  For vector, add an approximate index to speed up queries. Create a migration with:
115
117
 
116
118
  ```ruby
117
- class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.1]
119
+ class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.2]
118
120
  def change
119
121
  add_index :items, :embedding, using: :hnsw, opclass: :vector_l2_ops
120
122
  # or
@@ -137,9 +139,91 @@ Or the number of probes with IVFFlat
137
139
  Item.connection.execute("SET ivfflat.probes = 3")
138
140
  ```
139
141
 
142
+ ## Half-Precision Vectors
143
+
144
+ Use the `halfvec` type to store half-precision vectors
145
+
146
+ ```ruby
147
+ class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
148
+ def change
149
+ add_column :items, :embedding, :halfvec, limit: 3 # dimensions
150
+ end
151
+ end
152
+ ```
153
+
154
+ ## Half-Precision Indexing
155
+
156
+ Index vectors at half precision for smaller indexes
157
+
158
+ ```ruby
159
+ class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.2]
160
+ def change
161
+ add_index :items, "(embedding::halfvec(3)) vector_l2_ops", using: :hnsw
162
+ end
163
+ end
164
+ ```
165
+
166
+ Get the nearest neighbors
167
+
168
+ ```ruby
169
+ Item.nearest_neighbors(:embedding, [0.9, 1.3, 1.1], distance: "euclidean", precision: "half").first(5)
170
+ ```
171
+
172
+ ## Binary Vectors
173
+
174
+ Use the `bit` type to store binary vectors
175
+
176
+ ```ruby
177
+ class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
178
+ def change
179
+ add_column :items, :embedding, :bit, limit: 3 # dimensions
180
+ end
181
+ end
182
+ ```
183
+
184
+ Get the nearest neighbors by Hamming distance
185
+
186
+ ```ruby
187
+ Item.nearest_neighbors(:embedding, "101", distance: "hamming").first(5)
188
+ ```
189
+
190
+ ## Binary Quantization
191
+
192
+ Use expression indexing for binary quantization
193
+
194
+ ```ruby
195
+ class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.2]
196
+ def change
197
+ add_index :items, "(binary_quantize(embedding)::bit(3)) bit_hamming_ops", using: :hnsw
198
+ end
199
+ end
200
+ ```
201
+
202
+ ## Sparse Vectors
203
+
204
+ Use the `sparsevec` type to store sparse vectors
205
+
206
+ ```ruby
207
+ class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
208
+ def change
209
+ add_column :items, :embedding, :sparsevec, limit: 3 # dimensions
210
+ end
211
+ end
212
+ ```
213
+
214
+ Get the nearest neighbors
215
+
216
+ ```ruby
217
+ embedding = Neighbor::SparseVector.new({0 => 0.9, 1 => 1.3, 2 => 1.1}, 3)
218
+ Item.nearest_neighbors(:embedding, embedding, distance: "euclidean").first(5)
219
+ ```
220
+
140
221
  ## Examples
141
222
 
142
223
  - [OpenAI Embeddings](#openai-embeddings)
224
+ - [Cohere Embeddings](#cohere-embeddings)
225
+ - [Sentence Embeddings](#sentence-embeddings)
226
+ - [Sparse Embeddings](#sparse-embeddings)
143
227
  - [Disco Recommendations](#disco-recommendations)
144
228
 
145
229
  ### OpenAI Embeddings
@@ -170,10 +254,10 @@ def fetch_embeddings(input)
170
254
  }
171
255
  data = {
172
256
  input: input,
173
- model: "text-embedding-ada-002"
257
+ model: "text-embedding-3-small"
174
258
  }
175
259
 
176
- response = Net::HTTP.post(URI(url), data.to_json, headers)
260
+ response = Net::HTTP.post(URI(url), data.to_json, headers).tap(&:value)
177
261
  JSON.parse(response.body)["data"].map { |v| v["embedding"] }
178
262
  end
179
263
  ```
@@ -199,14 +283,221 @@ end
199
283
  Document.insert_all!(documents)
200
284
  ```
201
285
 
202
- And get similar articles
286
+ And get similar documents
203
287
 
204
288
  ```ruby
205
289
  document = Document.first
206
290
  document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:content)
207
291
  ```
208
292
 
209
- See the [complete code](examples/openai_embeddings.rb)
293
+ See the [complete code](examples/openai/example.rb)
294
+
295
+ ### Cohere Embeddings
296
+
297
+ Generate a model
298
+
299
+ ```sh
300
+ rails generate model Document content:text embedding:bit{1024}
301
+ rails db:migrate
302
+ ```
303
+
304
+ And add `has_neighbors`
305
+
306
+ ```ruby
307
+ class Document < ApplicationRecord
308
+ has_neighbors :embedding
309
+ end
310
+ ```
311
+
312
+ Create a method to call the [embed API](https://docs.cohere.com/reference/embed)
313
+
314
+ ```ruby
315
+ def fetch_embeddings(input, input_type)
316
+ url = "https://api.cohere.com/v1/embed"
317
+ headers = {
318
+ "Authorization" => "Bearer #{ENV.fetch("CO_API_KEY")}",
319
+ "Content-Type" => "application/json"
320
+ }
321
+ data = {
322
+ texts: input,
323
+ model: "embed-english-v3.0",
324
+ input_type: input_type,
325
+ embedding_types: ["ubinary"]
326
+ }
327
+
328
+ response = Net::HTTP.post(URI(url), data.to_json, headers).tap(&:value)
329
+ JSON.parse(response.body)["embeddings"]["ubinary"].map { |e| e.map { |v| v.chr.unpack1("B*") }.join }
330
+ end
331
+ ```
332
+
333
+ Pass your input
334
+
335
+ ```ruby
336
+ input = [
337
+ "The dog is barking",
338
+ "The cat is purring",
339
+ "The bear is growling"
340
+ ]
341
+ embeddings = fetch_embeddings(input, "search_document")
342
+ ```
343
+
344
+ Store the embeddings
345
+
346
+ ```ruby
347
+ documents = []
348
+ input.zip(embeddings) do |content, embedding|
349
+ documents << {content: content, embedding: embedding}
350
+ end
351
+ Document.insert_all!(documents)
352
+ ```
353
+
354
+ Embed the search query
355
+
356
+ ```ruby
357
+ query = "forest"
358
+ query_embedding = fetch_embeddings([query], "search_query")[0]
359
+ ```
360
+
361
+ And search the documents
362
+
363
+ ```ruby
364
+ Document.nearest_neighbors(:embedding, query_embedding, distance: "hamming").first(5).map(&:content)
365
+ ```
366
+
367
+ See the [complete code](examples/cohere/example.rb)
368
+
369
+ ### Sentence Embeddings
370
+
371
+ You can generate embeddings locally with [Informers](https://github.com/ankane/informers).
372
+
373
+ Generate a model
374
+
375
+ ```sh
376
+ rails generate model Document content:text embedding:vector{384}
377
+ rails db:migrate
378
+ ```
379
+
380
+ And add `has_neighbors`
381
+
382
+ ```ruby
383
+ class Document < ApplicationRecord
384
+ has_neighbors :embedding
385
+ end
386
+ ```
387
+
388
+ Load a [model](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
389
+
390
+ ```ruby
391
+ model = Informers::Model.new("sentence-transformers/all-MiniLM-L6-v2")
392
+ ```
393
+
394
+ Pass your input
395
+
396
+ ```ruby
397
+ input = [
398
+ "The dog is barking",
399
+ "The cat is purring",
400
+ "The bear is growling"
401
+ ]
402
+ embeddings = model.embed(input)
403
+ ```
404
+
405
+ Store the embeddings
406
+
407
+ ```ruby
408
+ documents = []
409
+ input.zip(embeddings) do |content, embedding|
410
+ documents << {content: content, embedding: embedding}
411
+ end
412
+ Document.insert_all!(documents)
413
+ ```
414
+
415
+ And get similar documents
416
+
417
+ ```ruby
418
+ document = Document.first
419
+ document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:content)
420
+ ```
421
+
422
+ See the [complete code](examples/informers/example.rb)
423
+
424
+ ### Sparse Embeddings
425
+
426
+ You can generate sparse embeddings locally with [Transformers.rb](https://github.com/ankane/transformers-ruby).
427
+
428
+ Generate a model
429
+
430
+ ```sh
431
+ rails generate model Document content:text embedding:sparsevec{30522}
432
+ rails db:migrate
433
+ ```
434
+
435
+ And add `has_neighbors`
436
+
437
+ ```ruby
438
+ class Document < ApplicationRecord
439
+ has_neighbors :embedding
440
+ end
441
+ ```
442
+
443
+ Load a [model](https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1) to generate embeddings
444
+
445
+ ```ruby
446
+ class EmbeddingModel
447
+ def initialize(model_id)
448
+ @model = Transformers::AutoModelForMaskedLM.from_pretrained(model_id)
449
+ @tokenizer = Transformers::AutoTokenizer.from_pretrained(model_id)
450
+ @special_token_ids = @tokenizer.special_tokens_map.map { |_, token| @tokenizer.vocab[token] }
451
+ end
452
+
453
+ def embed(input)
454
+ feature = @tokenizer.(input, padding: true, truncation: true, return_tensors: "pt", return_token_type_ids: false)
455
+ output = @model.(**feature)[0]
456
+ values = Torch.max(output * feature[:attention_mask].unsqueeze(-1), dim: 1)[0]
457
+ values = Torch.log(1 + Torch.relu(values))
458
+ values[0.., @special_token_ids] = 0
459
+ values.to_a
460
+ end
461
+ end
462
+
463
+ model = EmbeddingModel.new("opensearch-project/opensearch-neural-sparse-encoding-v1")
464
+ ```
465
+
466
+ Pass your input
467
+
468
+ ```ruby
469
+ input = [
470
+ "The dog is barking",
471
+ "The cat is purring",
472
+ "The bear is growling"
473
+ ]
474
+ embeddings = model.embed(input)
475
+ ```
476
+
477
+ Store the embeddings
478
+
479
+ ```ruby
480
+ documents = []
481
+ input.zip(embeddings) do |content, embedding|
482
+ documents << {content: content, embedding: Neighbor::SparseVector.new(embedding)}
483
+ end
484
+ Document.insert_all!(documents)
485
+ ```
486
+
487
+ Embed the search query
488
+
489
+ ```ruby
490
+ query = "forest"
491
+ query_embedding = model.embed([query])[0]
492
+ ```
493
+
494
+ And search the documents
495
+
496
+ ```ruby
497
+ Document.nearest_neighbors(:embedding, Neighbor::SparseVector.new(query_embedding), distance: "inner_product").first(5).map(&:content)
498
+ ```
499
+
500
+ See the [complete code](examples/sparse/example.rb)
210
501
 
211
502
  ### Disco Recommendations
212
503
 
@@ -252,19 +543,7 @@ movie = Movie.find_by(name: "Star Wars (1977)")
252
543
  movie.nearest_neighbors(:factors, distance: "cosine").first(5).map(&:name)
253
544
  ```
254
545
 
255
- See the complete code for [cube](examples/disco_item_recs_cube.rb) and [vector](examples/disco_item_recs_vector.rb)
256
-
257
- ## Upgrading
258
-
259
- ### 0.2.0
260
-
261
- The `distance` option has been moved from `has_neighbors` to `nearest_neighbors`, and there is no longer a default. If you use cosine distance, set:
262
-
263
- ```ruby
264
- class Item < ApplicationRecord
265
- has_neighbors normalize: true
266
- end
267
- ```
546
+ See the complete code for [cube](examples/disco/item_recs_cube.rb) and [vector](examples/disco/item_recs_vector.rb)
268
547
 
269
548
  ## History
270
549
 
@@ -49,7 +49,7 @@ module Neighbor
49
49
  # TODO move to normalizes when Active Record < 7.1 no longer supported
50
50
  before_save do
51
51
  self.class.neighbor_attributes.each do |k, v|
52
- next unless v[:normalize]
52
+ next unless v[:normalize] && attribute_changed?(k)
53
53
  value = read_attribute(k)
54
54
  next if value.nil?
55
55
  self[k] = Neighbor::Utils.normalize(value, column_info: self.class.columns_hash[k.to_s])
@@ -61,6 +61,7 @@ module Neighbor
61
61
  scope :nearest_neighbors, ->(attribute_name, vector, options = nil) {
62
62
  raise ArgumentError, "missing keyword: :distance" unless options.is_a?(Hash) && options.key?(:distance)
63
63
  distance = options.delete(:distance)
64
+ precision = options.delete(:precision)
64
65
  raise ArgumentError, "unknown keywords: #{options.keys.map(&:inspect).join(", ")}" if options.any?
65
66
 
66
67
  attribute_name = attribute_name.to_sym
@@ -126,6 +127,18 @@ module Neighbor
126
127
  vector = Neighbor::Utils.normalize(vector, column_info: column_info) if normalize
127
128
 
128
129
  query = connection.quote(column_attribute.serialize(vector))
130
+
131
+ if !precision.nil?
132
+ case precision.to_s
133
+ when "half"
134
+ cast_dimensions = dimensions || column_info&.limit
135
+ raise ArgumentError, "Unknown dimensions" unless cast_dimensions
136
+ quoted_attribute += "::halfvec(#{connection.quote(cast_dimensions.to_i)})"
137
+ else
138
+ raise ArgumentError, "Invalid precision"
139
+ end
140
+ end
141
+
129
142
  order = "#{quoted_attribute} #{operator} #{query}"
130
143
  if operator == "#"
131
144
  order = "bit_count(#{order})"
@@ -10,7 +10,7 @@ module Neighbor
10
10
 
11
11
  module GeneratedAttribute
12
12
  def parse_type_and_options(type, *, **)
13
- if type =~ /\A(vector|halfvec|sparsevec)\{(\d+)\}\z/
13
+ if type =~ /\A(vector|halfvec|bit|sparsevec)\{(\d+)\}\z/
14
14
  return $1, limit: $2.to_i
15
15
  end
16
16
  super
@@ -6,7 +6,8 @@ module Neighbor
6
6
  end
7
7
 
8
8
  def serialize(value)
9
- if value.is_a?(Array)
9
+ if value.respond_to?(:to_a)
10
+ value = value.to_a
10
11
  if value.first.is_a?(Array)
11
12
  value = value.map { |v| serialize_point(v) }.join(", ")
12
13
  else
@@ -19,8 +20,8 @@ module Neighbor
19
20
  private
20
21
 
21
22
  def cast_value(value)
22
- if value.is_a?(Array)
23
- value
23
+ if value.respond_to?(:to_a)
24
+ value.to_a
24
25
  elsif value.is_a?(Numeric)
25
26
  [value]
26
27
  elsif value.is_a?(String)
@@ -6,8 +6,8 @@ module Neighbor
6
6
  end
7
7
 
8
8
  def serialize(value)
9
- if value.is_a?(Array)
10
- value = "[#{value.map(&:to_f).join(",")}]"
9
+ if value.respond_to?(:to_a)
10
+ value = "[#{value.to_a.map(&:to_f).join(",")}]"
11
11
  end
12
12
  super(value)
13
13
  end
@@ -17,8 +17,8 @@ module Neighbor
17
17
  def cast_value(value)
18
18
  if value.is_a?(String)
19
19
  value[1..-1].split(",").map(&:to_f)
20
- elsif value.is_a?(Array)
21
- value
20
+ elsif value.respond_to?(:to_a)
21
+ value.to_a
22
22
  else
23
23
  raise "can't cast #{value.class.name} to halfvec"
24
24
  end
@@ -19,8 +19,8 @@ module Neighbor
19
19
  value
20
20
  elsif value.is_a?(String)
21
21
  SparseVector.from_text(value)
22
- elsif value.is_a?(Array)
23
- value = SparseVector.new(value)
22
+ elsif value.respond_to?(:to_a)
23
+ value = SparseVector.new(value.to_a)
24
24
  else
25
25
  raise "can't cast #{value.class.name} to sparsevec"
26
26
  end
@@ -6,8 +6,8 @@ module Neighbor
6
6
  end
7
7
 
8
8
  def serialize(value)
9
- if value.is_a?(Array)
10
- value = "[#{value.map(&:to_f).join(",")}]"
9
+ if value.respond_to?(:to_a)
10
+ value = "[#{value.to_a.map(&:to_f).join(",")}]"
11
11
  end
12
12
  super(value)
13
13
  end
@@ -17,8 +17,8 @@ module Neighbor
17
17
  def cast_value(value)
18
18
  if value.is_a?(String)
19
19
  value[1..-1].split(",").map(&:to_f)
20
- elsif value.is_a?(Array)
21
- value
20
+ elsif value.respond_to?(:to_a)
21
+ value.to_a
22
22
  else
23
23
  raise "can't cast #{value.class.name} to vector"
24
24
  end
@@ -1,3 +1,3 @@
1
1
  module Neighbor
2
- VERSION = "0.4.0"
2
+ VERSION = "0.4.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: neighbor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-06-26 00:00:00.000000000 Z
11
+ date: 2024-08-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activerecord