neighbor 0.4.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '09edc5a7eebbf6b14f06cb51340c5def49117a318340b4d2265321a8ce6a0bec'
4
- data.tar.gz: fc8c8319cf715612f195836c84861eb327765355a0430f2d58fb5ab57857844e
3
+ metadata.gz: 8aa6de2790d94de9411b0142836b2ad181a411e299fce4b98357b96ac4161183
4
+ data.tar.gz: 2924d7f15f5b36bc89ee72372c1bfeb373d99481269696a9a9dcc41f90201f38
5
5
  SHA512:
6
- metadata.gz: caa86d17e8a3f710988486264434767c33f8b197f9a8721d6dc762235a0bc959d5c186670f7518b9d628a771454861df1beb603a175ec804aa67cf6eb9e14361
7
- data.tar.gz: 3ac9d60c57cc3e82b617820f205282b42684517070de22af5d94878959ef00e3758fb88f821ba3d3f2369602919a41d1706314f77436c8b3e5ef95acc38e3c17
6
+ metadata.gz: 2bc1b3ee6d5b1ee0ab175b017e753cf958bd8ceb1ef2a23ba769770dfebf54eec251ac59c8f5f3b6ca56efcbad1763c34622b94924a017622c2f78fc8740f762
7
+ data.tar.gz: d946dda99833964582f63863b2d898fea6bf065312cf60aec873631df96195e1a54375606ad9c9cc0f767937cdb7ea38b0d9990efcbbeab15ccbb11f8a2020ef
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.4.1 (2024-08-26)
2
+
3
+ - Added `precision` option
4
+ - Added support for `bit` dimensions to model generator
5
+ - Fixed error with Numo arrays
6
+
1
7
  ## 0.4.0 (2024-06-25)
2
8
 
3
9
  - Added support for `halfvec` and `sparsevec` types
data/README.md CHANGED
@@ -35,7 +35,7 @@ rails db:migrate
35
35
  Create a migration
36
36
 
37
37
  ```ruby
38
- class AddEmbeddingToItems < ActiveRecord::Migration[7.1]
38
+ class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
39
39
  def change
40
40
  add_column :items, :embedding, :cube
41
41
  # or
@@ -76,9 +76,11 @@ Supported values are:
76
76
 
77
77
  - `euclidean`
78
78
  - `cosine`
79
- - `taxicab` (cube only)
79
+ - `taxicab`
80
80
  - `chebyshev` (cube only)
81
81
  - `inner_product` (vector only)
82
+ - `hamming` (vector only)
83
+ - `jaccard` (vector only)
82
84
 
83
85
  For cosine distance with cube, vectors must be normalized before being stored.
84
86
 
@@ -114,7 +116,7 @@ end
114
116
  For vector, add an approximate index to speed up queries. Create a migration with:
115
117
 
116
118
  ```ruby
117
- class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.1]
119
+ class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.2]
118
120
  def change
119
121
  add_index :items, :embedding, using: :hnsw, opclass: :vector_l2_ops
120
122
  # or
@@ -137,9 +139,91 @@ Or the number of probes with IVFFlat
137
139
  Item.connection.execute("SET ivfflat.probes = 3")
138
140
  ```
139
141
 
142
+ ## Half-Precision Vectors
143
+
144
+ Use the `halfvec` type to store half-precision vectors
145
+
146
+ ```ruby
147
+ class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
148
+ def change
149
+ add_column :items, :embedding, :halfvec, limit: 3 # dimensions
150
+ end
151
+ end
152
+ ```
153
+
154
+ ## Half-Precision Indexing
155
+
156
+ Index vectors at half precision for smaller indexes
157
+
158
+ ```ruby
159
+ class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.2]
160
+ def change
161
+ add_index :items, "(embedding::halfvec(3)) vector_l2_ops", using: :hnsw
162
+ end
163
+ end
164
+ ```
165
+
166
+ Get the nearest neighbors
167
+
168
+ ```ruby
169
+ Item.nearest_neighbors(:embedding, [0.9, 1.3, 1.1], distance: "euclidean", precision: "half").first(5)
170
+ ```
171
+
172
+ ## Binary Vectors
173
+
174
+ Use the `bit` type to store binary vectors
175
+
176
+ ```ruby
177
+ class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
178
+ def change
179
+ add_column :items, :embedding, :bit, limit: 3 # dimensions
180
+ end
181
+ end
182
+ ```
183
+
184
+ Get the nearest neighbors by Hamming distance
185
+
186
+ ```ruby
187
+ Item.nearest_neighbors(:embedding, "101", distance: "hamming").first(5)
188
+ ```
189
+
190
+ ## Binary Quantization
191
+
192
+ Use expression indexing for binary quantization
193
+
194
+ ```ruby
195
+ class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.2]
196
+ def change
197
+ add_index :items, "(binary_quantize(embedding)::bit(3)) bit_hamming_ops", using: :hnsw
198
+ end
199
+ end
200
+ ```
201
+
202
+ ## Sparse Vectors
203
+
204
+ Use the `sparsevec` type to store sparse vectors
205
+
206
+ ```ruby
207
+ class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
208
+ def change
209
+ add_column :items, :embedding, :sparsevec, limit: 3 # dimensions
210
+ end
211
+ end
212
+ ```
213
+
214
+ Get the nearest neighbors
215
+
216
+ ```ruby
217
+ embedding = Neighbor::SparseVector.new({0 => 0.9, 1 => 1.3, 2 => 1.1}, 3)
218
+ Item.nearest_neighbors(:embedding, embedding, distance: "euclidean").first(5)
219
+ ```
220
+
140
221
  ## Examples
141
222
 
142
223
  - [OpenAI Embeddings](#openai-embeddings)
224
+ - [Cohere Embeddings](#cohere-embeddings)
225
+ - [Sentence Embeddings](#sentence-embeddings)
226
+ - [Sparse Embeddings](#sparse-embeddings)
143
227
  - [Disco Recommendations](#disco-recommendations)
144
228
 
145
229
  ### OpenAI Embeddings
@@ -170,10 +254,10 @@ def fetch_embeddings(input)
170
254
  }
171
255
  data = {
172
256
  input: input,
173
- model: "text-embedding-ada-002"
257
+ model: "text-embedding-3-small"
174
258
  }
175
259
 
176
- response = Net::HTTP.post(URI(url), data.to_json, headers)
260
+ response = Net::HTTP.post(URI(url), data.to_json, headers).tap(&:value)
177
261
  JSON.parse(response.body)["data"].map { |v| v["embedding"] }
178
262
  end
179
263
  ```
@@ -199,14 +283,221 @@ end
199
283
  Document.insert_all!(documents)
200
284
  ```
201
285
 
202
- And get similar articles
286
+ And get similar documents
203
287
 
204
288
  ```ruby
205
289
  document = Document.first
206
290
  document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:content)
207
291
  ```
208
292
 
209
- See the [complete code](examples/openai_embeddings.rb)
293
+ See the [complete code](examples/openai/example.rb)
294
+
295
+ ### Cohere Embeddings
296
+
297
+ Generate a model
298
+
299
+ ```sh
300
+ rails generate model Document content:text embedding:bit{1024}
301
+ rails db:migrate
302
+ ```
303
+
304
+ And add `has_neighbors`
305
+
306
+ ```ruby
307
+ class Document < ApplicationRecord
308
+ has_neighbors :embedding
309
+ end
310
+ ```
311
+
312
+ Create a method to call the [embed API](https://docs.cohere.com/reference/embed)
313
+
314
+ ```ruby
315
+ def fetch_embeddings(input, input_type)
316
+ url = "https://api.cohere.com/v1/embed"
317
+ headers = {
318
+ "Authorization" => "Bearer #{ENV.fetch("CO_API_KEY")}",
319
+ "Content-Type" => "application/json"
320
+ }
321
+ data = {
322
+ texts: input,
323
+ model: "embed-english-v3.0",
324
+ input_type: input_type,
325
+ embedding_types: ["ubinary"]
326
+ }
327
+
328
+ response = Net::HTTP.post(URI(url), data.to_json, headers).tap(&:value)
329
+ JSON.parse(response.body)["embeddings"]["ubinary"].map { |e| e.map { |v| v.chr.unpack1("B*") }.join }
330
+ end
331
+ ```
332
+
333
+ Pass your input
334
+
335
+ ```ruby
336
+ input = [
337
+ "The dog is barking",
338
+ "The cat is purring",
339
+ "The bear is growling"
340
+ ]
341
+ embeddings = fetch_embeddings(input, "search_document")
342
+ ```
343
+
344
+ Store the embeddings
345
+
346
+ ```ruby
347
+ documents = []
348
+ input.zip(embeddings) do |content, embedding|
349
+ documents << {content: content, embedding: embedding}
350
+ end
351
+ Document.insert_all!(documents)
352
+ ```
353
+
354
+ Embed the search query
355
+
356
+ ```ruby
357
+ query = "forest"
358
+ query_embedding = fetch_embeddings([query], "search_query")[0]
359
+ ```
360
+
361
+ And search the documents
362
+
363
+ ```ruby
364
+ Document.nearest_neighbors(:embedding, query_embedding, distance: "hamming").first(5).map(&:content)
365
+ ```
366
+
367
+ See the [complete code](examples/cohere/example.rb)
368
+
369
+ ### Sentence Embeddings
370
+
371
+ You can generate embeddings locally with [Informers](https://github.com/ankane/informers).
372
+
373
+ Generate a model
374
+
375
+ ```sh
376
+ rails generate model Document content:text embedding:vector{384}
377
+ rails db:migrate
378
+ ```
379
+
380
+ And add `has_neighbors`
381
+
382
+ ```ruby
383
+ class Document < ApplicationRecord
384
+ has_neighbors :embedding
385
+ end
386
+ ```
387
+
388
+ Load a [model](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
389
+
390
+ ```ruby
391
+ model = Informers::Model.new("sentence-transformers/all-MiniLM-L6-v2")
392
+ ```
393
+
394
+ Pass your input
395
+
396
+ ```ruby
397
+ input = [
398
+ "The dog is barking",
399
+ "The cat is purring",
400
+ "The bear is growling"
401
+ ]
402
+ embeddings = model.embed(input)
403
+ ```
404
+
405
+ Store the embeddings
406
+
407
+ ```ruby
408
+ documents = []
409
+ input.zip(embeddings) do |content, embedding|
410
+ documents << {content: content, embedding: embedding}
411
+ end
412
+ Document.insert_all!(documents)
413
+ ```
414
+
415
+ And get similar documents
416
+
417
+ ```ruby
418
+ document = Document.first
419
+ document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:content)
420
+ ```
421
+
422
+ See the [complete code](examples/informers/example.rb)
423
+
424
+ ### Sparse Embeddings
425
+
426
+ You can generate sparse embeddings locally with [Transformers.rb](https://github.com/ankane/transformers-ruby).
427
+
428
+ Generate a model
429
+
430
+ ```sh
431
+ rails generate model Document content:text embedding:sparsevec{30522}
432
+ rails db:migrate
433
+ ```
434
+
435
+ And add `has_neighbors`
436
+
437
+ ```ruby
438
+ class Document < ApplicationRecord
439
+ has_neighbors :embedding
440
+ end
441
+ ```
442
+
443
+ Load a [model](https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1) to generate embeddings
444
+
445
+ ```ruby
446
+ class EmbeddingModel
447
+ def initialize(model_id)
448
+ @model = Transformers::AutoModelForMaskedLM.from_pretrained(model_id)
449
+ @tokenizer = Transformers::AutoTokenizer.from_pretrained(model_id)
450
+ @special_token_ids = @tokenizer.special_tokens_map.map { |_, token| @tokenizer.vocab[token] }
451
+ end
452
+
453
+ def embed(input)
454
+ feature = @tokenizer.(input, padding: true, truncation: true, return_tensors: "pt", return_token_type_ids: false)
455
+ output = @model.(**feature)[0]
456
+ values = Torch.max(output * feature[:attention_mask].unsqueeze(-1), dim: 1)[0]
457
+ values = Torch.log(1 + Torch.relu(values))
458
+ values[0.., @special_token_ids] = 0
459
+ values.to_a
460
+ end
461
+ end
462
+
463
+ model = EmbeddingModel.new("opensearch-project/opensearch-neural-sparse-encoding-v1")
464
+ ```
465
+
466
+ Pass your input
467
+
468
+ ```ruby
469
+ input = [
470
+ "The dog is barking",
471
+ "The cat is purring",
472
+ "The bear is growling"
473
+ ]
474
+ embeddings = model.embed(input)
475
+ ```
476
+
477
+ Store the embeddings
478
+
479
+ ```ruby
480
+ documents = []
481
+ input.zip(embeddings) do |content, embedding|
482
+ documents << {content: content, embedding: Neighbor::SparseVector.new(embedding)}
483
+ end
484
+ Document.insert_all!(documents)
485
+ ```
486
+
487
+ Embed the search query
488
+
489
+ ```ruby
490
+ query = "forest"
491
+ query_embedding = model.embed([query])[0]
492
+ ```
493
+
494
+ And search the documents
495
+
496
+ ```ruby
497
+ Document.nearest_neighbors(:embedding, Neighbor::SparseVector.new(query_embedding), distance: "inner_product").first(5).map(&:content)
498
+ ```
499
+
500
+ See the [complete code](examples/sparse/example.rb)
210
501
 
211
502
  ### Disco Recommendations
212
503
 
@@ -252,19 +543,7 @@ movie = Movie.find_by(name: "Star Wars (1977)")
252
543
  movie.nearest_neighbors(:factors, distance: "cosine").first(5).map(&:name)
253
544
  ```
254
545
 
255
- See the complete code for [cube](examples/disco_item_recs_cube.rb) and [vector](examples/disco_item_recs_vector.rb)
256
-
257
- ## Upgrading
258
-
259
- ### 0.2.0
260
-
261
- The `distance` option has been moved from `has_neighbors` to `nearest_neighbors`, and there is no longer a default. If you use cosine distance, set:
262
-
263
- ```ruby
264
- class Item < ApplicationRecord
265
- has_neighbors normalize: true
266
- end
267
- ```
546
+ See the complete code for [cube](examples/disco/item_recs_cube.rb) and [vector](examples/disco/item_recs_vector.rb)
268
547
 
269
548
  ## History
270
549
 
@@ -49,7 +49,7 @@ module Neighbor
49
49
  # TODO move to normalizes when Active Record < 7.1 no longer supported
50
50
  before_save do
51
51
  self.class.neighbor_attributes.each do |k, v|
52
- next unless v[:normalize]
52
+ next unless v[:normalize] && attribute_changed?(k)
53
53
  value = read_attribute(k)
54
54
  next if value.nil?
55
55
  self[k] = Neighbor::Utils.normalize(value, column_info: self.class.columns_hash[k.to_s])
@@ -61,6 +61,7 @@ module Neighbor
61
61
  scope :nearest_neighbors, ->(attribute_name, vector, options = nil) {
62
62
  raise ArgumentError, "missing keyword: :distance" unless options.is_a?(Hash) && options.key?(:distance)
63
63
  distance = options.delete(:distance)
64
+ precision = options.delete(:precision)
64
65
  raise ArgumentError, "unknown keywords: #{options.keys.map(&:inspect).join(", ")}" if options.any?
65
66
 
66
67
  attribute_name = attribute_name.to_sym
@@ -126,6 +127,18 @@ module Neighbor
126
127
  vector = Neighbor::Utils.normalize(vector, column_info: column_info) if normalize
127
128
 
128
129
  query = connection.quote(column_attribute.serialize(vector))
130
+
131
+ if !precision.nil?
132
+ case precision.to_s
133
+ when "half"
134
+ cast_dimensions = dimensions || column_info&.limit
135
+ raise ArgumentError, "Unknown dimensions" unless cast_dimensions
136
+ quoted_attribute += "::halfvec(#{connection.quote(cast_dimensions.to_i)})"
137
+ else
138
+ raise ArgumentError, "Invalid precision"
139
+ end
140
+ end
141
+
129
142
  order = "#{quoted_attribute} #{operator} #{query}"
130
143
  if operator == "#"
131
144
  order = "bit_count(#{order})"
@@ -10,7 +10,7 @@ module Neighbor
10
10
 
11
11
  module GeneratedAttribute
12
12
  def parse_type_and_options(type, *, **)
13
- if type =~ /\A(vector|halfvec|sparsevec)\{(\d+)\}\z/
13
+ if type =~ /\A(vector|halfvec|bit|sparsevec)\{(\d+)\}\z/
14
14
  return $1, limit: $2.to_i
15
15
  end
16
16
  super
@@ -6,7 +6,8 @@ module Neighbor
6
6
  end
7
7
 
8
8
  def serialize(value)
9
- if value.is_a?(Array)
9
+ if value.respond_to?(:to_a)
10
+ value = value.to_a
10
11
  if value.first.is_a?(Array)
11
12
  value = value.map { |v| serialize_point(v) }.join(", ")
12
13
  else
@@ -19,8 +20,8 @@ module Neighbor
19
20
  private
20
21
 
21
22
  def cast_value(value)
22
- if value.is_a?(Array)
23
- value
23
+ if value.respond_to?(:to_a)
24
+ value.to_a
24
25
  elsif value.is_a?(Numeric)
25
26
  [value]
26
27
  elsif value.is_a?(String)
@@ -6,8 +6,8 @@ module Neighbor
6
6
  end
7
7
 
8
8
  def serialize(value)
9
- if value.is_a?(Array)
10
- value = "[#{value.map(&:to_f).join(",")}]"
9
+ if value.respond_to?(:to_a)
10
+ value = "[#{value.to_a.map(&:to_f).join(",")}]"
11
11
  end
12
12
  super(value)
13
13
  end
@@ -17,8 +17,8 @@ module Neighbor
17
17
  def cast_value(value)
18
18
  if value.is_a?(String)
19
19
  value[1..-1].split(",").map(&:to_f)
20
- elsif value.is_a?(Array)
21
- value
20
+ elsif value.respond_to?(:to_a)
21
+ value.to_a
22
22
  else
23
23
  raise "can't cast #{value.class.name} to halfvec"
24
24
  end
@@ -19,8 +19,8 @@ module Neighbor
19
19
  value
20
20
  elsif value.is_a?(String)
21
21
  SparseVector.from_text(value)
22
- elsif value.is_a?(Array)
23
- value = SparseVector.new(value)
22
+ elsif value.respond_to?(:to_a)
23
+ value = SparseVector.new(value.to_a)
24
24
  else
25
25
  raise "can't cast #{value.class.name} to sparsevec"
26
26
  end
@@ -6,8 +6,8 @@ module Neighbor
6
6
  end
7
7
 
8
8
  def serialize(value)
9
- if value.is_a?(Array)
10
- value = "[#{value.map(&:to_f).join(",")}]"
9
+ if value.respond_to?(:to_a)
10
+ value = "[#{value.to_a.map(&:to_f).join(",")}]"
11
11
  end
12
12
  super(value)
13
13
  end
@@ -17,8 +17,8 @@ module Neighbor
17
17
  def cast_value(value)
18
18
  if value.is_a?(String)
19
19
  value[1..-1].split(",").map(&:to_f)
20
- elsif value.is_a?(Array)
21
- value
20
+ elsif value.respond_to?(:to_a)
21
+ value.to_a
22
22
  else
23
23
  raise "can't cast #{value.class.name} to vector"
24
24
  end
@@ -1,3 +1,3 @@
1
1
  module Neighbor
2
- VERSION = "0.4.0"
2
+ VERSION = "0.4.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: neighbor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-06-26 00:00:00.000000000 Z
11
+ date: 2024-08-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activerecord