neighbor 0.3.2 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/LICENSE.txt +1 -1
- data/README.md +306 -32
- data/lib/generators/neighbor/cube_generator.rb +1 -0
- data/lib/generators/neighbor/vector_generator.rb +1 -0
- data/lib/neighbor/model.rb +76 -40
- data/lib/neighbor/railtie.rb +4 -4
- data/lib/neighbor/sparse_vector.rb +79 -0
- data/lib/neighbor/type/cube.rb +24 -19
- data/lib/neighbor/type/halfvec.rb +28 -0
- data/lib/neighbor/type/sparsevec.rb +30 -0
- data/lib/neighbor/type/vector.rb +19 -5
- data/lib/neighbor/utils.rb +42 -0
- data/lib/neighbor/version.rb +1 -1
- data/lib/neighbor.rb +15 -2
- metadata +8 -5
- data/lib/neighbor/vector.rb +0 -65
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8aa6de2790d94de9411b0142836b2ad181a411e299fce4b98357b96ac4161183
|
4
|
+
data.tar.gz: 2924d7f15f5b36bc89ee72372c1bfeb373d99481269696a9a9dcc41f90201f38
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2bc1b3ee6d5b1ee0ab175b017e753cf958bd8ceb1ef2a23ba769770dfebf54eec251ac59c8f5f3b6ca56efcbad1763c34622b94924a017622c2f78fc8740f762
|
7
|
+
data.tar.gz: d946dda99833964582f63863b2d898fea6bf065312cf60aec873631df96195e1a54375606ad9c9cc0f767937cdb7ea38b0d9990efcbbeab15ccbb11f8a2020ef
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,22 @@
|
|
1
|
+
## 0.4.1 (2024-08-26)
|
2
|
+
|
3
|
+
- Added `precision` option
|
4
|
+
- Added support for `bit` dimensions to model generator
|
5
|
+
- Fixed error with Numo arrays
|
6
|
+
|
7
|
+
## 0.4.0 (2024-06-25)
|
8
|
+
|
9
|
+
- Added support for `halfvec` and `sparsevec` types
|
10
|
+
- Added support for `taxicab`, `hamming`, and `jaccard` distances with `vector` extension
|
11
|
+
- Added deserialization for `cube` and `vector` columns without `has_neighbor`
|
12
|
+
- Added support for composite primary keys
|
13
|
+
- Changed `nearest_neighbors` to replace previous `order` scopes
|
14
|
+
- Changed `normalize` option to use `before_save` callback
|
15
|
+
- Changed dimensions and finite values checks to use Active Record validations
|
16
|
+
- Fixed issue with `nearest_neighbors` scope overriding `select` values
|
17
|
+
- Removed default attribute name
|
18
|
+
- Dropped support for Ruby < 3.1
|
19
|
+
|
1
20
|
## 0.3.2 (2023-12-12)
|
2
21
|
|
3
22
|
- Added deprecation warning for `has_neighbors` without an attribute name
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Nearest neighbor search for Rails and Postgres
|
4
4
|
|
5
|
-
[Build Status](https://github.com/ankane/neighbor/actions)
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -35,7 +35,7 @@ rails db:migrate
|
|
35
35
|
Create a migration
|
36
36
|
|
37
37
|
```ruby
|
38
|
-
class AddEmbeddingToItems < ActiveRecord::Migration[7.
|
38
|
+
class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
|
39
39
|
def change
|
40
40
|
add_column :items, :embedding, :cube
|
41
41
|
# or
|
@@ -76,9 +76,11 @@ Supported values are:
|
|
76
76
|
|
77
77
|
- `euclidean`
|
78
78
|
- `cosine`
|
79
|
-
- `taxicab`
|
79
|
+
- `taxicab`
|
80
80
|
- `chebyshev` (cube only)
|
81
81
|
- `inner_product` (vector only)
|
82
|
+
- `hamming` (vector only)
|
83
|
+
- `jaccard` (vector only)
|
82
84
|
|
83
85
|
For cosine distance with cube, vectors must be normalized before being stored.
|
84
86
|
|
@@ -114,32 +116,114 @@ end
|
|
114
116
|
For vector, add an approximate index to speed up queries. Create a migration with:
|
115
117
|
|
116
118
|
```ruby
|
117
|
-
class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.
|
119
|
+
class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.2]
|
118
120
|
def change
|
119
|
-
add_index :items, :embedding, using: :ivfflat, opclass: :vector_l2_ops
|
120
|
-
# or with pgvector 0.5.0+
|
121
121
|
add_index :items, :embedding, using: :hnsw, opclass: :vector_l2_ops
|
122
|
+
# or
|
123
|
+
add_index :items, :embedding, using: :ivfflat, opclass: :vector_l2_ops
|
122
124
|
end
|
123
125
|
end
|
124
126
|
```
|
125
127
|
|
126
128
|
Use `:vector_cosine_ops` for cosine distance and `:vector_ip_ops` for inner product.
|
127
129
|
|
128
|
-
Set the
|
130
|
+
Set the size of the dynamic candidate list with HNSW
|
131
|
+
|
132
|
+
```ruby
|
133
|
+
Item.connection.execute("SET hnsw.ef_search = 100")
|
134
|
+
```
|
135
|
+
|
136
|
+
Or the number of probes with IVFFlat
|
129
137
|
|
130
138
|
```ruby
|
131
139
|
Item.connection.execute("SET ivfflat.probes = 3")
|
132
140
|
```
|
133
141
|
|
134
|
-
|
142
|
+
## Half-Precision Vectors
|
143
|
+
|
144
|
+
Use the `halfvec` type to store half-precision vectors
|
135
145
|
|
136
146
|
```ruby
|
137
|
-
|
147
|
+
class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
|
148
|
+
def change
|
149
|
+
add_column :items, :embedding, :halfvec, limit: 3 # dimensions
|
150
|
+
end
|
151
|
+
end
|
152
|
+
```
|
153
|
+
|
154
|
+
## Half-Precision Indexing
|
155
|
+
|
156
|
+
Index vectors at half precision for smaller indexes
|
157
|
+
|
158
|
+
```ruby
|
159
|
+
class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.2]
|
160
|
+
def change
|
161
|
+
add_index :items, "(embedding::halfvec(3)) vector_l2_ops", using: :hnsw
|
162
|
+
end
|
163
|
+
end
|
164
|
+
```
|
165
|
+
|
166
|
+
Get the nearest neighbors
|
167
|
+
|
168
|
+
```ruby
|
169
|
+
Item.nearest_neighbors(:embedding, [0.9, 1.3, 1.1], distance: "euclidean", precision: "half").first(5)
|
170
|
+
```
|
171
|
+
|
172
|
+
## Binary Vectors
|
173
|
+
|
174
|
+
Use the `bit` type to store binary vectors
|
175
|
+
|
176
|
+
```ruby
|
177
|
+
class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
|
178
|
+
def change
|
179
|
+
add_column :items, :embedding, :bit, limit: 3 # dimensions
|
180
|
+
end
|
181
|
+
end
|
182
|
+
```
|
183
|
+
|
184
|
+
Get the nearest neighbors by Hamming distance
|
185
|
+
|
186
|
+
```ruby
|
187
|
+
Item.nearest_neighbors(:embedding, "101", distance: "hamming").first(5)
|
188
|
+
```
|
189
|
+
|
190
|
+
## Binary Quantization
|
191
|
+
|
192
|
+
Use expression indexing for binary quantization
|
193
|
+
|
194
|
+
```ruby
|
195
|
+
class AddIndexToItemsEmbedding < ActiveRecord::Migration[7.2]
|
196
|
+
def change
|
197
|
+
add_index :items, "(binary_quantize(embedding)::bit(3)) bit_hamming_ops", using: :hnsw
|
198
|
+
end
|
199
|
+
end
|
200
|
+
```
|
201
|
+
|
202
|
+
## Sparse Vectors
|
203
|
+
|
204
|
+
Use the `sparsevec` type to store sparse vectors
|
205
|
+
|
206
|
+
```ruby
|
207
|
+
class AddEmbeddingToItems < ActiveRecord::Migration[7.2]
|
208
|
+
def change
|
209
|
+
add_column :items, :embedding, :sparsevec, limit: 3 # dimensions
|
210
|
+
end
|
211
|
+
end
|
212
|
+
```
|
213
|
+
|
214
|
+
Get the nearest neighbors
|
215
|
+
|
216
|
+
```ruby
|
217
|
+
embedding = Neighbor::SparseVector.new({0 => 0.9, 1 => 1.3, 2 => 1.1}, 3)
|
218
|
+
Item.nearest_neighbors(:embedding, embedding, distance: "euclidean").first(5)
|
138
219
|
```
|
139
220
|
|
140
221
|
## Examples
|
141
222
|
|
142
223
|
- [OpenAI Embeddings](#openai-embeddings)
|
224
|
+
- [Cohere Embeddings](#cohere-embeddings)
|
225
|
+
- [Sentence Embeddings](#sentence-embeddings)
|
226
|
+
- [Sparse Embeddings](#sparse-embeddings)
|
143
227
|
- [Disco Recommendations](#disco-recommendations)
|
144
228
|
|
145
229
|
### OpenAI Embeddings
|
@@ -170,10 +254,10 @@ def fetch_embeddings(input)
|
|
170
254
|
}
|
171
255
|
data = {
|
172
256
|
input: input,
|
173
|
-
model: "text-embedding-
|
257
|
+
model: "text-embedding-3-small"
|
174
258
|
}
|
175
259
|
|
176
|
-
response = Net::HTTP.post(URI(url), data.to_json, headers)
|
260
|
+
response = Net::HTTP.post(URI(url), data.to_json, headers).tap(&:value)
|
177
261
|
JSON.parse(response.body)["data"].map { |v| v["embedding"] }
|
178
262
|
end
|
179
263
|
```
|
@@ -199,14 +283,221 @@ end
|
|
199
283
|
Document.insert_all!(documents)
|
200
284
|
```
|
201
285
|
|
202
|
-
And get similar
|
286
|
+
And get similar documents
|
287
|
+
|
288
|
+
```ruby
|
289
|
+
document = Document.first
|
290
|
+
document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:content)
|
291
|
+
```
|
292
|
+
|
293
|
+
See the [complete code](examples/openai/example.rb)
|
294
|
+
|
295
|
+
### Cohere Embeddings
|
296
|
+
|
297
|
+
Generate a model
|
298
|
+
|
299
|
+
```sh
|
300
|
+
rails generate model Document content:text embedding:bit{1024}
|
301
|
+
rails db:migrate
|
302
|
+
```
|
303
|
+
|
304
|
+
And add `has_neighbors`
|
305
|
+
|
306
|
+
```ruby
|
307
|
+
class Document < ApplicationRecord
|
308
|
+
has_neighbors :embedding
|
309
|
+
end
|
310
|
+
```
|
311
|
+
|
312
|
+
Create a method to call the [embed API](https://docs.cohere.com/reference/embed)
|
313
|
+
|
314
|
+
```ruby
|
315
|
+
def fetch_embeddings(input, input_type)
|
316
|
+
url = "https://api.cohere.com/v1/embed"
|
317
|
+
headers = {
|
318
|
+
"Authorization" => "Bearer #{ENV.fetch("CO_API_KEY")}",
|
319
|
+
"Content-Type" => "application/json"
|
320
|
+
}
|
321
|
+
data = {
|
322
|
+
texts: input,
|
323
|
+
model: "embed-english-v3.0",
|
324
|
+
input_type: input_type,
|
325
|
+
embedding_types: ["ubinary"]
|
326
|
+
}
|
327
|
+
|
328
|
+
response = Net::HTTP.post(URI(url), data.to_json, headers).tap(&:value)
|
329
|
+
JSON.parse(response.body)["embeddings"]["ubinary"].map { |e| e.map { |v| v.chr.unpack1("B*") }.join }
|
330
|
+
end
|
331
|
+
```
|
332
|
+
|
333
|
+
Pass your input
|
334
|
+
|
335
|
+
```ruby
|
336
|
+
input = [
|
337
|
+
"The dog is barking",
|
338
|
+
"The cat is purring",
|
339
|
+
"The bear is growling"
|
340
|
+
]
|
341
|
+
embeddings = fetch_embeddings(input, "search_document")
|
342
|
+
```
|
343
|
+
|
344
|
+
Store the embeddings
|
345
|
+
|
346
|
+
```ruby
|
347
|
+
documents = []
|
348
|
+
input.zip(embeddings) do |content, embedding|
|
349
|
+
documents << {content: content, embedding: embedding}
|
350
|
+
end
|
351
|
+
Document.insert_all!(documents)
|
352
|
+
```
|
353
|
+
|
354
|
+
Embed the search query
|
355
|
+
|
356
|
+
```ruby
|
357
|
+
query = "forest"
|
358
|
+
query_embedding = fetch_embeddings([query], "search_query")[0]
|
359
|
+
```
|
360
|
+
|
361
|
+
And search the documents
|
362
|
+
|
363
|
+
```ruby
|
364
|
+
Document.nearest_neighbors(:embedding, query_embedding, distance: "hamming").first(5).map(&:content)
|
365
|
+
```
|
366
|
+
|
367
|
+
See the [complete code](examples/cohere/example.rb)
|
368
|
+
|
369
|
+
### Sentence Embeddings
|
370
|
+
|
371
|
+
You can generate embeddings locally with [Informers](https://github.com/ankane/informers).
|
372
|
+
|
373
|
+
Generate a model
|
374
|
+
|
375
|
+
```sh
|
376
|
+
rails generate model Document content:text embedding:vector{384}
|
377
|
+
rails db:migrate
|
378
|
+
```
|
379
|
+
|
380
|
+
And add `has_neighbors`
|
381
|
+
|
382
|
+
```ruby
|
383
|
+
class Document < ApplicationRecord
|
384
|
+
has_neighbors :embedding
|
385
|
+
end
|
386
|
+
```
|
387
|
+
|
388
|
+
Load a [model](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
|
389
|
+
|
390
|
+
```ruby
|
391
|
+
model = Informers::Model.new("sentence-transformers/all-MiniLM-L6-v2")
|
392
|
+
```
|
393
|
+
|
394
|
+
Pass your input
|
395
|
+
|
396
|
+
```ruby
|
397
|
+
input = [
|
398
|
+
"The dog is barking",
|
399
|
+
"The cat is purring",
|
400
|
+
"The bear is growling"
|
401
|
+
]
|
402
|
+
embeddings = model.embed(input)
|
403
|
+
```
|
404
|
+
|
405
|
+
Store the embeddings
|
406
|
+
|
407
|
+
```ruby
|
408
|
+
documents = []
|
409
|
+
input.zip(embeddings) do |content, embedding|
|
410
|
+
documents << {content: content, embedding: embedding}
|
411
|
+
end
|
412
|
+
Document.insert_all!(documents)
|
413
|
+
```
|
414
|
+
|
415
|
+
And get similar documents
|
203
416
|
|
204
417
|
```ruby
|
205
418
|
document = Document.first
|
206
419
|
document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:content)
|
207
420
|
```
|
208
421
|
|
209
|
-
See the [complete code](examples/
|
422
|
+
See the [complete code](examples/informers/example.rb)
|
423
|
+
|
424
|
+
### Sparse Embeddings
|
425
|
+
|
426
|
+
You can generate sparse embeddings locally with [Transformers.rb](https://github.com/ankane/transformers-ruby).
|
427
|
+
|
428
|
+
Generate a model
|
429
|
+
|
430
|
+
```sh
|
431
|
+
rails generate model Document content:text embedding:sparsevec{30522}
|
432
|
+
rails db:migrate
|
433
|
+
```
|
434
|
+
|
435
|
+
And add `has_neighbors`
|
436
|
+
|
437
|
+
```ruby
|
438
|
+
class Document < ApplicationRecord
|
439
|
+
has_neighbors :embedding
|
440
|
+
end
|
441
|
+
```
|
442
|
+
|
443
|
+
Load a [model](https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1) to generate embeddings
|
444
|
+
|
445
|
+
```ruby
|
446
|
+
class EmbeddingModel
|
447
|
+
def initialize(model_id)
|
448
|
+
@model = Transformers::AutoModelForMaskedLM.from_pretrained(model_id)
|
449
|
+
@tokenizer = Transformers::AutoTokenizer.from_pretrained(model_id)
|
450
|
+
@special_token_ids = @tokenizer.special_tokens_map.map { |_, token| @tokenizer.vocab[token] }
|
451
|
+
end
|
452
|
+
|
453
|
+
def embed(input)
|
454
|
+
feature = @tokenizer.(input, padding: true, truncation: true, return_tensors: "pt", return_token_type_ids: false)
|
455
|
+
output = @model.(**feature)[0]
|
456
|
+
values = Torch.max(output * feature[:attention_mask].unsqueeze(-1), dim: 1)[0]
|
457
|
+
values = Torch.log(1 + Torch.relu(values))
|
458
|
+
values[0.., @special_token_ids] = 0
|
459
|
+
values.to_a
|
460
|
+
end
|
461
|
+
end
|
462
|
+
|
463
|
+
model = EmbeddingModel.new("opensearch-project/opensearch-neural-sparse-encoding-v1")
|
464
|
+
```
|
465
|
+
|
466
|
+
Pass your input
|
467
|
+
|
468
|
+
```ruby
|
469
|
+
input = [
|
470
|
+
"The dog is barking",
|
471
|
+
"The cat is purring",
|
472
|
+
"The bear is growling"
|
473
|
+
]
|
474
|
+
embeddings = model.embed(input)
|
475
|
+
```
|
476
|
+
|
477
|
+
Store the embeddings
|
478
|
+
|
479
|
+
```ruby
|
480
|
+
documents = []
|
481
|
+
input.zip(embeddings) do |content, embedding|
|
482
|
+
documents << {content: content, embedding: Neighbor::SparseVector.new(embedding)}
|
483
|
+
end
|
484
|
+
Document.insert_all!(documents)
|
485
|
+
```
|
486
|
+
|
487
|
+
Embed the search query
|
488
|
+
|
489
|
+
```ruby
|
490
|
+
query = "forest"
|
491
|
+
query_embedding = model.embed([query])[0]
|
492
|
+
```
|
493
|
+
|
494
|
+
And search the documents
|
495
|
+
|
496
|
+
```ruby
|
497
|
+
Document.nearest_neighbors(:embedding, Neighbor::SparseVector.new(query_embedding), distance: "inner_product").first(5).map(&:content)
|
498
|
+
```
|
499
|
+
|
500
|
+
See the [complete code](examples/sparse/example.rb)
|
210
501
|
|
211
502
|
### Disco Recommendations
|
212
503
|
|
@@ -242,7 +533,7 @@ movies = []
|
|
242
533
|
recommender.item_ids.each do |item_id|
|
243
534
|
movies << {name: item_id, factors: recommender.item_factors(item_id)}
|
244
535
|
end
|
245
|
-
Movie.insert_all!(movies)
|
536
|
+
Movie.insert_all!(movies)
|
246
537
|
```
|
247
538
|
|
248
539
|
And get similar movies
|
@@ -252,19 +543,7 @@ movie = Movie.find_by(name: "Star Wars (1977)")
|
|
252
543
|
movie.nearest_neighbors(:factors, distance: "cosine").first(5).map(&:name)
|
253
544
|
```
|
254
545
|
|
255
|
-
See the complete code for [cube](examples/
|
256
|
-
|
257
|
-
## Upgrading
|
258
|
-
|
259
|
-
### 0.2.0
|
260
|
-
|
261
|
-
The `distance` option has been moved from `has_neighbors` to `nearest_neighbors`, and there is no longer a default. If you use cosine distance, set:
|
262
|
-
|
263
|
-
```ruby
|
264
|
-
class Item < ApplicationRecord
|
265
|
-
has_neighbors normalize: true
|
266
|
-
end
|
267
|
-
```
|
546
|
+
See the complete code for [cube](examples/disco/item_recs_cube.rb) and [vector](examples/disco/item_recs_vector.rb)
|
268
547
|
|
269
548
|
## History
|
270
549
|
|
@@ -286,10 +565,5 @@ git clone https://github.com/ankane/neighbor.git
|
|
286
565
|
cd neighbor
|
287
566
|
bundle install
|
288
567
|
createdb neighbor_test
|
289
|
-
|
290
|
-
# cube
|
291
568
|
bundle exec rake test
|
292
|
-
|
293
|
-
# vector
|
294
|
-
EXT=vector bundle exec rake test
|
295
569
|
```
|
data/lib/neighbor/model.rb
CHANGED
@@ -2,11 +2,9 @@ module Neighbor
|
|
2
2
|
module Model
|
3
3
|
def has_neighbors(*attribute_names, dimensions: nil, normalize: nil)
|
4
4
|
if attribute_names.empty?
|
5
|
-
|
6
|
-
attribute_names << :neighbor_vector
|
7
|
-
else
|
8
|
-
attribute_names.map!(&:to_sym)
|
5
|
+
raise ArgumentError, "has_neighbors requires an attribute name"
|
9
6
|
end
|
7
|
+
attribute_names.map!(&:to_sym)
|
10
8
|
|
11
9
|
class_eval do
|
12
10
|
@neighbor_attributes ||= {}
|
@@ -27,30 +25,46 @@ module Neighbor
|
|
27
25
|
attribute_names.each do |attribute_name|
|
28
26
|
raise Error, "has_neighbors already called for #{attribute_name.inspect}" if neighbor_attributes[attribute_name]
|
29
27
|
@neighbor_attributes[attribute_name] = {dimensions: dimensions, normalize: normalize}
|
30
|
-
|
31
|
-
attribute attribute_name, Neighbor::Vector.new(dimensions: dimensions, normalize: normalize, model: self, attribute_name: attribute_name)
|
32
28
|
end
|
33
29
|
|
34
30
|
return if @neighbor_attributes.size != attribute_names.size
|
35
31
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
32
|
+
validate do
|
33
|
+
self.class.neighbor_attributes.each do |k, v|
|
34
|
+
value = read_attribute(k)
|
35
|
+
next if value.nil?
|
36
|
+
|
37
|
+
column_info = self.class.columns_hash[k.to_s]
|
38
|
+
dimensions = v[:dimensions] || column_info&.limit
|
39
|
+
|
40
|
+
if !Neighbor::Utils.validate_dimensions(value, column_info&.type, dimensions).nil?
|
41
|
+
errors.add(k, "must have #{dimensions} dimensions")
|
42
|
+
end
|
43
|
+
if !Neighbor::Utils.validate_finite(value, column_info&.type)
|
44
|
+
errors.add(k, "must have finite values")
|
45
|
+
end
|
42
46
|
end
|
47
|
+
end
|
48
|
+
|
49
|
+
# TODO move to normalizes when Active Record < 7.1 no longer supported
|
50
|
+
before_save do
|
51
|
+
self.class.neighbor_attributes.each do |k, v|
|
52
|
+
next unless v[:normalize] && attribute_changed?(k)
|
53
|
+
value = read_attribute(k)
|
54
|
+
next if value.nil?
|
55
|
+
self[k] = Neighbor::Utils.normalize(value, column_info: self.class.columns_hash[k.to_s])
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
# cannot use keyword arguments with scope with Ruby 3.2 and Active Record 6.1
|
60
|
+
# https://github.com/rails/rails/issues/46934
|
61
|
+
scope :nearest_neighbors, ->(attribute_name, vector, options = nil) {
|
43
62
|
raise ArgumentError, "missing keyword: :distance" unless options.is_a?(Hash) && options.key?(:distance)
|
44
63
|
distance = options.delete(:distance)
|
64
|
+
precision = options.delete(:precision)
|
45
65
|
raise ArgumentError, "unknown keywords: #{options.keys.map(&:inspect).join(", ")}" if options.any?
|
46
66
|
|
47
|
-
if vector.nil? && !attribute_name.nil? && attribute_name.respond_to?(:to_a)
|
48
|
-
warn "[neighbor] nearest_neighbors without an attribute name is deprecated"
|
49
|
-
vector = attribute_name
|
50
|
-
attribute_name = :neighbor_vector
|
51
|
-
end
|
52
67
|
attribute_name = attribute_name.to_sym
|
53
|
-
|
54
68
|
options = neighbor_attributes[attribute_name]
|
55
69
|
raise ArgumentError, "Invalid attribute" unless options
|
56
70
|
normalize = options[:normalize]
|
@@ -62,10 +76,21 @@ module Neighbor
|
|
62
76
|
|
63
77
|
quoted_attribute = "#{connection.quote_table_name(table_name)}.#{connection.quote_column_name(attribute_name)}"
|
64
78
|
|
65
|
-
column_info =
|
79
|
+
column_info = columns_hash[attribute_name.to_s]
|
80
|
+
column_type = column_info&.type
|
66
81
|
|
67
82
|
operator =
|
68
|
-
|
83
|
+
case column_type
|
84
|
+
when :bit
|
85
|
+
case distance
|
86
|
+
when "hamming"
|
87
|
+
"<~>"
|
88
|
+
when "jaccard"
|
89
|
+
"<%>"
|
90
|
+
when "hamming2"
|
91
|
+
"#"
|
92
|
+
end
|
93
|
+
when :vector, :halfvec, :sparsevec
|
69
94
|
case distance
|
70
95
|
when "inner_product"
|
71
96
|
"<#>"
|
@@ -73,8 +98,10 @@ module Neighbor
|
|
73
98
|
"<=>"
|
74
99
|
when "euclidean"
|
75
100
|
"<->"
|
101
|
+
when "taxicab"
|
102
|
+
"<+>"
|
76
103
|
end
|
77
|
-
|
104
|
+
when :cube
|
78
105
|
case distance
|
79
106
|
when "taxicab"
|
80
107
|
"<#>"
|
@@ -83,27 +110,39 @@ module Neighbor
|
|
83
110
|
when "euclidean", "cosine"
|
84
111
|
"<->"
|
85
112
|
end
|
113
|
+
else
|
114
|
+
raise ArgumentError, "Unsupported type: #{column_type}"
|
86
115
|
end
|
87
116
|
|
88
117
|
raise ArgumentError, "Invalid distance: #{distance}" unless operator
|
89
118
|
|
90
119
|
# ensure normalize set (can be true or false)
|
91
|
-
if distance == "cosine" &&
|
120
|
+
if distance == "cosine" && column_type == :cube && normalize.nil?
|
92
121
|
raise Neighbor::Error, "Set normalize for cosine distance with cube"
|
93
122
|
end
|
94
123
|
|
95
|
-
|
124
|
+
column_attribute = klass.type_for_attribute(attribute_name)
|
125
|
+
vector = column_attribute.cast(vector)
|
126
|
+
Neighbor::Utils.validate(vector, dimensions: dimensions, column_info: column_info)
|
127
|
+
vector = Neighbor::Utils.normalize(vector, column_info: column_info) if normalize
|
128
|
+
|
129
|
+
query = connection.quote(column_attribute.serialize(vector))
|
96
130
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
131
|
+
if !precision.nil?
|
132
|
+
case precision.to_s
|
133
|
+
when "half"
|
134
|
+
cast_dimensions = dimensions || column_info&.limit
|
135
|
+
raise ArgumentError, "Unknown dimensions" unless cast_dimensions
|
136
|
+
quoted_attribute += "::halfvec(#{connection.quote(cast_dimensions.to_i)})"
|
102
137
|
else
|
103
|
-
|
138
|
+
raise ArgumentError, "Invalid precision"
|
104
139
|
end
|
140
|
+
end
|
105
141
|
|
106
142
|
order = "#{quoted_attribute} #{operator} #{query}"
|
143
|
+
if operator == "#"
|
144
|
+
order = "bit_count(#{order})"
|
145
|
+
end
|
107
146
|
|
108
147
|
# https://stats.stackexchange.com/questions/146221/is-cosine-similarity-identical-to-l2-normalized-euclidean-distance
|
109
148
|
# with normalized vectors:
|
@@ -111,31 +150,28 @@ module Neighbor
|
|
111
150
|
# cosine distance = 1 - cosine similarity
|
112
151
|
# this transformation doesn't change the order, so only needed for select
|
113
152
|
neighbor_distance =
|
114
|
-
if
|
153
|
+
if column_type == :cube && distance == "cosine"
|
115
154
|
"POWER(#{order}, 2) / 2.0"
|
116
|
-
elsif
|
155
|
+
elsif [:vector, :halfvec, :sparsevec].include?(column_type) && distance == "inner_product"
|
117
156
|
"(#{order}) * -1"
|
118
157
|
else
|
119
158
|
order
|
120
159
|
end
|
121
160
|
|
122
161
|
# for select, use column_names instead of * to account for ignored columns
|
123
|
-
|
162
|
+
select_columns = select_values.any? ? [] : column_names
|
163
|
+
select(*select_columns, "#{neighbor_distance} AS neighbor_distance")
|
124
164
|
.where.not(attribute_name => nil)
|
125
|
-
.
|
165
|
+
.reorder(Arel.sql(order))
|
126
166
|
}
|
127
167
|
|
128
|
-
def nearest_neighbors(attribute_name
|
129
|
-
if attribute_name.nil?
|
130
|
-
warn "[neighbor] nearest_neighbors without an attribute name is deprecated"
|
131
|
-
attribute_name = :neighbor_vector
|
132
|
-
end
|
168
|
+
def nearest_neighbors(attribute_name, **options)
|
133
169
|
attribute_name = attribute_name.to_sym
|
134
|
-
# important! check if neighbor attribute before
|
170
|
+
# important! check if neighbor attribute before accessing
|
135
171
|
raise ArgumentError, "Invalid attribute" unless self.class.neighbor_attributes[attribute_name]
|
136
172
|
|
137
173
|
self.class
|
138
|
-
.where.not(self.class.primary_key
|
174
|
+
.where.not(Array(self.class.primary_key).to_h { |k| [k, self[k]] })
|
139
175
|
.nearest_neighbors(attribute_name, self[attribute_name], **options)
|
140
176
|
end
|
141
177
|
end
|
data/lib/neighbor/railtie.rb
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
module Neighbor
|
2
2
|
class Railtie < Rails::Railtie
|
3
3
|
generators do
|
4
|
+
require "rails/generators/generated_attribute"
|
5
|
+
|
4
6
|
# rails generate model Item embedding:vector{3}
|
5
|
-
|
6
|
-
Rails::Generators::GeneratedAttribute.singleton_class.prepend(Neighbor::GeneratedAttribute)
|
7
|
-
end
|
7
|
+
Rails::Generators::GeneratedAttribute.singleton_class.prepend(Neighbor::GeneratedAttribute)
|
8
8
|
end
|
9
9
|
end
|
10
10
|
|
11
11
|
module GeneratedAttribute
|
12
12
|
def parse_type_and_options(type, *, **)
|
13
|
-
if type =~ /\A(vector)\{(\d+)\}\z/
|
13
|
+
if type =~ /\A(vector|halfvec|bit|sparsevec)\{(\d+)\}\z/
|
14
14
|
return $1, limit: $2.to_i
|
15
15
|
end
|
16
16
|
super
|
@@ -0,0 +1,79 @@
|
|
1
|
+
module Neighbor
|
2
|
+
class SparseVector
|
3
|
+
attr_reader :dimensions, :indices, :values
|
4
|
+
|
5
|
+
NO_DEFAULT = Object.new
|
6
|
+
|
7
|
+
def initialize(value, dimensions = NO_DEFAULT)
|
8
|
+
if value.is_a?(Hash)
|
9
|
+
if dimensions == NO_DEFAULT
|
10
|
+
raise ArgumentError, "missing dimensions"
|
11
|
+
end
|
12
|
+
from_hash(value, dimensions)
|
13
|
+
else
|
14
|
+
unless dimensions == NO_DEFAULT
|
15
|
+
raise ArgumentError, "extra argument"
|
16
|
+
end
|
17
|
+
from_array(value)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def to_s
|
22
|
+
"{#{@indices.zip(@values).map { |i, v| "#{i.to_i + 1}:#{v.to_f}" }.join(",")}}/#{@dimensions.to_i}"
|
23
|
+
end
|
24
|
+
|
25
|
+
def to_a
|
26
|
+
arr = Array.new(dimensions, 0.0)
|
27
|
+
@indices.zip(@values) do |i, v|
|
28
|
+
arr[i] = v
|
29
|
+
end
|
30
|
+
arr
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def from_hash(data, dimensions)
|
36
|
+
elements = data.select { |_, v| v != 0 }.sort
|
37
|
+
@dimensions = dimensions.to_i
|
38
|
+
@indices = elements.map { |v| v[0].to_i }
|
39
|
+
@values = elements.map { |v| v[1].to_f }
|
40
|
+
end
|
41
|
+
|
42
|
+
def from_array(arr)
|
43
|
+
arr = arr.to_a
|
44
|
+
@dimensions = arr.size
|
45
|
+
@indices = []
|
46
|
+
@values = []
|
47
|
+
arr.each_with_index do |v, i|
|
48
|
+
if v != 0
|
49
|
+
@indices << i
|
50
|
+
@values << v.to_f
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
class << self
|
56
|
+
def from_text(string)
|
57
|
+
elements, dimensions = string.split("/", 2)
|
58
|
+
indices = []
|
59
|
+
values = []
|
60
|
+
elements[1..-2].split(",").each do |e|
|
61
|
+
index, value = e.split(":", 2)
|
62
|
+
indices << index.to_i - 1
|
63
|
+
values << value.to_f
|
64
|
+
end
|
65
|
+
from_parts(dimensions.to_i, indices, values)
|
66
|
+
end
|
67
|
+
|
68
|
+
private
|
69
|
+
|
70
|
+
def from_parts(dimensions, indices, values)
|
71
|
+
vec = allocate
|
72
|
+
vec.instance_variable_set(:@dimensions, dimensions)
|
73
|
+
vec.instance_variable_set(:@indices, indices)
|
74
|
+
vec.instance_variable_set(:@values, values)
|
75
|
+
vec
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
data/lib/neighbor/type/cube.rb
CHANGED
@@ -1,36 +1,41 @@
|
|
1
1
|
module Neighbor
|
2
2
|
module Type
|
3
|
-
class Cube < ActiveRecord::Type::
|
3
|
+
class Cube < ActiveRecord::Type::Value
|
4
4
|
def type
|
5
5
|
:cube
|
6
6
|
end
|
7
7
|
|
8
|
-
def
|
9
|
-
if value.
|
8
|
+
def serialize(value)
|
9
|
+
if value.respond_to?(:to_a)
|
10
|
+
value = value.to_a
|
10
11
|
if value.first.is_a?(Array)
|
11
|
-
value.map { |v|
|
12
|
+
value = value.map { |v| serialize_point(v) }.join(", ")
|
12
13
|
else
|
13
|
-
|
14
|
+
value = serialize_point(value)
|
14
15
|
end
|
15
|
-
else
|
16
|
-
super
|
17
16
|
end
|
17
|
+
super(value)
|
18
18
|
end
|
19
19
|
|
20
|
-
# TODO uncomment in 0.4.0
|
21
|
-
# def deserialize(value)
|
22
|
-
# if value.nil?
|
23
|
-
# super
|
24
|
-
# elsif value.include?("),(")
|
25
|
-
# value[1..-1].split("),(").map { |v| v.split(",").map(&:to_f) }
|
26
|
-
# else
|
27
|
-
# value[1..-1].split(",").map(&:to_f)
|
28
|
-
# end
|
29
|
-
# end
|
30
|
-
|
31
20
|
private
|
32
21
|
|
33
|
-
def
|
22
|
+
def cast_value(value)
|
23
|
+
if value.respond_to?(:to_a)
|
24
|
+
value.to_a
|
25
|
+
elsif value.is_a?(Numeric)
|
26
|
+
[value]
|
27
|
+
elsif value.is_a?(String)
|
28
|
+
if value.include?("),(")
|
29
|
+
value[1..-1].split("),(").map { |v| v.split(",").map(&:to_f) }
|
30
|
+
else
|
31
|
+
value[1..-1].split(",").map(&:to_f)
|
32
|
+
end
|
33
|
+
else
|
34
|
+
raise "can't cast #{value.class.name} to cube"
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def serialize_point(value)
|
34
39
|
"(#{value.map(&:to_f).join(", ")})"
|
35
40
|
end
|
36
41
|
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Neighbor
|
2
|
+
module Type
|
3
|
+
class Halfvec < ActiveRecord::Type::Value
|
4
|
+
def type
|
5
|
+
:halfvec
|
6
|
+
end
|
7
|
+
|
8
|
+
def serialize(value)
|
9
|
+
if value.respond_to?(:to_a)
|
10
|
+
value = "[#{value.to_a.map(&:to_f).join(",")}]"
|
11
|
+
end
|
12
|
+
super(value)
|
13
|
+
end
|
14
|
+
|
15
|
+
private
|
16
|
+
|
17
|
+
def cast_value(value)
|
18
|
+
if value.is_a?(String)
|
19
|
+
value[1..-1].split(",").map(&:to_f)
|
20
|
+
elsif value.respond_to?(:to_a)
|
21
|
+
value.to_a
|
22
|
+
else
|
23
|
+
raise "can't cast #{value.class.name} to halfvec"
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Neighbor
|
2
|
+
module Type
|
3
|
+
class Sparsevec < ActiveRecord::Type::Value
|
4
|
+
def type
|
5
|
+
:sparsevec
|
6
|
+
end
|
7
|
+
|
8
|
+
def serialize(value)
|
9
|
+
if value.is_a?(SparseVector)
|
10
|
+
value = "{#{value.indices.zip(value.values).map { |i, v| "#{i.to_i + 1}:#{v.to_f}" }.join(",")}}/#{value.dimensions.to_i}"
|
11
|
+
end
|
12
|
+
super(value)
|
13
|
+
end
|
14
|
+
|
15
|
+
private
|
16
|
+
|
17
|
+
def cast_value(value)
|
18
|
+
if value.is_a?(SparseVector)
|
19
|
+
value
|
20
|
+
elsif value.is_a?(String)
|
21
|
+
SparseVector.from_text(value)
|
22
|
+
elsif value.respond_to?(:to_a)
|
23
|
+
value = SparseVector.new(value.to_a)
|
24
|
+
else
|
25
|
+
raise "can't cast #{value.class.name} to sparsevec"
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
data/lib/neighbor/type/vector.rb
CHANGED
@@ -1,14 +1,28 @@
|
|
1
1
|
module Neighbor
|
2
2
|
module Type
|
3
|
-
class Vector < ActiveRecord::Type::
|
3
|
+
class Vector < ActiveRecord::Type::Value
|
4
4
|
def type
|
5
5
|
:vector
|
6
6
|
end
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
8
|
+
def serialize(value)
|
9
|
+
if value.respond_to?(:to_a)
|
10
|
+
value = "[#{value.to_a.map(&:to_f).join(",")}]"
|
11
|
+
end
|
12
|
+
super(value)
|
13
|
+
end
|
14
|
+
|
15
|
+
private
|
16
|
+
|
17
|
+
def cast_value(value)
|
18
|
+
if value.is_a?(String)
|
19
|
+
value[1..-1].split(",").map(&:to_f)
|
20
|
+
elsif value.respond_to?(:to_a)
|
21
|
+
value.to_a
|
22
|
+
else
|
23
|
+
raise "can't cast #{value.class.name} to vector"
|
24
|
+
end
|
25
|
+
end
|
12
26
|
end
|
13
27
|
end
|
14
28
|
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module Neighbor
  # Validation and normalization helpers shared by the neighbor attribute code.
  module Utils
    # Compares the dimension count of +value+ against +expected+.
    #
    # For :sparsevec the count is read from value.dimensions, for every other
    # type from value.size. Returns an error message on mismatch and nil when
    # the counts agree or no expectation was given.
    def self.validate_dimensions(value, type, expected)
      actual =
        if type == :sparsevec
          value.dimensions
        else
          value.size
        end
      return if !expected || actual == expected

      "Expected #{expected} dimensions, not #{actual}"
    end

    # Tells whether every element of +value+ is finite.
    #
    # :bit values are always accepted; :sparsevec values are checked through
    # value.values; anything else is checked element by element.
    def self.validate_finite(value, type)
      return true if type == :bit

      elements = type == :sparsevec ? value.values : value
      elements.all?(&:finite?)
    end

    # Raises Error when +value+ has the wrong number of dimensions or contains
    # a non-finite element. The expected dimension count is +dimensions+ when
    # given, otherwise the limit reported by +column_info+.
    def self.validate(value, dimensions:, column_info:)
      type = column_info&.type
      expected = dimensions || column_info&.limit

      message = validate_dimensions(value, type, expected)
      raise Error, message if message

      raise Error, "Values must be finite" unless validate_finite(value, type)
    end

    # Scales +value+ to unit length (Euclidean norm).
    #
    # Only :cube, :vector and :halfvec columns are supported; any other type
    # raises Error.
    def self.normalize(value, column_info:)
      supported = [:cube, :vector, :halfvec]
      unless supported.include?(column_info&.type)
        raise Error, "Normalize not supported for type"
      end

      magnitude = Math.sqrt(value.sum { |component| component * component })

      # store zero vector as all zeros
      # since NaN makes the distance always 0
      # could also throw error
      return value unless magnitude > 0

      value.map { |component| component / magnitude }
    end
  end
end
|
data/lib/neighbor/version.rb
CHANGED
data/lib/neighbor.rb
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
require "active_support"
|
3
3
|
|
4
4
|
# modules
|
5
|
+
require_relative "neighbor/sparse_vector"
|
6
|
+
require_relative "neighbor/utils"
|
5
7
|
require_relative "neighbor/version"
|
6
8
|
|
7
9
|
module Neighbor
|
@@ -11,6 +13,14 @@ module Neighbor
|
|
11
13
|
def initialize_type_map(m = type_map)
|
12
14
|
super
|
13
15
|
m.register_type "cube", Type::Cube.new
|
16
|
+
m.register_type "halfvec" do |_, _, sql_type|
|
17
|
+
limit = extract_limit(sql_type)
|
18
|
+
Type::Halfvec.new(limit: limit)
|
19
|
+
end
|
20
|
+
m.register_type "sparsevec" do |_, _, sql_type|
|
21
|
+
limit = extract_limit(sql_type)
|
22
|
+
Type::Sparsevec.new(limit: limit)
|
23
|
+
end
|
14
24
|
m.register_type "vector" do |_, _, sql_type|
|
15
25
|
limit = extract_limit(sql_type)
|
16
26
|
Type::Vector.new(limit: limit)
|
@@ -21,8 +31,9 @@ end
|
|
21
31
|
|
22
32
|
ActiveSupport.on_load(:active_record) do
|
23
33
|
require_relative "neighbor/model"
|
24
|
-
require_relative "neighbor/vector"
|
25
34
|
require_relative "neighbor/type/cube"
|
35
|
+
require_relative "neighbor/type/halfvec"
|
36
|
+
require_relative "neighbor/type/sparsevec"
|
26
37
|
require_relative "neighbor/type/vector"
|
27
38
|
|
28
39
|
extend Neighbor::Model
|
@@ -31,10 +42,12 @@ ActiveSupport.on_load(:active_record) do
|
|
31
42
|
|
32
43
|
# ensure schema can be dumped
|
33
44
|
ActiveRecord::ConnectionAdapters::PostgreSQLAdapter::NATIVE_DATABASE_TYPES[:cube] = {name: "cube"}
|
45
|
+
ActiveRecord::ConnectionAdapters::PostgreSQLAdapter::NATIVE_DATABASE_TYPES[:halfvec] = {name: "halfvec"}
|
46
|
+
ActiveRecord::ConnectionAdapters::PostgreSQLAdapter::NATIVE_DATABASE_TYPES[:sparsevec] = {name: "sparsevec"}
|
34
47
|
ActiveRecord::ConnectionAdapters::PostgreSQLAdapter::NATIVE_DATABASE_TYPES[:vector] = {name: "vector"}
|
35
48
|
|
36
49
|
# ensure schema can be loaded
|
37
|
-
ActiveRecord::ConnectionAdapters::TableDefinition.send(:define_column_methods, :cube, :vector)
|
50
|
+
ActiveRecord::ConnectionAdapters::TableDefinition.send(:define_column_methods, :cube, :halfvec, :sparsevec, :vector)
|
38
51
|
|
39
52
|
# prevent unknown OID warning
|
40
53
|
if ActiveRecord::VERSION::MAJOR >= 7
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: neighbor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-08-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|
@@ -40,9 +40,12 @@ files:
|
|
40
40
|
- lib/neighbor.rb
|
41
41
|
- lib/neighbor/model.rb
|
42
42
|
- lib/neighbor/railtie.rb
|
43
|
+
- lib/neighbor/sparse_vector.rb
|
43
44
|
- lib/neighbor/type/cube.rb
|
45
|
+
- lib/neighbor/type/halfvec.rb
|
46
|
+
- lib/neighbor/type/sparsevec.rb
|
44
47
|
- lib/neighbor/type/vector.rb
|
45
|
-
- lib/neighbor/vector.rb
|
48
|
+
- lib/neighbor/utils.rb
|
46
49
|
- lib/neighbor/version.rb
|
47
50
|
homepage: https://github.com/ankane/neighbor
|
48
51
|
licenses:
|
@@ -56,14 +59,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
56
59
|
requirements:
|
57
60
|
- - ">="
|
58
61
|
- !ruby/object:Gem::Version
|
59
|
-
version: '3'
|
62
|
+
version: '3.1'
|
60
63
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
61
64
|
requirements:
|
62
65
|
- - ">="
|
63
66
|
- !ruby/object:Gem::Version
|
64
67
|
version: '0'
|
65
68
|
requirements: []
|
66
|
-
rubygems_version: 3.
|
69
|
+
rubygems_version: 3.5.11
|
67
70
|
signing_key:
|
68
71
|
specification_version: 4
|
69
72
|
summary: Nearest neighbor search for Rails and Postgres
|
data/lib/neighbor/vector.rb
DELETED
@@ -1,65 +0,0 @@
|
|
1
|
-
module Neighbor
|
2
|
-
class Vector < ActiveRecord::Type::Value
|
3
|
-
def initialize(dimensions:, normalize:, model:, attribute_name:)
|
4
|
-
super()
|
5
|
-
@dimensions = dimensions
|
6
|
-
@normalize = normalize
|
7
|
-
@model = model
|
8
|
-
@attribute_name = attribute_name
|
9
|
-
end
|
10
|
-
|
11
|
-
def self.cast(value, dimensions:, normalize:, column_info:)
|
12
|
-
value = value.to_a.map(&:to_f)
|
13
|
-
|
14
|
-
dimensions ||= column_info[:dimensions]
|
15
|
-
raise Error, "Expected #{dimensions} dimensions, not #{value.size}" if dimensions && value.size != dimensions
|
16
|
-
|
17
|
-
raise Error, "Values must be finite" unless value.all?(&:finite?)
|
18
|
-
|
19
|
-
if normalize
|
20
|
-
norm = Math.sqrt(value.sum { |v| v * v })
|
21
|
-
|
22
|
-
# store zero vector as all zeros
|
23
|
-
# since NaN makes the distance always 0
|
24
|
-
# could also throw error
|
25
|
-
|
26
|
-
# safe to update in-place since earlier map dups
|
27
|
-
value.map! { |v| v / norm } if norm > 0
|
28
|
-
end
|
29
|
-
|
30
|
-
value
|
31
|
-
end
|
32
|
-
|
33
|
-
def self.column_info(model, attribute_name)
|
34
|
-
attribute_name = attribute_name.to_s
|
35
|
-
column = model.columns.detect { |c| c.name == attribute_name }
|
36
|
-
{
|
37
|
-
type: column.try(:type),
|
38
|
-
dimensions: column.try(:limit)
|
39
|
-
}
|
40
|
-
end
|
41
|
-
|
42
|
-
# need to be careful to avoid loading column info before needed
|
43
|
-
def column_info
|
44
|
-
@column_info ||= self.class.column_info(@model, @attribute_name)
|
45
|
-
end
|
46
|
-
|
47
|
-
def cast(value)
|
48
|
-
self.class.cast(value, dimensions: @dimensions, normalize: @normalize, column_info: column_info) unless value.nil?
|
49
|
-
end
|
50
|
-
|
51
|
-
def serialize(value)
|
52
|
-
unless value.nil?
|
53
|
-
if column_info[:type] == :vector
|
54
|
-
"[#{cast(value).join(", ")}]"
|
55
|
-
else
|
56
|
-
"(#{cast(value).join(", ")})"
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
def deserialize(value)
|
62
|
-
value[1..-1].split(",").map(&:to_f) unless value.nil?
|
63
|
-
end
|
64
|
-
end
|
65
|
-
end
|