anngler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/anngler.rb +4 -0
- data/lib/anngler/helpers.rb +16 -0
- data/lib/anngler/index.rb +128 -0
- data/lib/anngler/storage/memory_backend.rb +23 -0
- data/lib/anngler/storage/redis_backend.rb +30 -0
- metadata +158 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 02dc26823e95142aba1c8f1468ee440e5850d17c940df0488227ceec9e5d9568
|
4
|
+
data.tar.gz: 0214635227152047fc51f98a16d753e38133e3bd00a1ef50bb6a86a9fc3e4fc4
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: fdabb190537d475693d21619d2d5ee700b1cb13c27f3c526df0e880023f021d95cf4c5263248fd8d0b68346ca0c04f707faf2f836c3ad7336e3a91f747c7efee
|
7
|
+
data.tar.gz: 91cd7f7c122227c3764d3293385125eb81521181e513095eeed4f3cafa8fb97bea20dc60edc7d8c5d0441375b1efcf22bfacb69ac9495a25c50162f42747130a
|
data/lib/anngler.rb
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'base64'
|
3
|
+
require 'zlib'
|
4
|
+
|
5
|
+
module Anngler
  # Locality-sensitive-hashing index based on random projections.
  #
  # Every vector is hashed once per "tree" (an independent random projection
  # matrix). The sign pattern of the projections forms the hash, and the
  # serialized vector is stored in the storage backend under the key
  # "<bucket_name>:<tree index>:<hash as hex>".
  class Index
    # the number of features of the vectors we are storing
    attr_reader :n_features

    # the number of projections to generate (more = fewer vectors per bucket)
    attr_reader :n_projections

    # seed for our random number generator; to keep it deterministic we fall
    # back to the bucket name parsed as a base-36 integer if no seed is provided
    attr_reader :seed

    # the name of the bucket, to allow multiple different hash tables in one store
    attr_reader :bucket_name

    # the random number generator for the projection matrices
    attr_reader :rng

    # an n_trees x n_features x n_projections matrix to store our projections
    attr_reader :trees

    # how many different projections to overlap (more allows for better
    # accuracy but will slow performance)
    attr_reader :n_trees

    # which storage service to use (either redis or local memory)
    attr_reader :storage

    # @param bucket_name [String] namespace prefix for every storage key
    # @param n_projections [Integer] projections (hash bits) per tree
    # @param n_features [Integer] length of the vectors being indexed
    # @param seed [Integer, nil] RNG seed; defaults to bucket_name.to_i(36)
    # @param n_trees [Integer] number of independent projection matrices
    # @param storage [#add_vector, #remove_vector, #query_bucket] backend
    def initialize(
      bucket_name,
      n_projections,
      n_features,
      seed: nil,
      n_trees: 1,
      storage: Anngler::Storage::MemoryBackend.new
    )
      @n_projections = n_projections
      @n_features = n_features
      @seed = seed
      @seed ||= bucket_name.to_i(36)
      @bucket_name = bucket_name
      @rng = Random.new(@seed)
      @n_trees = n_trees
      @storage = storage

      gen_trees
    end

    # Stores +vec+ (with an optional +label+) in the matching bucket of every tree.
    def add(vec, label: "")
      hashes = calc_hashes(vec)
      # Serialize the vector and the label once; the same payload goes in each tree.
      serialized_data = pack_data(vec, label)

      hashes.each_with_index do |hash, i|
        @storage.add_vector(bucket_key(i, hash), serialized_data)
      end
    end

    # Removes +vec+ from the matching bucket of every tree.
    def remove(vec)
      # The encoded form does not depend on the tree, so compute it once.
      encoded_vec = encode_vec(vec)

      calc_hashes(vec).each_with_index do |hash, i|
        @storage.remove_vector(bucket_key(i, hash), encoded_vec)
      end
    end

    # Returns the stored entries that share a bucket with +vec+ in any tree,
    # as hashes of the form { "label" => ..., "vec" => ... }, sorted by
    # ascending Helpers.cosine_distance to +vec+.
    def query(vec)
      # search each tree and collect the raw (still encoded) candidates
      raw_results = calc_hashes(vec).each_with_index.flat_map do |hash, i|
        @storage.query_bucket(bucket_key(i, hash))
      end

      # remove duplicates, decode the data, then order by cosine distance
      raw_results
        .uniq
        .map { |encoded_data| unpack_data(encoded_data) }
        .sort_by { |data| Helpers.cosine_distance(vec, data["vec"]) }
    end

    private

    # Fills @trees with values drawn uniformly from [-1, 1) using @rng.
    def gen_trees
      vals = Array.new(@n_trees * @n_features * @n_projections) { (@rng.rand * 2) - 1 }
      @trees = Numo::DFloat.asarray(vals).reshape(@n_trees, @n_features, @n_projections)
    end

    # Returns one hash (sign pattern of the projections) per tree.
    #
    # This must iterate over tree indices. The previous form,
    # `[0..@n_trees - 1].map`, was a one-element array holding a Range, so the
    # block ran exactly once and produced a single hash regardless of n_trees.
    # NOTE(review): for n_trees > 1 this changes the bucket keys that are
    # generated, so data indexed before the fix may need re-adding - confirm
    # against any deployed store.
    def calc_hashes(vec)
      (0...@n_trees).map do |i|
        vec.dot(@trees[i, true, true]).ge(0.0)
      end
    end

    # Storage key for +hash+ within tree number +tree_index+.
    def bucket_key(tree_index, hash)
      "#{@bucket_name}:#{tree_index}:#{hash2string(hash)}"
    end

    # Turns the bit vector into a hexadecimal string (the bits are read as one
    # binary number, so leading zero bits do not appear in the output).
    def hash2string(hash)
      hash.to_a.join.to_i(2).to_s(16)
    end

    # Comma-joins the components, deflates the text and Base64-encodes it.
    def encode_vec(vec)
      Base64.encode64(Zlib::Deflate.deflate(vec.to_a.join(",")))
    end

    # Inverse of encode_vec; returns a Numo::DFloat.
    def decode_vec(vec)
      Numo::DFloat.asarray(Zlib::Inflate.inflate(Base64.decode64(vec)).split(",").map(&:to_f))
    end

    # Serialized form is "<encoded vector>:<label>".
    def pack_data(vec, label)
      "#{encode_vec(vec)}:#{label}"
    end

    # Splits a packed entry back into its label and decoded vector.
    #
    # Only the FIRST ":" is a separator: the Base64 alphabet contains no ":",
    # but a label may. Limiting the split to two parts keeps such labels
    # intact and returns an empty label as "" instead of nil.
    def unpack_data(encoded_data)
      encoded_vector, label = encoded_data.split(":", 2)
      { "label" => label, "vec" => decode_vec(encoded_vector) }
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Anngler
  module Storage
    # In-process storage backend: bucket name => array of packed entries,
    # newest entry first.
    class MemoryBackend
      def initialize
        # A plain hash is used on purpose. `Hash.new([])` would hand the SAME
        # default array to every missing key, so a caller mutating the result
        # of one empty bucket would change what all other empty buckets return.
        @storage = {}
      end

      # Prepends +data+ to +bucket+ and returns the bucket's new contents.
      def add_vector(bucket, data)
        @storage[bucket] = [data] + query_bucket(bucket)
      end

      # Drops every entry in +bucket+ whose text before the first ":" equals
      # +encoded_vec+. Returns the bucket's remaining contents.
      def remove_vector(bucket, encoded_vec)
        @storage[bucket] = query_bucket(bucket).reject do |encoded_str|
          encoded_str.split(":")[0] == encoded_vec
        end
      end

      # Returns the entries stored in +bucket+, or a fresh empty array when the
      # bucket has never been written.
      def query_bucket(bucket)
        @storage.fetch(bucket) { [] }
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'redis'
|
2
|
+
|
3
|
+
module Anngler
  module Storage
    # Storage backend that keeps each bucket as a list on the given client.
    # The client only needs to respond to lpush, lrange and lrem.
    class RedisBackend
      # @param instance [#lpush, #lrange, #lrem] the client object to store into
      def initialize(instance)
        @instance = instance
      end

      # Pushes +data+ onto the head of the list stored at +bucket+.
      def add_vector(bucket, data)
        @instance.lpush(bucket, data)
      end

      # Removes every entry in +bucket+ whose text before the first ":" equals
      # +encoded_vec+.
      #
      # The same vector may be stored under several labels, giving several
      # distinct list values. All of them are removed, which matches what
      # MemoryBackend#remove_vector does; stopping after the first match would
      # leave the others behind.
      def remove_vector(bucket, encoded_vec)
        matching = @instance.lrange(bucket, 0, -1).select do |val|
          val.split(":")[0] == encoded_vec
        end

        # count 0 asks lrem to drop all occurrences of the value, so each
        # distinct value needs only one call.
        matching.uniq.each { |val| @instance.lrem(bucket, 0, val) }
        nil
      end

      # Returns every entry of the list stored at +bucket+.
      def query_bucket(bucket)
        @instance.lrange(bucket, 0, -1)
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,158 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: anngler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Aiden Leeming
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-05-20 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: mock_redis
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: bundler
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: json
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: numo-narray
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: redis
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: zlib
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
description:
|
126
|
+
email:
|
127
|
+
executables: []
|
128
|
+
extensions: []
|
129
|
+
extra_rdoc_files: []
|
130
|
+
files:
|
131
|
+
- lib/anngler.rb
|
132
|
+
- lib/anngler/helpers.rb
|
133
|
+
- lib/anngler/index.rb
|
134
|
+
- lib/anngler/storage/memory_backend.rb
|
135
|
+
- lib/anngler/storage/redis_backend.rb
|
136
|
+
homepage:
|
137
|
+
licenses: []
|
138
|
+
metadata: {}
|
139
|
+
post_install_message:
|
140
|
+
rdoc_options: []
|
141
|
+
require_paths:
|
142
|
+
- lib
|
143
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
144
|
+
requirements:
|
145
|
+
- - ">="
|
146
|
+
- !ruby/object:Gem::Version
|
147
|
+
version: '0'
|
148
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">="
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
153
|
+
requirements: []
|
154
|
+
rubygems_version: 3.1.4
|
155
|
+
signing_key:
|
156
|
+
specification_version: 4
|
157
|
+
summary: A ruby locality sensitive hashing implementation using Redis for storage
|
158
|
+
test_files: []
|