lsh 0.4.2-java → 0.5.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/lsh/index.rb +23 -15
- data/lib/lsh/math_util_gsl.rb +0 -8
- data/lib/lsh/math_util_jblas.rb +0 -4
- data/lib/lsh/storage/memory.rb +32 -15
- data/lib/lsh/storage/redis_backend.rb +32 -26
- data/lib/lsh/web.rb +1 -7
- metadata +1 -1
data/lib/lsh/index.rb
CHANGED
@@ -39,16 +39,18 @@ module LSH
|
|
39
39
|
end
|
40
40
|
|
41
41
|
def add(vector, id = nil)
|
42
|
-
|
42
|
+
vector_hash = vector.hash
|
43
|
+
storage.add_vector(vector, vector_hash)
|
44
|
+
storage.add_vector_id(vector_hash, id) if id
|
43
45
|
hashes(vector).each_with_index do |hash, i|
|
44
46
|
hash_i = array_to_hash(hash)
|
45
47
|
bucket = storage.find_bucket(i)
|
46
|
-
storage.
|
48
|
+
storage.add_vector_hash_to_bucket(bucket, hash_i, vector_hash)
|
47
49
|
end
|
48
50
|
end
|
49
51
|
|
50
|
-
def
|
51
|
-
storage.
|
52
|
+
def vector_hash_to_id(vector_hash)
|
53
|
+
storage.vector_hash_to_id(vector_hash)
|
52
54
|
end
|
53
55
|
|
54
56
|
def id_to_vector(id)
|
@@ -63,14 +65,15 @@ module LSH
|
|
63
65
|
# Take query hashes, move them around at radius r, and use them to do another query
|
64
66
|
# TODO: only works for binary LSH atm
|
65
67
|
if multiprobe_radius > 0
|
68
|
+
raise Exception.new("Non-zero multiprobe radius only implemented for binary LSH") unless hashes_are_binary?
|
66
69
|
mp_arrays = multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
|
67
70
|
mp_arrays.each do |probes_arrays|
|
68
71
|
probes_hashes = probes_arrays.map { |a| array_to_hash(a) }
|
69
72
|
results += storage.query_buckets(probes_hashes)
|
70
73
|
end
|
74
|
+
results.uniq! { |result| result[:hash] }
|
71
75
|
end
|
72
|
-
|
73
|
-
order_vectors_by_similarity(vector, results)
|
76
|
+
order_results_by_similarity(vector, results)
|
74
77
|
end
|
75
78
|
|
76
79
|
def query_ids(id, multiprobe_radius = 0)
|
@@ -79,10 +82,8 @@ module LSH
|
|
79
82
|
end
|
80
83
|
|
81
84
|
def query_ids_by_vector(vector, multiprobe_radius = 0)
|
82
|
-
|
83
|
-
results
|
84
|
-
vectors.each { |v| results << vector_to_id(v) }
|
85
|
-
results
|
85
|
+
results = query(vector, multiprobe_radius)
|
86
|
+
results.map { |result| vector_hash_to_id(result[:hash]) }
|
86
87
|
end
|
87
88
|
|
88
89
|
def multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
|
@@ -97,8 +98,11 @@ module LSH
|
|
97
98
|
mp_arrays
|
98
99
|
end
|
99
100
|
|
100
|
-
def
|
101
|
-
|
101
|
+
def order_results_by_similarity(vector, results)
|
102
|
+
# Faster than vectors.sort - we precompute all similarities to vector
|
103
|
+
# and order using those
|
104
|
+
similarities = results.map { |result| [ result[:hash], result[:id], result[:data], similarity(vector, result[:data]) ] }
|
105
|
+
similarities.sort_by { |hash, id, vector, sim| sim } .reverse .map { |vs| { :hash => vs[0], :id => vs[1], :data => vs[2] } }
|
102
106
|
end
|
103
107
|
|
104
108
|
def hashes(vector)
|
@@ -108,7 +112,7 @@ module LSH
|
|
108
112
|
end
|
109
113
|
hashes
|
110
114
|
end
|
111
|
-
|
115
|
+
|
112
116
|
def hash(vector, projection, bias = true)
|
113
117
|
hash = []
|
114
118
|
dot_products = (projection * vector.transpose).column(0).to_a
|
@@ -128,6 +132,10 @@ module LSH
|
|
128
132
|
hash
|
129
133
|
end
|
130
134
|
|
135
|
+
def hashes_are_binary?
|
136
|
+
storage.parameters[:window] == Float::INFINITY
|
137
|
+
end
|
138
|
+
|
131
139
|
def random_vector(dim)
|
132
140
|
MathUtil.random_gaussian_matrix(1, dim)
|
133
141
|
end
|
@@ -150,7 +158,7 @@ module LSH
|
|
150
158
|
#value
|
151
159
|
end
|
152
160
|
|
153
|
-
|
161
|
+
def generate_projections(dim, k, l)
|
154
162
|
projections = []
|
155
163
|
l.times do |i|
|
156
164
|
projections << generate_projection(dim, k)
|
@@ -160,7 +168,7 @@ module LSH
|
|
160
168
|
|
161
169
|
def generate_projection(dim, k)
|
162
170
|
MathUtil.random_gaussian_matrix(k, dim)
|
163
|
-
|
171
|
+
end
|
164
172
|
|
165
173
|
def similarity(v1, v2)
|
166
174
|
MathUtil.dot(v1, v2)
|
data/lib/lsh/math_util_gsl.rb
CHANGED
@@ -51,14 +51,6 @@ module LSH
|
|
51
51
|
v.norm
|
52
52
|
end
|
53
53
|
|
54
|
-
def self.uniq(vs)
|
55
|
-
# Can't use uniq as
|
56
|
-
# [ v, JSON.parse(v.to_json, :create_additions => true) ].uniq.size == 2 with GSL
|
57
|
-
results = []
|
58
|
-
vs.each { |v| results << v unless results.member? v }
|
59
|
-
results
|
60
|
-
end
|
61
|
-
|
62
54
|
end
|
63
55
|
|
64
56
|
end
|
data/lib/lsh/math_util_jblas.rb
CHANGED
data/lib/lsh/storage/memory.rb
CHANGED
@@ -30,6 +30,9 @@ module LSH
|
|
30
30
|
|
31
31
|
def reset!
|
32
32
|
@buckets = nil
|
33
|
+
@vectors = nil
|
34
|
+
@vector_hash_to_id = nil
|
35
|
+
@id_to_vector = nil
|
33
36
|
end
|
34
37
|
|
35
38
|
def create_new_bucket
|
@@ -37,27 +40,32 @@ module LSH
|
|
37
40
|
@buckets << {}
|
38
41
|
end
|
39
42
|
|
40
|
-
def
|
43
|
+
def add_vector(vector, vector_hash)
|
44
|
+
@vectors ||= {}
|
45
|
+
@vectors[vector_hash] = vector
|
46
|
+
end
|
47
|
+
|
48
|
+
def add_vector_hash_to_bucket(bucket, hash, vector_hash)
|
41
49
|
if bucket.has_key? hash
|
42
|
-
bucket[hash] <<
|
50
|
+
bucket[hash] << vector_hash
|
43
51
|
else
|
44
|
-
bucket[hash] = [
|
52
|
+
bucket[hash] = [vector_hash]
|
45
53
|
end
|
46
54
|
end
|
47
55
|
|
48
|
-
def add_vector_id(
|
49
|
-
@
|
50
|
-
@
|
56
|
+
def add_vector_id(vector_hash, id)
|
57
|
+
@vector_hash_to_id ||= {}
|
58
|
+
@vector_hash_to_id[vector_hash] = id
|
51
59
|
@id_to_vector ||= {}
|
52
|
-
@id_to_vector[id] =
|
60
|
+
@id_to_vector[id] = vector_hash
|
53
61
|
end
|
54
62
|
|
55
|
-
def
|
56
|
-
@
|
63
|
+
def vector_hash_to_id(vector_hash)
|
64
|
+
@vector_hash_to_id[vector_hash] if @vector_hash_to_id
|
57
65
|
end
|
58
66
|
|
59
67
|
def id_to_vector(id)
|
60
|
-
@id_to_vector[id] if @id_to_vector
|
68
|
+
@vectors[@id_to_vector[id]] if @id_to_vector
|
61
69
|
end
|
62
70
|
|
63
71
|
def find_bucket(i)
|
@@ -65,13 +73,22 @@ module LSH
|
|
65
73
|
end
|
66
74
|
|
67
75
|
def query_buckets(hashes)
|
68
|
-
|
76
|
+
results_hashes = {}
|
69
77
|
hashes.each_with_index do |hash, i|
|
70
|
-
|
71
|
-
|
72
|
-
|
78
|
+
vectors_hashes_in_bucket = @buckets[i][hash]
|
79
|
+
if vectors_hashes_in_bucket
|
80
|
+
vectors_hashes_in_bucket.each do |vector_hash|
|
81
|
+
results_hashes[vector_hash] = true
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
results_hashes.keys.map do |vector_hash|
|
86
|
+
{
|
87
|
+
:data => @vectors[vector_hash],
|
88
|
+
:hash => vector_hash,
|
89
|
+
:id => vector_hash_to_id(vector_hash)
|
90
|
+
}
|
73
91
|
end
|
74
|
-
results
|
75
92
|
end
|
76
93
|
|
77
94
|
end
|
@@ -28,10 +28,8 @@ module LSH
|
|
28
28
|
def initialize(params = { :redis => { :host => '127.0.0.1', :port => 6379 }, :data_dir => 'data' })
|
29
29
|
@redis = Redis.new(params[:redis])
|
30
30
|
@data_dir = params[:data_dir]
|
31
|
-
unless File.exists?(@data_dir)
|
32
|
-
|
33
|
-
Dir.mkdir(File.join(@data_dir, 'projections'))
|
34
|
-
end
|
31
|
+
Dir.mkdir(@data_dir) unless File.exists?(@data_dir)
|
32
|
+
Dir.mkdir(File.join(@data_dir, 'projections')) unless File.exists?(File.join(@data_dir, 'projections'))
|
35
33
|
end
|
36
34
|
|
37
35
|
def reset!
|
@@ -42,6 +40,10 @@ module LSH
|
|
42
40
|
def clear_data!
|
43
41
|
keys = @redis.keys("lsh:bucket:*")
|
44
42
|
@redis.del(keys) unless keys.empty?
|
43
|
+
keys = @redis.keys("lsh:vector_to_id:*")
|
44
|
+
@redis.del(keys) unless keys.empty?
|
45
|
+
keys = @redis.keys("lsh:id_to_vector:*")
|
46
|
+
@redis.del(keys) unless keys.empty?
|
45
47
|
delete_dat_files_in_dir(@data_dir)
|
46
48
|
end
|
47
49
|
|
@@ -107,8 +109,8 @@ module LSH
|
|
107
109
|
@redis.incr "lsh:buckets"
|
108
110
|
end
|
109
111
|
|
110
|
-
def save_vector(vector)
|
111
|
-
path = File.join(@data_dir,
|
112
|
+
def save_vector(vector, vector_hash)
|
113
|
+
path = File.join(@data_dir, vector_hash.to_s+'.dat')
|
112
114
|
vector.save(path) unless File.exists?(path)
|
113
115
|
end
|
114
116
|
|
@@ -118,19 +120,21 @@ module LSH
|
|
118
120
|
vector
|
119
121
|
end
|
120
122
|
|
121
|
-
def
|
122
|
-
save_vector(vector) # Writing vector to disk if not already there
|
123
|
-
@redis.sadd "#{bucket}:#{hash}", vector.hash.to_s # Only storing vector's hash in Redis
|
123
|
+
def add_vector(vector, vector_hash)
|
124
|
+
save_vector(vector, vector_hash) # Writing vector to disk if not already there
|
124
125
|
end
|
125
126
|
|
126
|
-
def
|
127
|
-
|
128
|
-
@redis.set "lsh:vector_to_id:#{vector.hash}", id
|
129
|
-
@redis.set "lsh:id_to_vector:#{id}", vector.hash.to_s
|
127
|
+
def add_vector_hash_to_bucket(bucket, hash, vector_hash)
|
128
|
+
@redis.sadd "#{bucket}:#{hash}", vector_hash.to_s # Only storing vector's hash in Redis
|
130
129
|
end
|
131
130
|
|
132
|
-
def
|
133
|
-
@redis.
|
131
|
+
def add_vector_id(vector_hash, id)
|
132
|
+
@redis.set "lsh:vector_to_id:#{vector_hash}", id
|
133
|
+
@redis.set "lsh:id_to_vector:#{id}", vector_hash.to_s
|
134
|
+
end
|
135
|
+
|
136
|
+
def vector_hash_to_id(vector_hash)
|
137
|
+
@redis.get "lsh:vector_to_id:#{vector_hash}"
|
134
138
|
end
|
135
139
|
|
136
140
|
def id_to_vector(id)
|
@@ -143,21 +147,23 @@ module LSH
|
|
143
147
|
end
|
144
148
|
|
145
149
|
def query_buckets(hashes)
|
146
|
-
|
150
|
+
results_hashes = {}
|
147
151
|
hashes.each_with_index do |hash, i|
|
148
152
|
bucket = find_bucket(i)
|
149
|
-
|
150
|
-
|
153
|
+
vector_hashes_in_bucket = @redis.smembers("#{bucket}:#{hash}")
|
154
|
+
if vector_hashes_in_bucket
|
155
|
+
vector_hashes_in_bucket.each do |vector_hash|
|
156
|
+
results_hashes[vector_hash] = true
|
157
|
+
end
|
158
|
+
end
|
151
159
|
end
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
results << vector
|
160
|
+
results_hashes.keys.map do |vector_hash|
|
161
|
+
{
|
162
|
+
:data => load_vector(vector_hash),
|
163
|
+
:hash => vector_hash.to_i,
|
164
|
+
:id => vector_hash_to_id(vector_hash)
|
165
|
+
}
|
159
166
|
end
|
160
|
-
results
|
161
167
|
end
|
162
168
|
|
163
169
|
end
|
data/lib/lsh/web.rb
CHANGED
@@ -24,13 +24,7 @@ module LSH
|
|
24
24
|
if mime_type == 'application/json'
|
25
25
|
t0 = Time.now
|
26
26
|
vector = JSON.parse(params[:data], :create_additions => true)
|
27
|
-
|
28
|
-
results = []
|
29
|
-
if params[:include] == 'id'
|
30
|
-
result_vectors.each { |v| results << { :id => index.vector_to_id(v), :data => v } }
|
31
|
-
else
|
32
|
-
result_vectors.each { |v| results << { :data => v } }
|
33
|
-
end
|
27
|
+
results = index.query(vector, params[:radius] || 0)
|
34
28
|
content_type :json
|
35
29
|
{ "time" => Time.now - t0, "results" => results }.to_json
|
36
30
|
else
|