lsh 0.4.2-java → 0.5.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/lsh/index.rb +23 -15
- data/lib/lsh/math_util_gsl.rb +0 -8
- data/lib/lsh/math_util_jblas.rb +0 -4
- data/lib/lsh/storage/memory.rb +32 -15
- data/lib/lsh/storage/redis_backend.rb +32 -26
- data/lib/lsh/web.rb +1 -7
- metadata +1 -1
data/lib/lsh/index.rb
CHANGED
@@ -39,16 +39,18 @@ module LSH
|
|
39
39
|
end
|
40
40
|
|
41
41
|
def add(vector, id = nil)
|
42
|
-
|
42
|
+
vector_hash = vector.hash
|
43
|
+
storage.add_vector(vector, vector_hash)
|
44
|
+
storage.add_vector_id(vector_hash, id) if id
|
43
45
|
hashes(vector).each_with_index do |hash, i|
|
44
46
|
hash_i = array_to_hash(hash)
|
45
47
|
bucket = storage.find_bucket(i)
|
46
|
-
storage.
|
48
|
+
storage.add_vector_hash_to_bucket(bucket, hash_i, vector_hash)
|
47
49
|
end
|
48
50
|
end
|
49
51
|
|
50
|
-
def
|
51
|
-
storage.
|
52
|
+
def vector_hash_to_id(vector_hash)
|
53
|
+
storage.vector_hash_to_id(vector_hash)
|
52
54
|
end
|
53
55
|
|
54
56
|
def id_to_vector(id)
|
@@ -63,14 +65,15 @@ module LSH
|
|
63
65
|
# Take query hashes, move them around at radius r, and use them to do another query
|
64
66
|
# TODO: only works for binary LSH atm
|
65
67
|
if multiprobe_radius > 0
|
68
|
+
raise Exception.new("Non-zero multiprobe radius only implemented for binary LSH") unless hashes_are_binary?
|
66
69
|
mp_arrays = multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
|
67
70
|
mp_arrays.each do |probes_arrays|
|
68
71
|
probes_hashes = probes_arrays.map { |a| array_to_hash(a) }
|
69
72
|
results += storage.query_buckets(probes_hashes)
|
70
73
|
end
|
74
|
+
results.uniq! { |result| result[:hash] }
|
71
75
|
end
|
72
|
-
|
73
|
-
order_vectors_by_similarity(vector, results)
|
76
|
+
order_results_by_similarity(vector, results)
|
74
77
|
end
|
75
78
|
|
76
79
|
def query_ids(id, multiprobe_radius = 0)
|
@@ -79,10 +82,8 @@ module LSH
|
|
79
82
|
end
|
80
83
|
|
81
84
|
def query_ids_by_vector(vector, multiprobe_radius = 0)
|
82
|
-
|
83
|
-
results
|
84
|
-
vectors.each { |v| results << vector_to_id(v) }
|
85
|
-
results
|
85
|
+
results = query(vector, multiprobe_radius)
|
86
|
+
results.map { |result| vector_hash_to_id(result[:hash]) }
|
86
87
|
end
|
87
88
|
|
88
89
|
def multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
|
@@ -97,8 +98,11 @@ module LSH
|
|
97
98
|
mp_arrays
|
98
99
|
end
|
99
100
|
|
100
|
-
def
|
101
|
-
|
101
|
+
def order_results_by_similarity(vector, results)
|
102
|
+
# Faster than vectors.sort - we precompute all similarities to vector
|
103
|
+
# and order using those
|
104
|
+
similarities = results.map { |result| [ result[:hash], result[:id], result[:data], similarity(vector, result[:data]) ] }
|
105
|
+
similarities.sort_by { |hash, id, vector, sim| sim } .reverse .map { |vs| { :hash => vs[0], :id => vs[1], :data => vs[2] } }
|
102
106
|
end
|
103
107
|
|
104
108
|
def hashes(vector)
|
@@ -108,7 +112,7 @@ module LSH
|
|
108
112
|
end
|
109
113
|
hashes
|
110
114
|
end
|
111
|
-
|
115
|
+
|
112
116
|
def hash(vector, projection, bias = true)
|
113
117
|
hash = []
|
114
118
|
dot_products = (projection * vector.transpose).column(0).to_a
|
@@ -128,6 +132,10 @@ module LSH
|
|
128
132
|
hash
|
129
133
|
end
|
130
134
|
|
135
|
+
def hashes_are_binary?
|
136
|
+
storage.parameters[:window] == Float::INFINITY
|
137
|
+
end
|
138
|
+
|
131
139
|
def random_vector(dim)
|
132
140
|
MathUtil.random_gaussian_matrix(1, dim)
|
133
141
|
end
|
@@ -150,7 +158,7 @@ module LSH
|
|
150
158
|
#value
|
151
159
|
end
|
152
160
|
|
153
|
-
|
161
|
+
def generate_projections(dim, k, l)
|
154
162
|
projections = []
|
155
163
|
l.times do |i|
|
156
164
|
projections << generate_projection(dim, k)
|
@@ -160,7 +168,7 @@ module LSH
|
|
160
168
|
|
161
169
|
def generate_projection(dim, k)
|
162
170
|
MathUtil.random_gaussian_matrix(k, dim)
|
163
|
-
|
171
|
+
end
|
164
172
|
|
165
173
|
def similarity(v1, v2)
|
166
174
|
MathUtil.dot(v1, v2)
|
data/lib/lsh/math_util_gsl.rb
CHANGED
@@ -51,14 +51,6 @@ module LSH
|
|
51
51
|
v.norm
|
52
52
|
end
|
53
53
|
|
54
|
-
def self.uniq(vs)
|
55
|
-
# Can't use uniq as
|
56
|
-
# [ v, JSON.parse(v.to_json, :create_additions => true) ].uniq.size == 2 with GSL
|
57
|
-
results = []
|
58
|
-
vs.each { |v| results << v unless results.member? v }
|
59
|
-
results
|
60
|
-
end
|
61
|
-
|
62
54
|
end
|
63
55
|
|
64
56
|
end
|
data/lib/lsh/math_util_jblas.rb
CHANGED
data/lib/lsh/storage/memory.rb
CHANGED
@@ -30,6 +30,9 @@ module LSH
|
|
30
30
|
|
31
31
|
def reset!
|
32
32
|
@buckets = nil
|
33
|
+
@vectors = nil
|
34
|
+
@vector_hash_to_id = nil
|
35
|
+
@id_to_vector = nil
|
33
36
|
end
|
34
37
|
|
35
38
|
def create_new_bucket
|
@@ -37,27 +40,32 @@ module LSH
|
|
37
40
|
@buckets << {}
|
38
41
|
end
|
39
42
|
|
40
|
-
def
|
43
|
+
def add_vector(vector, vector_hash)
|
44
|
+
@vectors ||= {}
|
45
|
+
@vectors[vector_hash] = vector
|
46
|
+
end
|
47
|
+
|
48
|
+
def add_vector_hash_to_bucket(bucket, hash, vector_hash)
|
41
49
|
if bucket.has_key? hash
|
42
|
-
bucket[hash] <<
|
50
|
+
bucket[hash] << vector_hash
|
43
51
|
else
|
44
|
-
bucket[hash] = [
|
52
|
+
bucket[hash] = [vector_hash]
|
45
53
|
end
|
46
54
|
end
|
47
55
|
|
48
|
-
def add_vector_id(
|
49
|
-
@
|
50
|
-
@
|
56
|
+
def add_vector_id(vector_hash, id)
|
57
|
+
@vector_hash_to_id ||= {}
|
58
|
+
@vector_hash_to_id[vector_hash] = id
|
51
59
|
@id_to_vector ||= {}
|
52
|
-
@id_to_vector[id] =
|
60
|
+
@id_to_vector[id] = vector_hash
|
53
61
|
end
|
54
62
|
|
55
|
-
def
|
56
|
-
@
|
63
|
+
def vector_hash_to_id(vector_hash)
|
64
|
+
@vector_hash_to_id[vector_hash] if @vector_hash_to_id
|
57
65
|
end
|
58
66
|
|
59
67
|
def id_to_vector(id)
|
60
|
-
@id_to_vector[id] if @id_to_vector
|
68
|
+
@vectors[@id_to_vector[id]] if @id_to_vector
|
61
69
|
end
|
62
70
|
|
63
71
|
def find_bucket(i)
|
@@ -65,13 +73,22 @@ module LSH
|
|
65
73
|
end
|
66
74
|
|
67
75
|
def query_buckets(hashes)
|
68
|
-
|
76
|
+
results_hashes = {}
|
69
77
|
hashes.each_with_index do |hash, i|
|
70
|
-
|
71
|
-
|
72
|
-
|
78
|
+
vectors_hashes_in_bucket = @buckets[i][hash]
|
79
|
+
if vectors_hashes_in_bucket
|
80
|
+
vectors_hashes_in_bucket.each do |vector_hash|
|
81
|
+
results_hashes[vector_hash] = true
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
results_hashes.keys.map do |vector_hash|
|
86
|
+
{
|
87
|
+
:data => @vectors[vector_hash],
|
88
|
+
:hash => vector_hash,
|
89
|
+
:id => vector_hash_to_id(vector_hash)
|
90
|
+
}
|
73
91
|
end
|
74
|
-
results
|
75
92
|
end
|
76
93
|
|
77
94
|
end
|
@@ -28,10 +28,8 @@ module LSH
|
|
28
28
|
def initialize(params = { :redis => { :host => '127.0.0.1', :port => 6379 }, :data_dir => 'data' })
|
29
29
|
@redis = Redis.new(params[:redis])
|
30
30
|
@data_dir = params[:data_dir]
|
31
|
-
unless File.exists?(@data_dir)
|
32
|
-
|
33
|
-
Dir.mkdir(File.join(@data_dir, 'projections'))
|
34
|
-
end
|
31
|
+
Dir.mkdir(@data_dir) unless File.exists?(@data_dir)
|
32
|
+
Dir.mkdir(File.join(@data_dir, 'projections')) unless File.exists?(File.join(@data_dir, 'projections'))
|
35
33
|
end
|
36
34
|
|
37
35
|
def reset!
|
@@ -42,6 +40,10 @@ module LSH
|
|
42
40
|
def clear_data!
|
43
41
|
keys = @redis.keys("lsh:bucket:*")
|
44
42
|
@redis.del(keys) unless keys.empty?
|
43
|
+
keys = @redis.keys("lsh:vector_to_id:*")
|
44
|
+
@redis.del(keys) unless keys.empty?
|
45
|
+
keys = @redis.keys("lsh:id_to_vector:*")
|
46
|
+
@redis.del(keys) unless keys.empty?
|
45
47
|
delete_dat_files_in_dir(@data_dir)
|
46
48
|
end
|
47
49
|
|
@@ -107,8 +109,8 @@ module LSH
|
|
107
109
|
@redis.incr "lsh:buckets"
|
108
110
|
end
|
109
111
|
|
110
|
-
def save_vector(vector)
|
111
|
-
path = File.join(@data_dir,
|
112
|
+
def save_vector(vector, vector_hash)
|
113
|
+
path = File.join(@data_dir, vector_hash.to_s+'.dat')
|
112
114
|
vector.save(path) unless File.exists?(path)
|
113
115
|
end
|
114
116
|
|
@@ -118,19 +120,21 @@ module LSH
|
|
118
120
|
vector
|
119
121
|
end
|
120
122
|
|
121
|
-
def
|
122
|
-
save_vector(vector) # Writing vector to disk if not already there
|
123
|
-
@redis.sadd "#{bucket}:#{hash}", vector.hash.to_s # Only storing vector's hash in Redis
|
123
|
+
def add_vector(vector, vector_hash)
|
124
|
+
save_vector(vector, vector_hash) # Writing vector to disk if not already there
|
124
125
|
end
|
125
126
|
|
126
|
-
def
|
127
|
-
|
128
|
-
@redis.set "lsh:vector_to_id:#{vector.hash}", id
|
129
|
-
@redis.set "lsh:id_to_vector:#{id}", vector.hash.to_s
|
127
|
+
def add_vector_hash_to_bucket(bucket, hash, vector_hash)
|
128
|
+
@redis.sadd "#{bucket}:#{hash}", vector_hash.to_s # Only storing vector's hash in Redis
|
130
129
|
end
|
131
130
|
|
132
|
-
def
|
133
|
-
@redis.
|
131
|
+
def add_vector_id(vector_hash, id)
|
132
|
+
@redis.set "lsh:vector_to_id:#{vector_hash}", id
|
133
|
+
@redis.set "lsh:id_to_vector:#{id}", vector_hash.to_s
|
134
|
+
end
|
135
|
+
|
136
|
+
def vector_hash_to_id(vector_hash)
|
137
|
+
@redis.get "lsh:vector_to_id:#{vector_hash}"
|
134
138
|
end
|
135
139
|
|
136
140
|
def id_to_vector(id)
|
@@ -143,21 +147,23 @@ module LSH
|
|
143
147
|
end
|
144
148
|
|
145
149
|
def query_buckets(hashes)
|
146
|
-
|
150
|
+
results_hashes = {}
|
147
151
|
hashes.each_with_index do |hash, i|
|
148
152
|
bucket = find_bucket(i)
|
149
|
-
|
150
|
-
|
153
|
+
vector_hashes_in_bucket = @redis.smembers("#{bucket}:#{hash}")
|
154
|
+
if vector_hashes_in_bucket
|
155
|
+
vector_hashes_in_bucket.each do |vector_hash|
|
156
|
+
results_hashes[vector_hash] = true
|
157
|
+
end
|
158
|
+
end
|
151
159
|
end
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
results << vector
|
160
|
+
results_hashes.keys.map do |vector_hash|
|
161
|
+
{
|
162
|
+
:data => load_vector(vector_hash),
|
163
|
+
:hash => vector_hash.to_i,
|
164
|
+
:id => vector_hash_to_id(vector_hash)
|
165
|
+
}
|
159
166
|
end
|
160
|
-
results
|
161
167
|
end
|
162
168
|
|
163
169
|
end
|
data/lib/lsh/web.rb
CHANGED
@@ -24,13 +24,7 @@ module LSH
|
|
24
24
|
if mime_type == 'application/json'
|
25
25
|
t0 = Time.now
|
26
26
|
vector = JSON.parse(params[:data], :create_additions => true)
|
27
|
-
|
28
|
-
results = []
|
29
|
-
if params[:include] == 'id'
|
30
|
-
result_vectors.each { |v| results << { :id => index.vector_to_id(v), :data => v } }
|
31
|
-
else
|
32
|
-
result_vectors.each { |v| results << { :data => v } }
|
33
|
-
end
|
27
|
+
results = index.query(vector, params[:radius] || 0)
|
34
28
|
content_type :json
|
35
29
|
{ "time" => Time.now - t0, "results" => results }.to_json
|
36
30
|
else
|