lsh 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/lsh/index.rb +23 -19
- data/lib/lsh/storage/memory.rb +8 -2
- data/lib/lsh/storage/redis_backend.rb +8 -2
- metadata +1 -1
data/lib/lsh/index.rb
CHANGED
@@ -47,31 +47,35 @@ module LSH
|
|
47
47
|
end
|
48
48
|
|
49
49
|
def query(vector, multiprobe_radius = 0)
|
50
|
-
|
51
|
-
hashes
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
(
|
61
|
-
(0..(storage.parameters[:number_of_random_vectors] - 1)).to_a.combination(radius).each do |flips|
|
62
|
-
probe = hash.clone
|
63
|
-
flips.each { |d| probe[d] = (probe[d] == 1) ? 0 : 1 }
|
64
|
-
probe_hash = array_to_hash(probe)
|
65
|
-
probe_bucket_results = storage.query_bucket(bucket, probe_hash)
|
66
|
-
results += probe_bucket_results if probe_bucket_results
|
67
|
-
end
|
68
|
-
end
|
50
|
+
hash_arrays = hashes(vector)
|
51
|
+
hashes = hash_arrays.map { |a| array_to_hash(a) }
|
52
|
+
results = storage.query_buckets(hashes)
|
53
|
+
# Multiprobe LSH
|
54
|
+
# Take query hashes, move them around at radius r, and use them to do another query
|
55
|
+
# TODO: only works for binary LSH atm
|
56
|
+
if multiprobe_radius > 0
|
57
|
+
mp_arrays = multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
|
58
|
+
mp_arrays.each do |probes_arrays|
|
59
|
+
probes_hashes = probes_arrays.map { |a| array_to_hash(a) }
|
60
|
+
results += storage.query_buckets(probes_hashes)
|
69
61
|
end
|
70
62
|
end
|
71
63
|
results = MathUtil.uniq(results)
|
72
64
|
order_vectors_by_similarity(vector, results)
|
73
65
|
end
|
74
66
|
|
67
|
+
def multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
|
68
|
+
mp_arrays = []
|
69
|
+
(1..multiprobe_radius).to_a.each do |radius|
|
70
|
+
(0..(storage.parameters[:number_of_random_vectors] - 1)).to_a.combination(radius).each do |flips|
|
71
|
+
probes = Marshal.load(Marshal.dump(hash_arrays))
|
72
|
+
probes.each { |probe| flips.each { |d| probe[d] = (probe[d] == 1) ? 0 : 1 } }
|
73
|
+
mp_arrays << probes
|
74
|
+
end
|
75
|
+
end
|
76
|
+
mp_arrays
|
77
|
+
end
|
78
|
+
|
75
79
|
def order_vectors_by_similarity(vector, vectors)
|
76
80
|
vectors.map { |v| [ v, similarity(vector, v) ] } .sort_by { |v, sim| sim } .reverse .map { |vs| vs[0] }
|
77
81
|
end
|
data/lib/lsh/storage/memory.rb
CHANGED
@@ -49,8 +49,14 @@ module LSH
|
|
49
49
|
@buckets[i]
|
50
50
|
end
|
51
51
|
|
52
|
-
def
|
53
|
-
|
52
|
+
def query_buckets(hashes)
|
53
|
+
results = []
|
54
|
+
hashes.each_with_index do |hash, i|
|
55
|
+
bucket = find_bucket(i)
|
56
|
+
in_bucket = bucket[hash]
|
57
|
+
results += in_bucket if in_bucket
|
58
|
+
end
|
59
|
+
results
|
54
60
|
end
|
55
61
|
|
56
62
|
end
|
data/lib/lsh/storage/redis_backend.rb
CHANGED
@@ -111,9 +111,15 @@ module LSH
|
|
111
111
|
"bucket:#{i}" if @redis.get("buckets").to_i > i
|
112
112
|
end
|
113
113
|
|
114
|
-
def
|
114
|
+
def query_buckets(hashes)
|
115
|
+
vector_hashes = []
|
116
|
+
hashes.each_with_index do |hash, i|
|
117
|
+
bucket = find_bucket(i)
|
118
|
+
vector_hashes += @redis.smembers("#{bucket}:#{hash}")
|
119
|
+
end
|
120
|
+
vector_hashes.uniq!
|
115
121
|
results = []
|
116
|
-
|
122
|
+
vector_hashes.each do |vector_hash|
|
117
123
|
vector = MathUtil.zeros(parameters[:dim])
|
118
124
|
vector.load(File.join(@data_dir, vector_hash+'.dat'))
|
119
125
|
results << vector
|