lsh 0.4.2-java → 0.5.0-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -39,16 +39,18 @@ module LSH
39
39
  end
40
40
 
41
41
  def add(vector, id = nil)
42
- storage.add_vector_id(vector, id) if id
42
+ vector_hash = vector.hash
43
+ storage.add_vector(vector, vector_hash)
44
+ storage.add_vector_id(vector_hash, id) if id
43
45
  hashes(vector).each_with_index do |hash, i|
44
46
  hash_i = array_to_hash(hash)
45
47
  bucket = storage.find_bucket(i)
46
- storage.add_vector_to_bucket(bucket, hash_i, vector)
48
+ storage.add_vector_hash_to_bucket(bucket, hash_i, vector_hash)
47
49
  end
48
50
  end
49
51
 
50
- def vector_to_id(vector)
51
- storage.vector_to_id(vector)
52
+ def vector_hash_to_id(vector_hash)
53
+ storage.vector_hash_to_id(vector_hash)
52
54
  end
53
55
 
54
56
  def id_to_vector(id)
@@ -63,14 +65,15 @@ module LSH
63
65
  # Take query hashes, move them around at radius r, and use them to do another query
64
66
  # TODO: only works for binary LSH atm
65
67
  if multiprobe_radius > 0
68
+ raise Exception.new("Non-zero multiprobe radius only implemented for binary LSH") unless hashes_are_binary?
66
69
  mp_arrays = multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
67
70
  mp_arrays.each do |probes_arrays|
68
71
  probes_hashes = probes_arrays.map { |a| array_to_hash(a) }
69
72
  results += storage.query_buckets(probes_hashes)
70
73
  end
74
+ results.uniq! { |result| result[:hash] }
71
75
  end
72
- results = MathUtil.uniq(results)
73
- order_vectors_by_similarity(vector, results)
76
+ order_results_by_similarity(vector, results)
74
77
  end
75
78
 
76
79
  def query_ids(id, multiprobe_radius = 0)
@@ -79,10 +82,8 @@ module LSH
79
82
  end
80
83
 
81
84
  def query_ids_by_vector(vector, multiprobe_radius = 0)
82
- vectors = query(vector, multiprobe_radius)
83
- results = []
84
- vectors.each { |v| results << vector_to_id(v) }
85
- results
85
+ results = query(vector, multiprobe_radius)
86
+ results.map { |result| vector_hash_to_id(result[:hash]) }
86
87
  end
87
88
 
88
89
  def multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
@@ -97,8 +98,11 @@ module LSH
97
98
  mp_arrays
98
99
  end
99
100
 
100
- def order_vectors_by_similarity(vector, vectors)
101
- vectors.map { |v| [ v, similarity(vector, v) ] } .sort_by { |v, sim| sim } .reverse .map { |vs| vs[0] }
101
+ def order_results_by_similarity(vector, results)
102
+ # Faster than vectors.sort - we precompute all similarities to vector
103
+ # and order using those
104
+ similarities = results.map { |result| [ result[:hash], result[:id], result[:data], similarity(vector, result[:data]) ] }
105
+ similarities.sort_by { |hash, id, vector, sim| sim } .reverse .map { |vs| { :hash => vs[0], :id => vs[1], :data => vs[2] } }
102
106
  end
103
107
 
104
108
  def hashes(vector)
@@ -108,7 +112,7 @@ module LSH
108
112
  end
109
113
  hashes
110
114
  end
111
-
115
+
112
116
  def hash(vector, projection, bias = true)
113
117
  hash = []
114
118
  dot_products = (projection * vector.transpose).column(0).to_a
@@ -128,6 +132,10 @@ module LSH
128
132
  hash
129
133
  end
130
134
 
135
+ def hashes_are_binary?
136
+ storage.parameters[:window] == Float::INFINITY
137
+ end
138
+
131
139
  def random_vector(dim)
132
140
  MathUtil.random_gaussian_matrix(1, dim)
133
141
  end
@@ -150,7 +158,7 @@ module LSH
150
158
  #value
151
159
  end
152
160
 
153
- def generate_projections(dim, k, l)
161
+ def generate_projections(dim, k, l)
154
162
  projections = []
155
163
  l.times do |i|
156
164
  projections << generate_projection(dim, k)
@@ -160,7 +168,7 @@ module LSH
160
168
 
161
169
  def generate_projection(dim, k)
162
170
  MathUtil.random_gaussian_matrix(k, dim)
163
- end
171
+ end
164
172
 
165
173
  def similarity(v1, v2)
166
174
  MathUtil.dot(v1, v2)
@@ -51,14 +51,6 @@ module LSH
51
51
  v.norm
52
52
  end
53
53
 
54
- def self.uniq(vs)
55
- # Can't use uniq as
56
- # [ v, JSON.parse(v.to_json, :create_additions => true) ].uniq.size == 2 with GSL
57
- results = []
58
- vs.each { |v| results << v unless results.member? v }
59
- results
60
- end
61
-
62
54
  end
63
55
 
64
56
  end
@@ -44,10 +44,6 @@ module LSH
44
44
  v.norm2
45
45
  end
46
46
 
47
- def self.uniq(vs)
48
- vs.uniq
49
- end
50
-
51
47
  end
52
48
 
53
49
  end
@@ -30,6 +30,9 @@ module LSH
30
30
 
31
31
  def reset!
32
32
  @buckets = nil
33
+ @vectors = nil
34
+ @vector_hash_to_id = nil
35
+ @id_to_vector = nil
33
36
  end
34
37
 
35
38
  def create_new_bucket
@@ -37,27 +40,32 @@ module LSH
37
40
  @buckets << {}
38
41
  end
39
42
 
40
- def add_vector_to_bucket(bucket, hash, vector)
43
+ def add_vector(vector, vector_hash)
44
+ @vectors ||= {}
45
+ @vectors[vector_hash] = vector
46
+ end
47
+
48
+ def add_vector_hash_to_bucket(bucket, hash, vector_hash)
41
49
  if bucket.has_key? hash
42
- bucket[hash] << vector
50
+ bucket[hash] << vector_hash
43
51
  else
44
- bucket[hash] = [vector]
52
+ bucket[hash] = [vector_hash]
45
53
  end
46
54
  end
47
55
 
48
- def add_vector_id(vector, id)
49
- @vector_to_id ||= {}
50
- @vector_to_id[vector.hash] = id
56
+ def add_vector_id(vector_hash, id)
57
+ @vector_hash_to_id ||= {}
58
+ @vector_hash_to_id[vector_hash] = id
51
59
  @id_to_vector ||= {}
52
- @id_to_vector[id] = vector
60
+ @id_to_vector[id] = vector_hash
53
61
  end
54
62
 
55
- def vector_to_id(vector)
56
- @vector_to_id[vector.hash] if @vector_to_id
63
+ def vector_hash_to_id(vector_hash)
64
+ @vector_hash_to_id[vector_hash] if @vector_hash_to_id
57
65
  end
58
66
 
59
67
  def id_to_vector(id)
60
- @id_to_vector[id] if @id_to_vector
68
+ @vectors[@id_to_vector[id]] if @id_to_vector
61
69
  end
62
70
 
63
71
  def find_bucket(i)
@@ -65,13 +73,22 @@ module LSH
65
73
  end
66
74
 
67
75
  def query_buckets(hashes)
68
- results = []
76
+ results_hashes = {}
69
77
  hashes.each_with_index do |hash, i|
70
- bucket = find_bucket(i)
71
- in_bucket = bucket[hash]
72
- results += in_bucket if in_bucket
78
+ vectors_hashes_in_bucket = @buckets[i][hash]
79
+ if vectors_hashes_in_bucket
80
+ vectors_hashes_in_bucket.each do |vector_hash|
81
+ results_hashes[vector_hash] = true
82
+ end
83
+ end
84
+ end
85
+ results_hashes.keys.map do |vector_hash|
86
+ {
87
+ :data => @vectors[vector_hash],
88
+ :hash => vector_hash,
89
+ :id => vector_hash_to_id(vector_hash)
90
+ }
73
91
  end
74
- results
75
92
  end
76
93
 
77
94
  end
@@ -28,10 +28,8 @@ module LSH
28
28
  def initialize(params = { :redis => { :host => '127.0.0.1', :port => 6379 }, :data_dir => 'data' })
29
29
  @redis = Redis.new(params[:redis])
30
30
  @data_dir = params[:data_dir]
31
- unless File.exists?(@data_dir)
32
- Dir.mkdir(@data_dir)
33
- Dir.mkdir(File.join(@data_dir, 'projections'))
34
- end
31
+ Dir.mkdir(@data_dir) unless File.exists?(@data_dir)
32
+ Dir.mkdir(File.join(@data_dir, 'projections')) unless File.exists?(File.join(@data_dir, 'projections'))
35
33
  end
36
34
 
37
35
  def reset!
@@ -42,6 +40,10 @@ module LSH
42
40
  def clear_data!
43
41
  keys = @redis.keys("lsh:bucket:*")
44
42
  @redis.del(keys) unless keys.empty?
43
+ keys = @redis.keys("lsh:vector_to_id:*")
44
+ @redis.del(keys) unless keys.empty?
45
+ keys = @redis.keys("lsh:id_to_vector:*")
46
+ @redis.del(keys) unless keys.empty?
45
47
  delete_dat_files_in_dir(@data_dir)
46
48
  end
47
49
 
@@ -107,8 +109,8 @@ module LSH
107
109
  @redis.incr "lsh:buckets"
108
110
  end
109
111
 
110
- def save_vector(vector)
111
- path = File.join(@data_dir, vector.hash.to_s+'.dat')
112
+ def save_vector(vector, vector_hash)
113
+ path = File.join(@data_dir, vector_hash.to_s+'.dat')
112
114
  vector.save(path) unless File.exists?(path)
113
115
  end
114
116
 
@@ -118,19 +120,21 @@ module LSH
118
120
  vector
119
121
  end
120
122
 
121
- def add_vector_to_bucket(bucket, hash, vector)
122
- save_vector(vector) # Writing vector to disk if not already there
123
- @redis.sadd "#{bucket}:#{hash}", vector.hash.to_s # Only storing vector's hash in Redis
123
+ def add_vector(vector, vector_hash)
124
+ save_vector(vector, vector_hash) # Writing vector to disk if not already there
124
125
  end
125
126
 
126
- def add_vector_id(vector, id)
127
- save_vector(vector) # Writing vector to disk if not already there
128
- @redis.set "lsh:vector_to_id:#{vector.hash}", id
129
- @redis.set "lsh:id_to_vector:#{id}", vector.hash.to_s
127
+ def add_vector_hash_to_bucket(bucket, hash, vector_hash)
128
+ @redis.sadd "#{bucket}:#{hash}", vector_hash.to_s # Only storing vector's hash in Redis
130
129
  end
131
130
 
132
- def vector_to_id(vector)
133
- @redis.get "lsh:vector_to_id:#{vector.hash}"
131
+ def add_vector_id(vector_hash, id)
132
+ @redis.set "lsh:vector_to_id:#{vector_hash}", id
133
+ @redis.set "lsh:id_to_vector:#{id}", vector_hash.to_s
134
+ end
135
+
136
+ def vector_hash_to_id(vector_hash)
137
+ @redis.get "lsh:vector_to_id:#{vector_hash}"
134
138
  end
135
139
 
136
140
  def id_to_vector(id)
@@ -143,21 +147,23 @@ module LSH
143
147
  end
144
148
 
145
149
  def query_buckets(hashes)
146
- vector_hashes = []
150
+ results_hashes = {}
147
151
  hashes.each_with_index do |hash, i|
148
152
  bucket = find_bucket(i)
149
- results = @redis.smembers("#{bucket}:#{hash}")
150
- vector_hashes += results if results
153
+ vector_hashes_in_bucket = @redis.smembers("#{bucket}:#{hash}")
154
+ if vector_hashes_in_bucket
155
+ vector_hashes_in_bucket.each do |vector_hash|
156
+ results_hashes[vector_hash] = true
157
+ end
158
+ end
151
159
  end
152
- # Making sure we don't load the same vectors twice if they match
153
- # in different random projections
154
- vector_hashes.uniq!
155
- results = []
156
- vector_hashes.each do |vector_hash|
157
- vector = load_vector(vector_hash)
158
- results << vector
160
+ results_hashes.keys.map do |vector_hash|
161
+ {
162
+ :data => load_vector(vector_hash),
163
+ :hash => vector_hash.to_i,
164
+ :id => vector_hash_to_id(vector_hash)
165
+ }
159
166
  end
160
- results
161
167
  end
162
168
 
163
169
  end
@@ -24,13 +24,7 @@ module LSH
24
24
  if mime_type == 'application/json'
25
25
  t0 = Time.now
26
26
  vector = JSON.parse(params[:data], :create_additions => true)
27
- result_vectors = index.query(vector, params[:radius] || 0)
28
- results = []
29
- if params[:include] == 'id'
30
- result_vectors.each { |v| results << { :id => index.vector_to_id(v), :data => v } }
31
- else
32
- result_vectors.each { |v| results << { :data => v } }
33
- end
27
+ results = index.query(vector, params[:radius] || 0)
34
28
  content_type :json
35
29
  { "time" => Time.now - t0, "results" => results }.to_json
36
30
  else
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lsh
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.5.0
5
5
  prerelease:
6
6
  platform: java
7
7
  authors: