lsh 0.4.2-java → 0.5.0-java

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
@@ -39,16 +39,18 @@ module LSH
39
39
  end
40
40
 
41
41
  def add(vector, id = nil)
42
- storage.add_vector_id(vector, id) if id
42
+ vector_hash = vector.hash
43
+ storage.add_vector(vector, vector_hash)
44
+ storage.add_vector_id(vector_hash, id) if id
43
45
  hashes(vector).each_with_index do |hash, i|
44
46
  hash_i = array_to_hash(hash)
45
47
  bucket = storage.find_bucket(i)
46
- storage.add_vector_to_bucket(bucket, hash_i, vector)
48
+ storage.add_vector_hash_to_bucket(bucket, hash_i, vector_hash)
47
49
  end
48
50
  end
49
51
 
50
- def vector_to_id(vector)
51
- storage.vector_to_id(vector)
52
+ def vector_hash_to_id(vector_hash)
53
+ storage.vector_hash_to_id(vector_hash)
52
54
  end
53
55
 
54
56
  def id_to_vector(id)
@@ -63,14 +65,15 @@ module LSH
63
65
  # Take query hashes, move them around at radius r, and use them to do another query
64
66
  # TODO: only works for binary LSH atm
65
67
  if multiprobe_radius > 0
68
+ raise Exception.new("Non-zero multiprobe radius only implemented for binary LSH") unless hashes_are_binary?
66
69
  mp_arrays = multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
67
70
  mp_arrays.each do |probes_arrays|
68
71
  probes_hashes = probes_arrays.map { |a| array_to_hash(a) }
69
72
  results += storage.query_buckets(probes_hashes)
70
73
  end
74
+ results.uniq! { |result| result[:hash] }
71
75
  end
72
- results = MathUtil.uniq(results)
73
- order_vectors_by_similarity(vector, results)
76
+ order_results_by_similarity(vector, results)
74
77
  end
75
78
 
76
79
  def query_ids(id, multiprobe_radius = 0)
@@ -79,10 +82,8 @@ module LSH
79
82
  end
80
83
 
81
84
  def query_ids_by_vector(vector, multiprobe_radius = 0)
82
- vectors = query(vector, multiprobe_radius)
83
- results = []
84
- vectors.each { |v| results << vector_to_id(v) }
85
- results
85
+ results = query(vector, multiprobe_radius)
86
+ results.map { |result| vector_hash_to_id(result[:hash]) }
86
87
  end
87
88
 
88
89
  def multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
@@ -97,8 +98,11 @@ module LSH
97
98
  mp_arrays
98
99
  end
99
100
 
100
- def order_vectors_by_similarity(vector, vectors)
101
- vectors.map { |v| [ v, similarity(vector, v) ] } .sort_by { |v, sim| sim } .reverse .map { |vs| vs[0] }
101
+ def order_results_by_similarity(vector, results)
102
+ # Faster than vectors.sort - we precompute all similarities to vector
103
+ # and order using those
104
+ similarities = results.map { |result| [ result[:hash], result[:id], result[:data], similarity(vector, result[:data]) ] }
105
+ similarities.sort_by { |hash, id, vector, sim| sim } .reverse .map { |vs| { :hash => vs[0], :id => vs[1], :data => vs[2] } }
102
106
  end
103
107
 
104
108
  def hashes(vector)
@@ -108,7 +112,7 @@ module LSH
108
112
  end
109
113
  hashes
110
114
  end
111
-
115
+
112
116
  def hash(vector, projection, bias = true)
113
117
  hash = []
114
118
  dot_products = (projection * vector.transpose).column(0).to_a
@@ -128,6 +132,10 @@ module LSH
128
132
  hash
129
133
  end
130
134
 
135
+ def hashes_are_binary?
136
+ storage.parameters[:window] == Float::INFINITY
137
+ end
138
+
131
139
  def random_vector(dim)
132
140
  MathUtil.random_gaussian_matrix(1, dim)
133
141
  end
@@ -150,7 +158,7 @@ module LSH
150
158
  #value
151
159
  end
152
160
 
153
- def generate_projections(dim, k, l)
161
+ def generate_projections(dim, k, l)
154
162
  projections = []
155
163
  l.times do |i|
156
164
  projections << generate_projection(dim, k)
@@ -160,7 +168,7 @@ module LSH
160
168
 
161
169
  def generate_projection(dim, k)
162
170
  MathUtil.random_gaussian_matrix(k, dim)
163
- end
171
+ end
164
172
 
165
173
  def similarity(v1, v2)
166
174
  MathUtil.dot(v1, v2)
@@ -51,14 +51,6 @@ module LSH
51
51
  v.norm
52
52
  end
53
53
 
54
- def self.uniq(vs)
55
- # Can't use uniq as
56
- # [ v, JSON.parse(v.to_json, :create_additions => true) ].uniq.size == 2 with GSL
57
- results = []
58
- vs.each { |v| results << v unless results.member? v }
59
- results
60
- end
61
-
62
54
  end
63
55
 
64
56
  end
@@ -44,10 +44,6 @@ module LSH
44
44
  v.norm2
45
45
  end
46
46
 
47
- def self.uniq(vs)
48
- vs.uniq
49
- end
50
-
51
47
  end
52
48
 
53
49
  end
@@ -30,6 +30,9 @@ module LSH
30
30
 
31
31
  def reset!
32
32
  @buckets = nil
33
+ @vectors = nil
34
+ @vector_hash_to_id = nil
35
+ @id_to_vector = nil
33
36
  end
34
37
 
35
38
  def create_new_bucket
@@ -37,27 +40,32 @@ module LSH
37
40
  @buckets << {}
38
41
  end
39
42
 
40
- def add_vector_to_bucket(bucket, hash, vector)
43
+ def add_vector(vector, vector_hash)
44
+ @vectors ||= {}
45
+ @vectors[vector_hash] = vector
46
+ end
47
+
48
+ def add_vector_hash_to_bucket(bucket, hash, vector_hash)
41
49
  if bucket.has_key? hash
42
- bucket[hash] << vector
50
+ bucket[hash] << vector_hash
43
51
  else
44
- bucket[hash] = [vector]
52
+ bucket[hash] = [vector_hash]
45
53
  end
46
54
  end
47
55
 
48
- def add_vector_id(vector, id)
49
- @vector_to_id ||= {}
50
- @vector_to_id[vector.hash] = id
56
+ def add_vector_id(vector_hash, id)
57
+ @vector_hash_to_id ||= {}
58
+ @vector_hash_to_id[vector_hash] = id
51
59
  @id_to_vector ||= {}
52
- @id_to_vector[id] = vector
60
+ @id_to_vector[id] = vector_hash
53
61
  end
54
62
 
55
- def vector_to_id(vector)
56
- @vector_to_id[vector.hash] if @vector_to_id
63
+ def vector_hash_to_id(vector_hash)
64
+ @vector_hash_to_id[vector_hash] if @vector_hash_to_id
57
65
  end
58
66
 
59
67
  def id_to_vector(id)
60
- @id_to_vector[id] if @id_to_vector
68
+ @vectors[@id_to_vector[id]] if @id_to_vector
61
69
  end
62
70
 
63
71
  def find_bucket(i)
@@ -65,13 +73,22 @@ module LSH
65
73
  end
66
74
 
67
75
  def query_buckets(hashes)
68
- results = []
76
+ results_hashes = {}
69
77
  hashes.each_with_index do |hash, i|
70
- bucket = find_bucket(i)
71
- in_bucket = bucket[hash]
72
- results += in_bucket if in_bucket
78
+ vectors_hashes_in_bucket = @buckets[i][hash]
79
+ if vectors_hashes_in_bucket
80
+ vectors_hashes_in_bucket.each do |vector_hash|
81
+ results_hashes[vector_hash] = true
82
+ end
83
+ end
84
+ end
85
+ results_hashes.keys.map do |vector_hash|
86
+ {
87
+ :data => @vectors[vector_hash],
88
+ :hash => vector_hash,
89
+ :id => vector_hash_to_id(vector_hash)
90
+ }
73
91
  end
74
- results
75
92
  end
76
93
 
77
94
  end
@@ -28,10 +28,8 @@ module LSH
28
28
  def initialize(params = { :redis => { :host => '127.0.0.1', :port => 6379 }, :data_dir => 'data' })
29
29
  @redis = Redis.new(params[:redis])
30
30
  @data_dir = params[:data_dir]
31
- unless File.exists?(@data_dir)
32
- Dir.mkdir(@data_dir)
33
- Dir.mkdir(File.join(@data_dir, 'projections'))
34
- end
31
+ Dir.mkdir(@data_dir) unless File.exists?(@data_dir)
32
+ Dir.mkdir(File.join(@data_dir, 'projections')) unless File.exists?(File.join(@data_dir, 'projections'))
35
33
  end
36
34
 
37
35
  def reset!
@@ -42,6 +40,10 @@ module LSH
42
40
  def clear_data!
43
41
  keys = @redis.keys("lsh:bucket:*")
44
42
  @redis.del(keys) unless keys.empty?
43
+ keys = @redis.keys("lsh:vector_to_id:*")
44
+ @redis.del(keys) unless keys.empty?
45
+ keys = @redis.keys("lsh:id_to_vector:*")
46
+ @redis.del(keys) unless keys.empty?
45
47
  delete_dat_files_in_dir(@data_dir)
46
48
  end
47
49
 
@@ -107,8 +109,8 @@ module LSH
107
109
  @redis.incr "lsh:buckets"
108
110
  end
109
111
 
110
- def save_vector(vector)
111
- path = File.join(@data_dir, vector.hash.to_s+'.dat')
112
+ def save_vector(vector, vector_hash)
113
+ path = File.join(@data_dir, vector_hash.to_s+'.dat')
112
114
  vector.save(path) unless File.exists?(path)
113
115
  end
114
116
 
@@ -118,19 +120,21 @@ module LSH
118
120
  vector
119
121
  end
120
122
 
121
- def add_vector_to_bucket(bucket, hash, vector)
122
- save_vector(vector) # Writing vector to disk if not already there
123
- @redis.sadd "#{bucket}:#{hash}", vector.hash.to_s # Only storing vector's hash in Redis
123
+ def add_vector(vector, vector_hash)
124
+ save_vector(vector, vector_hash) # Writing vector to disk if not already there
124
125
  end
125
126
 
126
- def add_vector_id(vector, id)
127
- save_vector(vector) # Writing vector to disk if not already there
128
- @redis.set "lsh:vector_to_id:#{vector.hash}", id
129
- @redis.set "lsh:id_to_vector:#{id}", vector.hash.to_s
127
+ def add_vector_hash_to_bucket(bucket, hash, vector_hash)
128
+ @redis.sadd "#{bucket}:#{hash}", vector_hash.to_s # Only storing vector's hash in Redis
130
129
  end
131
130
 
132
- def vector_to_id(vector)
133
- @redis.get "lsh:vector_to_id:#{vector.hash}"
131
+ def add_vector_id(vector_hash, id)
132
+ @redis.set "lsh:vector_to_id:#{vector_hash}", id
133
+ @redis.set "lsh:id_to_vector:#{id}", vector_hash.to_s
134
+ end
135
+
136
+ def vector_hash_to_id(vector_hash)
137
+ @redis.get "lsh:vector_to_id:#{vector_hash}"
134
138
  end
135
139
 
136
140
  def id_to_vector(id)
@@ -143,21 +147,23 @@ module LSH
143
147
  end
144
148
 
145
149
  def query_buckets(hashes)
146
- vector_hashes = []
150
+ results_hashes = {}
147
151
  hashes.each_with_index do |hash, i|
148
152
  bucket = find_bucket(i)
149
- results = @redis.smembers("#{bucket}:#{hash}")
150
- vector_hashes += results if results
153
+ vector_hashes_in_bucket = @redis.smembers("#{bucket}:#{hash}")
154
+ if vector_hashes_in_bucket
155
+ vector_hashes_in_bucket.each do |vector_hash|
156
+ results_hashes[vector_hash] = true
157
+ end
158
+ end
151
159
  end
152
- # Making sure we don't load the same vectors twice if they match
153
- # in different random projections
154
- vector_hashes.uniq!
155
- results = []
156
- vector_hashes.each do |vector_hash|
157
- vector = load_vector(vector_hash)
158
- results << vector
160
+ results_hashes.keys.map do |vector_hash|
161
+ {
162
+ :data => load_vector(vector_hash),
163
+ :hash => vector_hash.to_i,
164
+ :id => vector_hash_to_id(vector_hash)
165
+ }
159
166
  end
160
- results
161
167
  end
162
168
 
163
169
  end
@@ -24,13 +24,7 @@ module LSH
24
24
  if mime_type == 'application/json'
25
25
  t0 = Time.now
26
26
  vector = JSON.parse(params[:data], :create_additions => true)
27
- result_vectors = index.query(vector, params[:radius] || 0)
28
- results = []
29
- if params[:include] == 'id'
30
- result_vectors.each { |v| results << { :id => index.vector_to_id(v), :data => v } }
31
- else
32
- result_vectors.each { |v| results << { :data => v } }
33
- end
27
+ results = index.query(vector, params[:radius] || 0)
34
28
  content_type :json
35
29
  { "time" => Time.now - t0, "results" => results }.to_json
36
30
  else
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lsh
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.5.0
5
5
  prerelease:
6
6
  platform: java
7
7
  authors: