lsh 0.5.0-java → 0.6.0-java

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
@@ -39,20 +39,16 @@ module LSH
39
39
  end
40
40
 
41
41
  def add(vector, id = nil)
42
- vector_hash = vector.hash
43
- storage.add_vector(vector, vector_hash)
44
- storage.add_vector_id(vector_hash, id) if id
42
+ id ||= storage.generate_id
43
+ storage.add_vector(vector, id)
45
44
  hashes(vector).each_with_index do |hash, i|
46
45
  hash_i = array_to_hash(hash)
47
46
  bucket = storage.find_bucket(i)
48
- storage.add_vector_hash_to_bucket(bucket, hash_i, vector_hash)
47
+ storage.add_vector_id_to_bucket(bucket, hash_i, id)
49
48
  end
49
+ id
50
50
  end
51
-
52
- def vector_hash_to_id(vector_hash)
53
- storage.vector_hash_to_id(vector_hash)
54
- end
55
-
51
+
56
52
  def id_to_vector(id)
57
53
  storage.id_to_vector(id)
58
54
  end
@@ -71,7 +67,7 @@ module LSH
71
67
  probes_hashes = probes_arrays.map { |a| array_to_hash(a) }
72
68
  results += storage.query_buckets(probes_hashes)
73
69
  end
74
- results.uniq! { |result| result[:hash] }
70
+ results.uniq! { |result| result[:id] }
75
71
  end
76
72
  order_results_by_similarity(vector, results)
77
73
  end
@@ -83,7 +79,7 @@ module LSH
83
79
 
84
80
  def query_ids_by_vector(vector, multiprobe_radius = 0)
85
81
  results = query(vector, multiprobe_radius)
86
- results.map { |result| vector_hash_to_id(result[:hash]) }
82
+ results.map { |result| result[:id] }
87
83
  end
88
84
 
89
85
  def multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
@@ -99,10 +95,8 @@ module LSH
99
95
  end
100
96
 
101
97
  def order_results_by_similarity(vector, results)
102
- # Faster than vectors.sort - we precompute all similarities to vector
103
- # and order using those
104
- similarities = results.map { |result| [ result[:hash], result[:id], result[:data], similarity(vector, result[:data]) ] }
105
- similarities.sort_by { |hash, id, vector, sim| sim } .reverse .map { |vs| { :hash => vs[0], :id => vs[1], :data => vs[2] } }
98
+ vector_t = vector.transpose
99
+ results.sort_by { |result| similarity(result[:data], vector_t) } .reverse
106
100
  end
107
101
 
108
102
  def hashes(vector)
@@ -44,7 +44,7 @@ module LSH
44
44
  end
45
45
 
46
46
  def self.dot(v1, v2)
47
- (v1 * v2.transpose)[0,0]
47
+ (v1 * v2)[0,0]
48
48
  end
49
49
 
50
50
  def self.norm(v)
@@ -37,7 +37,7 @@ module LSH
37
37
  end
38
38
 
39
39
  def self.dot(v1, v2)
40
- (v1 * v2.t)[0,0]
40
+ (v1 * v2)[0,0]
41
41
  end
42
42
 
43
43
  def self.norm(v)
@@ -24,48 +24,42 @@ module LSH
24
24
  attr_accessor :parameters
25
25
  attr_reader :buckets
26
26
 
27
+ def initialize
28
+ reset!
29
+ end
30
+
27
31
  def has_index?
28
32
  projections and parameters and @buckets
29
33
  end
30
34
 
31
35
  def reset!
32
- @buckets = nil
33
- @vectors = nil
34
- @vector_hash_to_id = nil
35
- @id_to_vector = nil
36
+ @buckets = []
37
+ @vectors = {}
38
+ @next_id = 0
36
39
  end
37
40
 
38
41
  def create_new_bucket
39
- @buckets ||= []
40
42
  @buckets << {}
41
43
  end
42
44
 
43
- def add_vector(vector, vector_hash)
44
- @vectors ||= {}
45
- @vectors[vector_hash] = vector
45
+ def generate_id
46
+ @next_id += 1
47
+ end
48
+
49
+ def add_vector(vector, id)
50
+ @vectors[id] = vector
46
51
  end
47
52
 
48
- def add_vector_hash_to_bucket(bucket, hash, vector_hash)
53
+ def add_vector_id_to_bucket(bucket, hash, vector_id)
49
54
  if bucket.has_key? hash
50
- bucket[hash] << vector_hash
55
+ bucket[hash] << vector_id
51
56
  else
52
- bucket[hash] = [vector_hash]
57
+ bucket[hash] = [vector_id]
53
58
  end
54
59
  end
55
60
 
56
- def add_vector_id(vector_hash, id)
57
- @vector_hash_to_id ||= {}
58
- @vector_hash_to_id[vector_hash] = id
59
- @id_to_vector ||= {}
60
- @id_to_vector[id] = vector_hash
61
- end
62
-
63
- def vector_hash_to_id(vector_hash)
64
- @vector_hash_to_id[vector_hash] if @vector_hash_to_id
65
- end
66
-
67
61
  def id_to_vector(id)
68
- @vectors[@id_to_vector[id]] if @id_to_vector
62
+ @vectors[id]
69
63
  end
70
64
 
71
65
  def find_bucket(i)
@@ -73,20 +67,19 @@ module LSH
73
67
  end
74
68
 
75
69
  def query_buckets(hashes)
76
- results_hashes = {}
70
+ result_ids = {}
77
71
  hashes.each_with_index do |hash, i|
78
72
  vectors_hashes_in_bucket = @buckets[i][hash]
79
73
  if vectors_hashes_in_bucket
80
- vectors_hashes_in_bucket.each do |vector_hash|
81
- results_hashes[vector_hash] = true
74
+ vectors_hashes_in_bucket.each do |vector_id|
75
+ result_ids[vector_id] = true
82
76
  end
83
77
  end
84
78
  end
85
- results_hashes.keys.map do |vector_hash|
79
+ result_ids.keys.map do |vector_id|
86
80
  {
87
- :data => @vectors[vector_hash],
88
- :hash => vector_hash,
89
- :id => vector_hash_to_id(vector_hash)
81
+ :data => @vectors[vector_id],
82
+ :id => vector_id,
90
83
  }
91
84
  end
92
85
  end
@@ -24,12 +24,17 @@ module LSH
24
24
  class RedisBackend
25
25
 
26
26
  attr_reader :redis, :data_dir
27
+ attr_accessor :vector_cache, :cache_vectors
27
28
 
28
- def initialize(params = { :redis => { :host => '127.0.0.1', :port => 6379 }, :data_dir => 'data' })
29
+ def initialize(params = {})
30
+ defaults = {:redis => {}, :data_dir => "data", :cache_vectors => TRUE}
31
+ params = defaults.merge params
29
32
  @redis = Redis.new(params[:redis])
30
33
  @data_dir = params[:data_dir]
31
34
  Dir.mkdir(@data_dir) unless File.exists?(@data_dir)
32
35
  Dir.mkdir(File.join(@data_dir, 'projections')) unless File.exists?(File.join(@data_dir, 'projections'))
36
+ @cache_vectors = params[:cache_vectors]
37
+ @vector_cache = {}
33
38
  end
34
39
 
35
40
  def reset!
@@ -40,11 +45,9 @@ module LSH
40
45
  def clear_data!
41
46
  keys = @redis.keys("lsh:bucket:*")
42
47
  @redis.del(keys) unless keys.empty?
43
- keys = @redis.keys("lsh:vector_to_id:*")
44
- @redis.del(keys) unless keys.empty?
45
- keys = @redis.keys("lsh:id_to_vector:*")
46
- @redis.del(keys) unless keys.empty?
47
48
  delete_dat_files_in_dir(@data_dir)
49
+ @redis.set("lsh:max_vector_id", 0)
50
+ @vector_cache = {}
48
51
  end
49
52
 
50
53
  def clear_projections!
@@ -109,37 +112,36 @@ module LSH
109
112
  @redis.incr "lsh:buckets"
110
113
  end
111
114
 
112
- def save_vector(vector, vector_hash)
113
- path = File.join(@data_dir, vector_hash.to_s+'.dat')
114
- vector.save(path) unless File.exists?(path)
115
- end
116
-
117
- def load_vector(hash)
118
- vector = MathUtil.zeros(1, parameters[:dim])
119
- vector.load(File.join(@data_dir, hash+'.dat'))
120
- vector
115
+ def generate_id
116
+ (@redis.incr "lsh:max_vector_id").to_s
121
117
  end
122
118
 
123
- def add_vector(vector, vector_hash)
124
- save_vector(vector, vector_hash) # Writing vector to disk if not already there
119
+ def save_vector(vector, vector_id)
120
+ path = File.join(@data_dir, vector_id+'.dat')
121
+ raise "File #{path} already exists" if File.exists?(path)
122
+ vector.save(path)
123
+ @vector_cache[vector_id] = vector if @cache_vectors
125
124
  end
126
125
 
127
- def add_vector_hash_to_bucket(bucket, hash, vector_hash)
128
- @redis.sadd "#{bucket}:#{hash}", vector_hash.to_s # Only storing vector's hash in Redis
126
+ def load_vector(vector_id)
127
+ @vector_cache[vector_id] || (
128
+ vector = MathUtil.zeros(1, parameters[:dim])
129
+ vector.load(File.join(@data_dir, vector_id+'.dat'))
130
+ @vector_cache[vector_id] = vector if @cache_vectors
131
+ vector
132
+ )
129
133
  end
130
134
 
131
- def add_vector_id(vector_hash, id)
132
- @redis.set "lsh:vector_to_id:#{vector_hash}", id
133
- @redis.set "lsh:id_to_vector:#{id}", vector_hash.to_s
135
+ def add_vector(vector, vector_id)
136
+ save_vector(vector, vector_id) # Writing vector to disk if not already there
134
137
  end
135
138
 
136
- def vector_hash_to_id(vector_hash)
137
- @redis.get "lsh:vector_to_id:#{vector_hash}"
139
+ def add_vector_id_to_bucket(bucket, hash, vector_id)
140
+ @redis.sadd "#{bucket}:#{hash}", vector_id
138
141
  end
139
142
 
140
- def id_to_vector(id)
141
- vector_hash = @redis.get "lsh:id_to_vector:#{id}"
142
- load_vector(vector_hash)
143
+ def id_to_vector(vector_id)
144
+ load_vector(vector_id)
143
145
  end
144
146
 
145
147
  def find_bucket(i)
@@ -147,21 +149,16 @@ module LSH
147
149
  end
148
150
 
149
151
  def query_buckets(hashes)
150
- results_hashes = {}
151
- hashes.each_with_index do |hash, i|
152
+ keys = hashes.each_with_index.map do |hash, i|
152
153
  bucket = find_bucket(i)
153
- vector_hashes_in_bucket = @redis.smembers("#{bucket}:#{hash}")
154
- if vector_hashes_in_bucket
155
- vector_hashes_in_bucket.each do |vector_hash|
156
- results_hashes[vector_hash] = true
157
- end
158
- end
154
+ "#{bucket}:#{hash}"
159
155
  end
160
- results_hashes.keys.map do |vector_hash|
156
+ result_ids = @redis.sunion(keys)
157
+
158
+ result_ids.map do |vector_id|
161
159
  {
162
- :data => load_vector(vector_hash),
163
- :hash => vector_hash.to_i,
164
- :id => vector_hash_to_id(vector_hash)
160
+ :data => load_vector(vector_id),
161
+ :id => vector_id
165
162
  }
166
163
  end
167
164
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lsh
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  prerelease:
6
6
  platform: java
7
7
  authors:
@@ -83,6 +83,78 @@ dependencies:
83
83
  none: false
84
84
  prerelease: false
85
85
  type: :runtime
86
+ - !ruby/object:Gem::Dependency
87
+ name: mock_redis
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: !binary |-
93
+ MA==
94
+ none: false
95
+ requirement: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: !binary |-
100
+ MA==
101
+ none: false
102
+ prerelease: false
103
+ type: :development
104
+ - !ruby/object:Gem::Dependency
105
+ name: rack-test
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: !binary |-
111
+ MA==
112
+ none: false
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: !binary |-
118
+ MA==
119
+ none: false
120
+ prerelease: false
121
+ type: :development
122
+ - !ruby/object:Gem::Dependency
123
+ name: rake
124
+ version_requirements: !ruby/object:Gem::Requirement
125
+ requirements:
126
+ - - ">="
127
+ - !ruby/object:Gem::Version
128
+ version: !binary |-
129
+ MA==
130
+ none: false
131
+ requirement: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: !binary |-
136
+ MA==
137
+ none: false
138
+ prerelease: false
139
+ type: :development
140
+ - !ruby/object:Gem::Dependency
141
+ name: mocha
142
+ version_requirements: !ruby/object:Gem::Requirement
143
+ requirements:
144
+ - - ">="
145
+ - !ruby/object:Gem::Version
146
+ version: !binary |-
147
+ MA==
148
+ none: false
149
+ requirement: !ruby/object:Gem::Requirement
150
+ requirements:
151
+ - - ">="
152
+ - !ruby/object:Gem::Version
153
+ version: !binary |-
154
+ MA==
155
+ none: false
156
+ prerelease: false
157
+ type: :development
86
158
  description: An implementation of LSH in Ruby, using JBLAS for JRuby and GSL for MRI
87
159
  email: yves.raimond@bbc.co.uk
88
160
  executables: []