lsh 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -39,20 +39,16 @@ module LSH
39
39
  end
40
40
 
41
41
  def add(vector, id = nil)
42
- vector_hash = vector.hash
43
- storage.add_vector(vector, vector_hash)
44
- storage.add_vector_id(vector_hash, id) if id
42
+ id ||= storage.generate_id
43
+ storage.add_vector(vector, id)
45
44
  hashes(vector).each_with_index do |hash, i|
46
45
  hash_i = array_to_hash(hash)
47
46
  bucket = storage.find_bucket(i)
48
- storage.add_vector_hash_to_bucket(bucket, hash_i, vector_hash)
47
+ storage.add_vector_id_to_bucket(bucket, hash_i, id)
49
48
  end
49
+ id
50
50
  end
51
-
52
- def vector_hash_to_id(vector_hash)
53
- storage.vector_hash_to_id(vector_hash)
54
- end
55
-
51
+
56
52
  def id_to_vector(id)
57
53
  storage.id_to_vector(id)
58
54
  end
@@ -71,7 +67,7 @@ module LSH
71
67
  probes_hashes = probes_arrays.map { |a| array_to_hash(a) }
72
68
  results += storage.query_buckets(probes_hashes)
73
69
  end
74
- results.uniq! { |result| result[:hash] }
70
+ results.uniq! { |result| result[:id] }
75
71
  end
76
72
  order_results_by_similarity(vector, results)
77
73
  end
@@ -83,7 +79,7 @@ module LSH
83
79
 
84
80
  def query_ids_by_vector(vector, multiprobe_radius = 0)
85
81
  results = query(vector, multiprobe_radius)
86
- results.map { |result| vector_hash_to_id(result[:hash]) }
82
+ results.map { |result| result[:id] }
87
83
  end
88
84
 
89
85
  def multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
@@ -99,10 +95,8 @@ module LSH
99
95
  end
100
96
 
101
97
  def order_results_by_similarity(vector, results)
102
- # Faster than vectors.sort - we precompute all similarities to vector
103
- # and order using those
104
- similarities = results.map { |result| [ result[:hash], result[:id], result[:data], similarity(vector, result[:data]) ] }
105
- similarities.sort_by { |hash, id, vector, sim| sim } .reverse .map { |vs| { :hash => vs[0], :id => vs[1], :data => vs[2] } }
98
+ vector_t = vector.transpose
99
+ results.sort_by { |result| similarity(result[:data], vector_t) } .reverse
106
100
  end
107
101
 
108
102
  def hashes(vector)
@@ -44,7 +44,7 @@ module LSH
44
44
  end
45
45
 
46
46
  def self.dot(v1, v2)
47
- (v1 * v2.transpose)[0,0]
47
+ (v1 * v2)[0,0]
48
48
  end
49
49
 
50
50
  def self.norm(v)
@@ -37,7 +37,7 @@ module LSH
37
37
  end
38
38
 
39
39
  def self.dot(v1, v2)
40
- (v1 * v2.t)[0,0]
40
+ (v1 * v2)[0,0]
41
41
  end
42
42
 
43
43
  def self.norm(v)
@@ -24,48 +24,42 @@ module LSH
24
24
  attr_accessor :parameters
25
25
  attr_reader :buckets
26
26
 
27
+ def initialize
28
+ reset!
29
+ end
30
+
27
31
  def has_index?
28
32
  projections and parameters and @buckets
29
33
  end
30
34
 
31
35
  def reset!
32
- @buckets = nil
33
- @vectors = nil
34
- @vector_hash_to_id = nil
35
- @id_to_vector = nil
36
+ @buckets = []
37
+ @vectors = {}
38
+ @next_id = 0
36
39
  end
37
40
 
38
41
  def create_new_bucket
39
- @buckets ||= []
40
42
  @buckets << {}
41
43
  end
42
44
 
43
- def add_vector(vector, vector_hash)
44
- @vectors ||= {}
45
- @vectors[vector_hash] = vector
45
+ def generate_id
46
+ @next_id += 1
47
+ end
48
+
49
+ def add_vector(vector, id)
50
+ @vectors[id] = vector
46
51
  end
47
52
 
48
- def add_vector_hash_to_bucket(bucket, hash, vector_hash)
53
+ def add_vector_id_to_bucket(bucket, hash, vector_id)
49
54
  if bucket.has_key? hash
50
- bucket[hash] << vector_hash
55
+ bucket[hash] << vector_id
51
56
  else
52
- bucket[hash] = [vector_hash]
57
+ bucket[hash] = [vector_id]
53
58
  end
54
59
  end
55
60
 
56
- def add_vector_id(vector_hash, id)
57
- @vector_hash_to_id ||= {}
58
- @vector_hash_to_id[vector_hash] = id
59
- @id_to_vector ||= {}
60
- @id_to_vector[id] = vector_hash
61
- end
62
-
63
- def vector_hash_to_id(vector_hash)
64
- @vector_hash_to_id[vector_hash] if @vector_hash_to_id
65
- end
66
-
67
61
  def id_to_vector(id)
68
- @vectors[@id_to_vector[id]] if @id_to_vector
62
+ @vectors[id]
69
63
  end
70
64
 
71
65
  def find_bucket(i)
@@ -73,20 +67,19 @@ module LSH
73
67
  end
74
68
 
75
69
  def query_buckets(hashes)
76
- results_hashes = {}
70
+ result_ids = {}
77
71
  hashes.each_with_index do |hash, i|
78
72
  vectors_hashes_in_bucket = @buckets[i][hash]
79
73
  if vectors_hashes_in_bucket
80
- vectors_hashes_in_bucket.each do |vector_hash|
81
- results_hashes[vector_hash] = true
74
+ vectors_hashes_in_bucket.each do |vector_id|
75
+ result_ids[vector_id] = true
82
76
  end
83
77
  end
84
78
  end
85
- results_hashes.keys.map do |vector_hash|
79
+ result_ids.keys.map do |vector_id|
86
80
  {
87
- :data => @vectors[vector_hash],
88
- :hash => vector_hash,
89
- :id => vector_hash_to_id(vector_hash)
81
+ :data => @vectors[vector_id],
82
+ :id => vector_id,
90
83
  }
91
84
  end
92
85
  end
@@ -24,12 +24,17 @@ module LSH
24
24
  class RedisBackend
25
25
 
26
26
  attr_reader :redis, :data_dir
27
+ attr_accessor :vector_cache, :cache_vectors
27
28
 
28
- def initialize(params = { :redis => { :host => '127.0.0.1', :port => 6379 }, :data_dir => 'data' })
29
+ def initialize(params = {})
30
+ defaults = {:redis => {}, :data_dir => "data", :cache_vectors => TRUE}
31
+ params = defaults.merge params
29
32
  @redis = Redis.new(params[:redis])
30
33
  @data_dir = params[:data_dir]
31
34
  Dir.mkdir(@data_dir) unless File.exists?(@data_dir)
32
35
  Dir.mkdir(File.join(@data_dir, 'projections')) unless File.exists?(File.join(@data_dir, 'projections'))
36
+ @cache_vectors = params[:cache_vectors]
37
+ @vector_cache = {}
33
38
  end
34
39
 
35
40
  def reset!
@@ -40,11 +45,9 @@ module LSH
40
45
  def clear_data!
41
46
  keys = @redis.keys("lsh:bucket:*")
42
47
  @redis.del(keys) unless keys.empty?
43
- keys = @redis.keys("lsh:vector_to_id:*")
44
- @redis.del(keys) unless keys.empty?
45
- keys = @redis.keys("lsh:id_to_vector:*")
46
- @redis.del(keys) unless keys.empty?
47
48
  delete_dat_files_in_dir(@data_dir)
49
+ @redis.set("lsh:max_vector_id", 0)
50
+ @vector_cache = {}
48
51
  end
49
52
 
50
53
  def clear_projections!
@@ -109,37 +112,36 @@ module LSH
109
112
  @redis.incr "lsh:buckets"
110
113
  end
111
114
 
112
- def save_vector(vector, vector_hash)
113
- path = File.join(@data_dir, vector_hash.to_s+'.dat')
114
- vector.save(path) unless File.exists?(path)
115
- end
116
-
117
- def load_vector(hash)
118
- vector = MathUtil.zeros(1, parameters[:dim])
119
- vector.load(File.join(@data_dir, hash+'.dat'))
120
- vector
115
+ def generate_id
116
+ (@redis.incr "lsh:max_vector_id").to_s
121
117
  end
122
118
 
123
- def add_vector(vector, vector_hash)
124
- save_vector(vector, vector_hash) # Writing vector to disk if not already there
119
+ def save_vector(vector, vector_id)
120
+ path = File.join(@data_dir, vector_id+'.dat')
121
+ raise "File #{path} already exists" if File.exists?(path)
122
+ vector.save(path)
123
+ @vector_cache[vector_id] = vector if @cache_vectors
125
124
  end
126
125
 
127
- def add_vector_hash_to_bucket(bucket, hash, vector_hash)
128
- @redis.sadd "#{bucket}:#{hash}", vector_hash.to_s # Only storing vector's hash in Redis
126
+ def load_vector(vector_id)
127
+ @vector_cache[vector_id] || (
128
+ vector = MathUtil.zeros(1, parameters[:dim])
129
+ vector.load(File.join(@data_dir, vector_id+'.dat'))
130
+ @vector_cache[vector_id] = vector if @cache_vectors
131
+ vector
132
+ )
129
133
  end
130
134
 
131
- def add_vector_id(vector_hash, id)
132
- @redis.set "lsh:vector_to_id:#{vector_hash}", id
133
- @redis.set "lsh:id_to_vector:#{id}", vector_hash.to_s
135
+ def add_vector(vector, vector_id)
136
+ save_vector(vector, vector_id) # Writing vector to disk if not already there
134
137
  end
135
138
 
136
- def vector_hash_to_id(vector_hash)
137
- @redis.get "lsh:vector_to_id:#{vector_hash}"
139
+ def add_vector_id_to_bucket(bucket, hash, vector_id)
140
+ @redis.sadd "#{bucket}:#{hash}", vector_id
138
141
  end
139
142
 
140
- def id_to_vector(id)
141
- vector_hash = @redis.get "lsh:id_to_vector:#{id}"
142
- load_vector(vector_hash)
143
+ def id_to_vector(vector_id)
144
+ load_vector(vector_id)
143
145
  end
144
146
 
145
147
  def find_bucket(i)
@@ -147,21 +149,16 @@ module LSH
147
149
  end
148
150
 
149
151
  def query_buckets(hashes)
150
- results_hashes = {}
151
- hashes.each_with_index do |hash, i|
152
+ keys = hashes.each_with_index.map do |hash, i|
152
153
  bucket = find_bucket(i)
153
- vector_hashes_in_bucket = @redis.smembers("#{bucket}:#{hash}")
154
- if vector_hashes_in_bucket
155
- vector_hashes_in_bucket.each do |vector_hash|
156
- results_hashes[vector_hash] = true
157
- end
158
- end
154
+ "#{bucket}:#{hash}"
159
155
  end
160
- results_hashes.keys.map do |vector_hash|
156
+ result_ids = @redis.sunion(keys)
157
+
158
+ result_ids.map do |vector_id|
161
159
  {
162
- :data => load_vector(vector_hash),
163
- :hash => vector_hash.to_i,
164
- :id => vector_hash_to_id(vector_hash)
160
+ :data => load_vector(vector_id),
161
+ :id => vector_id
165
162
  }
166
163
  end
167
164
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lsh
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -75,6 +75,70 @@ dependencies:
75
75
  - - ! '>='
76
76
  - !ruby/object:Gem::Version
77
77
  version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: mock_redis
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: rack-test
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ - !ruby/object:Gem::Dependency
111
+ name: rake
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ - !ruby/object:Gem::Dependency
127
+ name: mocha
128
+ requirement: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ type: :development
135
+ prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
78
142
  description: An implementation of LSH in Ruby, using JBLAS for JRuby and GSL for MRI
79
143
  email: yves.raimond@bbc.co.uk
80
144
  executables: []