lsh 0.5.0 → 0.6.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
@@ -39,20 +39,16 @@ module LSH
39
39
  end
40
40
 
41
41
  def add(vector, id = nil)
42
- vector_hash = vector.hash
43
- storage.add_vector(vector, vector_hash)
44
- storage.add_vector_id(vector_hash, id) if id
42
+ id ||= storage.generate_id
43
+ storage.add_vector(vector, id)
45
44
  hashes(vector).each_with_index do |hash, i|
46
45
  hash_i = array_to_hash(hash)
47
46
  bucket = storage.find_bucket(i)
48
- storage.add_vector_hash_to_bucket(bucket, hash_i, vector_hash)
47
+ storage.add_vector_id_to_bucket(bucket, hash_i, id)
49
48
  end
49
+ id
50
50
  end
51
-
52
- def vector_hash_to_id(vector_hash)
53
- storage.vector_hash_to_id(vector_hash)
54
- end
55
-
51
+
56
52
  def id_to_vector(id)
57
53
  storage.id_to_vector(id)
58
54
  end
@@ -71,7 +67,7 @@ module LSH
71
67
  probes_hashes = probes_arrays.map { |a| array_to_hash(a) }
72
68
  results += storage.query_buckets(probes_hashes)
73
69
  end
74
- results.uniq! { |result| result[:hash] }
70
+ results.uniq! { |result| result[:id] }
75
71
  end
76
72
  order_results_by_similarity(vector, results)
77
73
  end
@@ -83,7 +79,7 @@ module LSH
83
79
 
84
80
  def query_ids_by_vector(vector, multiprobe_radius = 0)
85
81
  results = query(vector, multiprobe_radius)
86
- results.map { |result| vector_hash_to_id(result[:hash]) }
82
+ results.map { |result| result[:id] }
87
83
  end
88
84
 
89
85
  def multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
@@ -99,10 +95,8 @@ module LSH
99
95
  end
100
96
 
101
97
  def order_results_by_similarity(vector, results)
102
- # Faster than vectors.sort - we precompute all similarities to vector
103
- # and order using those
104
- similarities = results.map { |result| [ result[:hash], result[:id], result[:data], similarity(vector, result[:data]) ] }
105
- similarities.sort_by { |hash, id, vector, sim| sim } .reverse .map { |vs| { :hash => vs[0], :id => vs[1], :data => vs[2] } }
98
+ vector_t = vector.transpose
99
+ results.sort_by { |result| similarity(result[:data], vector_t) } .reverse
106
100
  end
107
101
 
108
102
  def hashes(vector)
@@ -44,7 +44,7 @@ module LSH
44
44
  end
45
45
 
46
46
  def self.dot(v1, v2)
47
- (v1 * v2.transpose)[0,0]
47
+ (v1 * v2)[0,0]
48
48
  end
49
49
 
50
50
  def self.norm(v)
@@ -37,7 +37,7 @@ module LSH
37
37
  end
38
38
 
39
39
  def self.dot(v1, v2)
40
- (v1 * v2.t)[0,0]
40
+ (v1 * v2)[0,0]
41
41
  end
42
42
 
43
43
  def self.norm(v)
@@ -24,48 +24,42 @@ module LSH
24
24
  attr_accessor :parameters
25
25
  attr_reader :buckets
26
26
 
27
+ def initialize
28
+ reset!
29
+ end
30
+
27
31
  def has_index?
28
32
  projections and parameters and @buckets
29
33
  end
30
34
 
31
35
  def reset!
32
- @buckets = nil
33
- @vectors = nil
34
- @vector_hash_to_id = nil
35
- @id_to_vector = nil
36
+ @buckets = []
37
+ @vectors = {}
38
+ @next_id = 0
36
39
  end
37
40
 
38
41
  def create_new_bucket
39
- @buckets ||= []
40
42
  @buckets << {}
41
43
  end
42
44
 
43
- def add_vector(vector, vector_hash)
44
- @vectors ||= {}
45
- @vectors[vector_hash] = vector
45
+ def generate_id
46
+ @next_id += 1
47
+ end
48
+
49
+ def add_vector(vector, id)
50
+ @vectors[id] = vector
46
51
  end
47
52
 
48
- def add_vector_hash_to_bucket(bucket, hash, vector_hash)
53
+ def add_vector_id_to_bucket(bucket, hash, vector_id)
49
54
  if bucket.has_key? hash
50
- bucket[hash] << vector_hash
55
+ bucket[hash] << vector_id
51
56
  else
52
- bucket[hash] = [vector_hash]
57
+ bucket[hash] = [vector_id]
53
58
  end
54
59
  end
55
60
 
56
- def add_vector_id(vector_hash, id)
57
- @vector_hash_to_id ||= {}
58
- @vector_hash_to_id[vector_hash] = id
59
- @id_to_vector ||= {}
60
- @id_to_vector[id] = vector_hash
61
- end
62
-
63
- def vector_hash_to_id(vector_hash)
64
- @vector_hash_to_id[vector_hash] if @vector_hash_to_id
65
- end
66
-
67
61
  def id_to_vector(id)
68
- @vectors[@id_to_vector[id]] if @id_to_vector
62
+ @vectors[id]
69
63
  end
70
64
 
71
65
  def find_bucket(i)
@@ -73,20 +67,19 @@ module LSH
73
67
  end
74
68
 
75
69
  def query_buckets(hashes)
76
- results_hashes = {}
70
+ result_ids = {}
77
71
  hashes.each_with_index do |hash, i|
78
72
  vectors_hashes_in_bucket = @buckets[i][hash]
79
73
  if vectors_hashes_in_bucket
80
- vectors_hashes_in_bucket.each do |vector_hash|
81
- results_hashes[vector_hash] = true
74
+ vectors_hashes_in_bucket.each do |vector_id|
75
+ result_ids[vector_id] = true
82
76
  end
83
77
  end
84
78
  end
85
- results_hashes.keys.map do |vector_hash|
79
+ result_ids.keys.map do |vector_id|
86
80
  {
87
- :data => @vectors[vector_hash],
88
- :hash => vector_hash,
89
- :id => vector_hash_to_id(vector_hash)
81
+ :data => @vectors[vector_id],
82
+ :id => vector_id,
90
83
  }
91
84
  end
92
85
  end
@@ -24,12 +24,17 @@ module LSH
24
24
  class RedisBackend
25
25
 
26
26
  attr_reader :redis, :data_dir
27
+ attr_accessor :vector_cache, :cache_vectors
27
28
 
28
- def initialize(params = { :redis => { :host => '127.0.0.1', :port => 6379 }, :data_dir => 'data' })
29
+ def initialize(params = {})
30
+ defaults = {:redis => {}, :data_dir => "data", :cache_vectors => TRUE}
31
+ params = defaults.merge params
29
32
  @redis = Redis.new(params[:redis])
30
33
  @data_dir = params[:data_dir]
31
34
  Dir.mkdir(@data_dir) unless File.exists?(@data_dir)
32
35
  Dir.mkdir(File.join(@data_dir, 'projections')) unless File.exists?(File.join(@data_dir, 'projections'))
36
+ @cache_vectors = params[:cache_vectors]
37
+ @vector_cache = {}
33
38
  end
34
39
 
35
40
  def reset!
@@ -40,11 +45,9 @@ module LSH
40
45
  def clear_data!
41
46
  keys = @redis.keys("lsh:bucket:*")
42
47
  @redis.del(keys) unless keys.empty?
43
- keys = @redis.keys("lsh:vector_to_id:*")
44
- @redis.del(keys) unless keys.empty?
45
- keys = @redis.keys("lsh:id_to_vector:*")
46
- @redis.del(keys) unless keys.empty?
47
48
  delete_dat_files_in_dir(@data_dir)
49
+ @redis.set("lsh:max_vector_id", 0)
50
+ @vector_cache = {}
48
51
  end
49
52
 
50
53
  def clear_projections!
@@ -109,37 +112,36 @@ module LSH
109
112
  @redis.incr "lsh:buckets"
110
113
  end
111
114
 
112
- def save_vector(vector, vector_hash)
113
- path = File.join(@data_dir, vector_hash.to_s+'.dat')
114
- vector.save(path) unless File.exists?(path)
115
- end
116
-
117
- def load_vector(hash)
118
- vector = MathUtil.zeros(1, parameters[:dim])
119
- vector.load(File.join(@data_dir, hash+'.dat'))
120
- vector
115
+ def generate_id
116
+ (@redis.incr "lsh:max_vector_id").to_s
121
117
  end
122
118
 
123
- def add_vector(vector, vector_hash)
124
- save_vector(vector, vector_hash) # Writing vector to disk if not already there
119
+ def save_vector(vector, vector_id)
120
+ path = File.join(@data_dir, vector_id+'.dat')
121
+ raise "File #{path} already exists" if File.exists?(path)
122
+ vector.save(path)
123
+ @vector_cache[vector_id] = vector if @cache_vectors
125
124
  end
126
125
 
127
- def add_vector_hash_to_bucket(bucket, hash, vector_hash)
128
- @redis.sadd "#{bucket}:#{hash}", vector_hash.to_s # Only storing vector's hash in Redis
126
+ def load_vector(vector_id)
127
+ @vector_cache[vector_id] || (
128
+ vector = MathUtil.zeros(1, parameters[:dim])
129
+ vector.load(File.join(@data_dir, vector_id+'.dat'))
130
+ @vector_cache[vector_id] = vector if @cache_vectors
131
+ vector
132
+ )
129
133
  end
130
134
 
131
- def add_vector_id(vector_hash, id)
132
- @redis.set "lsh:vector_to_id:#{vector_hash}", id
133
- @redis.set "lsh:id_to_vector:#{id}", vector_hash.to_s
135
+ def add_vector(vector, vector_id)
136
+ save_vector(vector, vector_id) # Writing vector to disk if not already there
134
137
  end
135
138
 
136
- def vector_hash_to_id(vector_hash)
137
- @redis.get "lsh:vector_to_id:#{vector_hash}"
139
+ def add_vector_id_to_bucket(bucket, hash, vector_id)
140
+ @redis.sadd "#{bucket}:#{hash}", vector_id
138
141
  end
139
142
 
140
- def id_to_vector(id)
141
- vector_hash = @redis.get "lsh:id_to_vector:#{id}"
142
- load_vector(vector_hash)
143
+ def id_to_vector(vector_id)
144
+ load_vector(vector_id)
143
145
  end
144
146
 
145
147
  def find_bucket(i)
@@ -147,21 +149,16 @@ module LSH
147
149
  end
148
150
 
149
151
  def query_buckets(hashes)
150
- results_hashes = {}
151
- hashes.each_with_index do |hash, i|
152
+ keys = hashes.each_with_index.map do |hash, i|
152
153
  bucket = find_bucket(i)
153
- vector_hashes_in_bucket = @redis.smembers("#{bucket}:#{hash}")
154
- if vector_hashes_in_bucket
155
- vector_hashes_in_bucket.each do |vector_hash|
156
- results_hashes[vector_hash] = true
157
- end
158
- end
154
+ "#{bucket}:#{hash}"
159
155
  end
160
- results_hashes.keys.map do |vector_hash|
156
+ result_ids = @redis.sunion(keys)
157
+
158
+ result_ids.map do |vector_id|
161
159
  {
162
- :data => load_vector(vector_hash),
163
- :hash => vector_hash.to_i,
164
- :id => vector_hash_to_id(vector_hash)
160
+ :data => load_vector(vector_id),
161
+ :id => vector_id
165
162
  }
166
163
  end
167
164
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lsh
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -75,6 +75,70 @@ dependencies:
75
75
  - - ! '>='
76
76
  - !ruby/object:Gem::Version
77
77
  version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: mock_redis
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: rack-test
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ - !ruby/object:Gem::Dependency
111
+ name: rake
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ - !ruby/object:Gem::Dependency
127
+ name: mocha
128
+ requirement: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ type: :development
135
+ prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
78
142
  description: An implementation of LSH in Ruby, using JBLAS for JRuby and GSL for MRI
79
143
  email: yves.raimond@bbc.co.uk
80
144
  executables: []