lsh 0.5.0-java → 0.6.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/lsh/index.rb +9 -15
- data/lib/lsh/math_util_gsl.rb +1 -1
- data/lib/lsh/math_util_jblas.rb +1 -1
- data/lib/lsh/storage/memory.rb +23 -30
- data/lib/lsh/storage/redis_backend.rb +35 -38
- metadata +73 -1
data/lib/lsh/index.rb
CHANGED
@@ -39,20 +39,16 @@ module LSH
|
|
39
39
|
end
|
40
40
|
|
41
41
|
def add(vector, id = nil)
|
42
|
-
|
43
|
-
storage.add_vector(vector,
|
44
|
-
storage.add_vector_id(vector_hash, id) if id
|
42
|
+
id ||= storage.generate_id
|
43
|
+
storage.add_vector(vector, id)
|
45
44
|
hashes(vector).each_with_index do |hash, i|
|
46
45
|
hash_i = array_to_hash(hash)
|
47
46
|
bucket = storage.find_bucket(i)
|
48
|
-
storage.
|
47
|
+
storage.add_vector_id_to_bucket(bucket, hash_i, id)
|
49
48
|
end
|
49
|
+
id
|
50
50
|
end
|
51
|
-
|
52
|
-
def vector_hash_to_id(vector_hash)
|
53
|
-
storage.vector_hash_to_id(vector_hash)
|
54
|
-
end
|
55
|
-
|
51
|
+
|
56
52
|
def id_to_vector(id)
|
57
53
|
storage.id_to_vector(id)
|
58
54
|
end
|
@@ -71,7 +67,7 @@ module LSH
|
|
71
67
|
probes_hashes = probes_arrays.map { |a| array_to_hash(a) }
|
72
68
|
results += storage.query_buckets(probes_hashes)
|
73
69
|
end
|
74
|
-
results.uniq! { |result| result[:
|
70
|
+
results.uniq! { |result| result[:id] }
|
75
71
|
end
|
76
72
|
order_results_by_similarity(vector, results)
|
77
73
|
end
|
@@ -83,7 +79,7 @@ module LSH
|
|
83
79
|
|
84
80
|
def query_ids_by_vector(vector, multiprobe_radius = 0)
|
85
81
|
results = query(vector, multiprobe_radius)
|
86
|
-
results.map { |result|
|
82
|
+
results.map { |result| result[:id] }
|
87
83
|
end
|
88
84
|
|
89
85
|
def multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
|
@@ -99,10 +95,8 @@ module LSH
|
|
99
95
|
end
|
100
96
|
|
101
97
|
def order_results_by_similarity(vector, results)
|
102
|
-
|
103
|
-
|
104
|
-
similarities = results.map { |result| [ result[:hash], result[:id], result[:data], similarity(vector, result[:data]) ] }
|
105
|
-
similarities.sort_by { |hash, id, vector, sim| sim } .reverse .map { |vs| { :hash => vs[0], :id => vs[1], :data => vs[2] } }
|
98
|
+
vector_t = vector.transpose
|
99
|
+
results.sort_by { |result| similarity(result[:data], vector_t) } .reverse
|
106
100
|
end
|
107
101
|
|
108
102
|
def hashes(vector)
|
data/lib/lsh/math_util_gsl.rb
CHANGED
data/lib/lsh/math_util_jblas.rb
CHANGED
data/lib/lsh/storage/memory.rb
CHANGED
@@ -24,48 +24,42 @@ module LSH
|
|
24
24
|
attr_accessor :parameters
|
25
25
|
attr_reader :buckets
|
26
26
|
|
27
|
+
def initialize
|
28
|
+
reset!
|
29
|
+
end
|
30
|
+
|
27
31
|
def has_index?
|
28
32
|
projections and parameters and @buckets
|
29
33
|
end
|
30
34
|
|
31
35
|
def reset!
|
32
|
-
@buckets =
|
33
|
-
@vectors =
|
34
|
-
@
|
35
|
-
@id_to_vector = nil
|
36
|
+
@buckets = []
|
37
|
+
@vectors = {}
|
38
|
+
@next_id = 0
|
36
39
|
end
|
37
40
|
|
38
41
|
def create_new_bucket
|
39
|
-
@buckets ||= []
|
40
42
|
@buckets << {}
|
41
43
|
end
|
42
44
|
|
43
|
-
def
|
44
|
-
@
|
45
|
-
|
45
|
+
def generate_id
|
46
|
+
@next_id += 1
|
47
|
+
end
|
48
|
+
|
49
|
+
def add_vector(vector, id)
|
50
|
+
@vectors[id] = vector
|
46
51
|
end
|
47
52
|
|
48
|
-
def
|
53
|
+
def add_vector_id_to_bucket(bucket, hash, vector_id)
|
49
54
|
if bucket.has_key? hash
|
50
|
-
bucket[hash] <<
|
55
|
+
bucket[hash] << vector_id
|
51
56
|
else
|
52
|
-
bucket[hash] = [
|
57
|
+
bucket[hash] = [vector_id]
|
53
58
|
end
|
54
59
|
end
|
55
60
|
|
56
|
-
def add_vector_id(vector_hash, id)
|
57
|
-
@vector_hash_to_id ||= {}
|
58
|
-
@vector_hash_to_id[vector_hash] = id
|
59
|
-
@id_to_vector ||= {}
|
60
|
-
@id_to_vector[id] = vector_hash
|
61
|
-
end
|
62
|
-
|
63
|
-
def vector_hash_to_id(vector_hash)
|
64
|
-
@vector_hash_to_id[vector_hash] if @vector_hash_to_id
|
65
|
-
end
|
66
|
-
|
67
61
|
def id_to_vector(id)
|
68
|
-
@vectors[
|
62
|
+
@vectors[id]
|
69
63
|
end
|
70
64
|
|
71
65
|
def find_bucket(i)
|
@@ -73,20 +67,19 @@ module LSH
|
|
73
67
|
end
|
74
68
|
|
75
69
|
def query_buckets(hashes)
|
76
|
-
|
70
|
+
result_ids = {}
|
77
71
|
hashes.each_with_index do |hash, i|
|
78
72
|
vectors_hashes_in_bucket = @buckets[i][hash]
|
79
73
|
if vectors_hashes_in_bucket
|
80
|
-
vectors_hashes_in_bucket.each do |
|
81
|
-
|
74
|
+
vectors_hashes_in_bucket.each do |vector_id|
|
75
|
+
result_ids[vector_id] = true
|
82
76
|
end
|
83
77
|
end
|
84
78
|
end
|
85
|
-
|
79
|
+
result_ids.keys.map do |vector_id|
|
86
80
|
{
|
87
|
-
:data => @vectors[
|
88
|
-
:
|
89
|
-
:id => vector_hash_to_id(vector_hash)
|
81
|
+
:data => @vectors[vector_id],
|
82
|
+
:id => vector_id,
|
90
83
|
}
|
91
84
|
end
|
92
85
|
end
|
@@ -24,12 +24,17 @@ module LSH
|
|
24
24
|
class RedisBackend
|
25
25
|
|
26
26
|
attr_reader :redis, :data_dir
|
27
|
+
attr_accessor :vector_cache, :cache_vectors
|
27
28
|
|
28
|
-
def initialize(params = {
|
29
|
+
def initialize(params = {})
|
30
|
+
defaults = {:redis => {}, :data_dir => "data", :cache_vectors => TRUE}
|
31
|
+
params = defaults.merge params
|
29
32
|
@redis = Redis.new(params[:redis])
|
30
33
|
@data_dir = params[:data_dir]
|
31
34
|
Dir.mkdir(@data_dir) unless File.exists?(@data_dir)
|
32
35
|
Dir.mkdir(File.join(@data_dir, 'projections')) unless File.exists?(File.join(@data_dir, 'projections'))
|
36
|
+
@cache_vectors = params[:cache_vectors]
|
37
|
+
@vector_cache = {}
|
33
38
|
end
|
34
39
|
|
35
40
|
def reset!
|
@@ -40,11 +45,9 @@ module LSH
|
|
40
45
|
def clear_data!
|
41
46
|
keys = @redis.keys("lsh:bucket:*")
|
42
47
|
@redis.del(keys) unless keys.empty?
|
43
|
-
keys = @redis.keys("lsh:vector_to_id:*")
|
44
|
-
@redis.del(keys) unless keys.empty?
|
45
|
-
keys = @redis.keys("lsh:id_to_vector:*")
|
46
|
-
@redis.del(keys) unless keys.empty?
|
47
48
|
delete_dat_files_in_dir(@data_dir)
|
49
|
+
@redis.set("lsh:max_vector_id", 0)
|
50
|
+
@vector_cache = {}
|
48
51
|
end
|
49
52
|
|
50
53
|
def clear_projections!
|
@@ -109,37 +112,36 @@ module LSH
|
|
109
112
|
@redis.incr "lsh:buckets"
|
110
113
|
end
|
111
114
|
|
112
|
-
def
|
113
|
-
|
114
|
-
vector.save(path) unless File.exists?(path)
|
115
|
-
end
|
116
|
-
|
117
|
-
def load_vector(hash)
|
118
|
-
vector = MathUtil.zeros(1, parameters[:dim])
|
119
|
-
vector.load(File.join(@data_dir, hash+'.dat'))
|
120
|
-
vector
|
115
|
+
def generate_id
|
116
|
+
(@redis.incr "lsh:max_vector_id").to_s
|
121
117
|
end
|
122
118
|
|
123
|
-
def
|
124
|
-
|
119
|
+
def save_vector(vector, vector_id)
|
120
|
+
path = File.join(@data_dir, vector_id+'.dat')
|
121
|
+
raise "File #{path} already exists" if File.exists?(path)
|
122
|
+
vector.save(path)
|
123
|
+
@vector_cache[vector_id] = vector if @cache_vectors
|
125
124
|
end
|
126
125
|
|
127
|
-
def
|
128
|
-
@
|
126
|
+
def load_vector(vector_id)
|
127
|
+
@vector_cache[vector_id] || (
|
128
|
+
vector = MathUtil.zeros(1, parameters[:dim])
|
129
|
+
vector.load(File.join(@data_dir, vector_id+'.dat'))
|
130
|
+
@vector_cache[vector_id] = vector if @cache_vectors
|
131
|
+
vector
|
132
|
+
)
|
129
133
|
end
|
130
134
|
|
131
|
-
def
|
132
|
-
|
133
|
-
@redis.set "lsh:id_to_vector:#{id}", vector_hash.to_s
|
135
|
+
def add_vector(vector, vector_id)
|
136
|
+
save_vector(vector, vector_id) # Writing vector to disk if not already there
|
134
137
|
end
|
135
138
|
|
136
|
-
def
|
137
|
-
@redis.
|
139
|
+
def add_vector_id_to_bucket(bucket, hash, vector_id)
|
140
|
+
@redis.sadd "#{bucket}:#{hash}", vector_id
|
138
141
|
end
|
139
142
|
|
140
|
-
def id_to_vector(
|
141
|
-
|
142
|
-
load_vector(vector_hash)
|
143
|
+
def id_to_vector(vector_id)
|
144
|
+
load_vector(vector_id)
|
143
145
|
end
|
144
146
|
|
145
147
|
def find_bucket(i)
|
@@ -147,21 +149,16 @@ module LSH
|
|
147
149
|
end
|
148
150
|
|
149
151
|
def query_buckets(hashes)
|
150
|
-
|
151
|
-
hashes.each_with_index do |hash, i|
|
152
|
+
keys = hashes.each_with_index.map do |hash, i|
|
152
153
|
bucket = find_bucket(i)
|
153
|
-
|
154
|
-
if vector_hashes_in_bucket
|
155
|
-
vector_hashes_in_bucket.each do |vector_hash|
|
156
|
-
results_hashes[vector_hash] = true
|
157
|
-
end
|
158
|
-
end
|
154
|
+
"#{bucket}:#{hash}"
|
159
155
|
end
|
160
|
-
|
156
|
+
result_ids = @redis.sunion(keys)
|
157
|
+
|
158
|
+
result_ids.map do |vector_id|
|
161
159
|
{
|
162
|
-
:data => load_vector(
|
163
|
-
:
|
164
|
-
:id => vector_hash_to_id(vector_hash)
|
160
|
+
:data => load_vector(vector_id),
|
161
|
+
:id => vector_id
|
165
162
|
}
|
166
163
|
end
|
167
164
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lsh
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0
|
4
|
+
version: 0.6.0
|
5
5
|
prerelease:
|
6
6
|
platform: java
|
7
7
|
authors:
|
@@ -83,6 +83,78 @@ dependencies:
|
|
83
83
|
none: false
|
84
84
|
prerelease: false
|
85
85
|
type: :runtime
|
86
|
+
- !ruby/object:Gem::Dependency
|
87
|
+
name: mock_redis
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: !binary |-
|
93
|
+
MA==
|
94
|
+
none: false
|
95
|
+
requirement: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: !binary |-
|
100
|
+
MA==
|
101
|
+
none: false
|
102
|
+
prerelease: false
|
103
|
+
type: :development
|
104
|
+
- !ruby/object:Gem::Dependency
|
105
|
+
name: rack-test
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: !binary |-
|
111
|
+
MA==
|
112
|
+
none: false
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: !binary |-
|
118
|
+
MA==
|
119
|
+
none: false
|
120
|
+
prerelease: false
|
121
|
+
type: :development
|
122
|
+
- !ruby/object:Gem::Dependency
|
123
|
+
name: rake
|
124
|
+
version_requirements: !ruby/object:Gem::Requirement
|
125
|
+
requirements:
|
126
|
+
- - ">="
|
127
|
+
- !ruby/object:Gem::Version
|
128
|
+
version: !binary |-
|
129
|
+
MA==
|
130
|
+
none: false
|
131
|
+
requirement: !ruby/object:Gem::Requirement
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: !binary |-
|
136
|
+
MA==
|
137
|
+
none: false
|
138
|
+
prerelease: false
|
139
|
+
type: :development
|
140
|
+
- !ruby/object:Gem::Dependency
|
141
|
+
name: mocha
|
142
|
+
version_requirements: !ruby/object:Gem::Requirement
|
143
|
+
requirements:
|
144
|
+
- - ">="
|
145
|
+
- !ruby/object:Gem::Version
|
146
|
+
version: !binary |-
|
147
|
+
MA==
|
148
|
+
none: false
|
149
|
+
requirement: !ruby/object:Gem::Requirement
|
150
|
+
requirements:
|
151
|
+
- - ">="
|
152
|
+
- !ruby/object:Gem::Version
|
153
|
+
version: !binary |-
|
154
|
+
MA==
|
155
|
+
none: false
|
156
|
+
prerelease: false
|
157
|
+
type: :development
|
86
158
|
description: An implementation of LSH in Ruby, using JBLAS for JRuby and GSL for MRI
|
87
159
|
email: yves.raimond@bbc.co.uk
|
88
160
|
executables: []
|