lsh 0.5.0-java → 0.6.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/lsh/index.rb +9 -15
- data/lib/lsh/math_util_gsl.rb +1 -1
- data/lib/lsh/math_util_jblas.rb +1 -1
- data/lib/lsh/storage/memory.rb +23 -30
- data/lib/lsh/storage/redis_backend.rb +35 -38
- metadata +73 -1
data/lib/lsh/index.rb
CHANGED
@@ -39,20 +39,16 @@ module LSH
|
|
39
39
|
end
|
40
40
|
|
41
41
|
def add(vector, id = nil)
|
42
|
-
|
43
|
-
storage.add_vector(vector,
|
44
|
-
storage.add_vector_id(vector_hash, id) if id
|
42
|
+
id ||= storage.generate_id
|
43
|
+
storage.add_vector(vector, id)
|
45
44
|
hashes(vector).each_with_index do |hash, i|
|
46
45
|
hash_i = array_to_hash(hash)
|
47
46
|
bucket = storage.find_bucket(i)
|
48
|
-
storage.
|
47
|
+
storage.add_vector_id_to_bucket(bucket, hash_i, id)
|
49
48
|
end
|
49
|
+
id
|
50
50
|
end
|
51
|
-
|
52
|
-
def vector_hash_to_id(vector_hash)
|
53
|
-
storage.vector_hash_to_id(vector_hash)
|
54
|
-
end
|
55
|
-
|
51
|
+
|
56
52
|
def id_to_vector(id)
|
57
53
|
storage.id_to_vector(id)
|
58
54
|
end
|
@@ -71,7 +67,7 @@ module LSH
|
|
71
67
|
probes_hashes = probes_arrays.map { |a| array_to_hash(a) }
|
72
68
|
results += storage.query_buckets(probes_hashes)
|
73
69
|
end
|
74
|
-
results.uniq! { |result| result[:
|
70
|
+
results.uniq! { |result| result[:id] }
|
75
71
|
end
|
76
72
|
order_results_by_similarity(vector, results)
|
77
73
|
end
|
@@ -83,7 +79,7 @@ module LSH
|
|
83
79
|
|
84
80
|
def query_ids_by_vector(vector, multiprobe_radius = 0)
|
85
81
|
results = query(vector, multiprobe_radius)
|
86
|
-
results.map { |result|
|
82
|
+
results.map { |result| result[:id] }
|
87
83
|
end
|
88
84
|
|
89
85
|
def multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
|
@@ -99,10 +95,8 @@ module LSH
|
|
99
95
|
end
|
100
96
|
|
101
97
|
def order_results_by_similarity(vector, results)
|
102
|
-
|
103
|
-
|
104
|
-
similarities = results.map { |result| [ result[:hash], result[:id], result[:data], similarity(vector, result[:data]) ] }
|
105
|
-
similarities.sort_by { |hash, id, vector, sim| sim } .reverse .map { |vs| { :hash => vs[0], :id => vs[1], :data => vs[2] } }
|
98
|
+
vector_t = vector.transpose
|
99
|
+
results.sort_by { |result| similarity(result[:data], vector_t) } .reverse
|
106
100
|
end
|
107
101
|
|
108
102
|
def hashes(vector)
|
data/lib/lsh/math_util_gsl.rb
CHANGED
data/lib/lsh/math_util_jblas.rb
CHANGED
data/lib/lsh/storage/memory.rb
CHANGED
@@ -24,48 +24,42 @@ module LSH
|
|
24
24
|
attr_accessor :parameters
|
25
25
|
attr_reader :buckets
|
26
26
|
|
27
|
+
def initialize
|
28
|
+
reset!
|
29
|
+
end
|
30
|
+
|
27
31
|
def has_index?
|
28
32
|
projections and parameters and @buckets
|
29
33
|
end
|
30
34
|
|
31
35
|
def reset!
|
32
|
-
@buckets =
|
33
|
-
@vectors =
|
34
|
-
@
|
35
|
-
@id_to_vector = nil
|
36
|
+
@buckets = []
|
37
|
+
@vectors = {}
|
38
|
+
@next_id = 0
|
36
39
|
end
|
37
40
|
|
38
41
|
def create_new_bucket
|
39
|
-
@buckets ||= []
|
40
42
|
@buckets << {}
|
41
43
|
end
|
42
44
|
|
43
|
-
def
|
44
|
-
@
|
45
|
-
|
45
|
+
def generate_id
|
46
|
+
@next_id += 1
|
47
|
+
end
|
48
|
+
|
49
|
+
def add_vector(vector, id)
|
50
|
+
@vectors[id] = vector
|
46
51
|
end
|
47
52
|
|
48
|
-
def
|
53
|
+
def add_vector_id_to_bucket(bucket, hash, vector_id)
|
49
54
|
if bucket.has_key? hash
|
50
|
-
bucket[hash] <<
|
55
|
+
bucket[hash] << vector_id
|
51
56
|
else
|
52
|
-
bucket[hash] = [
|
57
|
+
bucket[hash] = [vector_id]
|
53
58
|
end
|
54
59
|
end
|
55
60
|
|
56
|
-
def add_vector_id(vector_hash, id)
|
57
|
-
@vector_hash_to_id ||= {}
|
58
|
-
@vector_hash_to_id[vector_hash] = id
|
59
|
-
@id_to_vector ||= {}
|
60
|
-
@id_to_vector[id] = vector_hash
|
61
|
-
end
|
62
|
-
|
63
|
-
def vector_hash_to_id(vector_hash)
|
64
|
-
@vector_hash_to_id[vector_hash] if @vector_hash_to_id
|
65
|
-
end
|
66
|
-
|
67
61
|
def id_to_vector(id)
|
68
|
-
@vectors[
|
62
|
+
@vectors[id]
|
69
63
|
end
|
70
64
|
|
71
65
|
def find_bucket(i)
|
@@ -73,20 +67,19 @@ module LSH
|
|
73
67
|
end
|
74
68
|
|
75
69
|
def query_buckets(hashes)
|
76
|
-
|
70
|
+
result_ids = {}
|
77
71
|
hashes.each_with_index do |hash, i|
|
78
72
|
vectors_hashes_in_bucket = @buckets[i][hash]
|
79
73
|
if vectors_hashes_in_bucket
|
80
|
-
vectors_hashes_in_bucket.each do |
|
81
|
-
|
74
|
+
vectors_hashes_in_bucket.each do |vector_id|
|
75
|
+
result_ids[vector_id] = true
|
82
76
|
end
|
83
77
|
end
|
84
78
|
end
|
85
|
-
|
79
|
+
result_ids.keys.map do |vector_id|
|
86
80
|
{
|
87
|
-
:data => @vectors[
|
88
|
-
:
|
89
|
-
:id => vector_hash_to_id(vector_hash)
|
81
|
+
:data => @vectors[vector_id],
|
82
|
+
:id => vector_id,
|
90
83
|
}
|
91
84
|
end
|
92
85
|
end
|
@@ -24,12 +24,17 @@ module LSH
|
|
24
24
|
class RedisBackend
|
25
25
|
|
26
26
|
attr_reader :redis, :data_dir
|
27
|
+
attr_accessor :vector_cache, :cache_vectors
|
27
28
|
|
28
|
-
def initialize(params = {
|
29
|
+
def initialize(params = {})
|
30
|
+
defaults = {:redis => {}, :data_dir => "data", :cache_vectors => TRUE}
|
31
|
+
params = defaults.merge params
|
29
32
|
@redis = Redis.new(params[:redis])
|
30
33
|
@data_dir = params[:data_dir]
|
31
34
|
Dir.mkdir(@data_dir) unless File.exists?(@data_dir)
|
32
35
|
Dir.mkdir(File.join(@data_dir, 'projections')) unless File.exists?(File.join(@data_dir, 'projections'))
|
36
|
+
@cache_vectors = params[:cache_vectors]
|
37
|
+
@vector_cache = {}
|
33
38
|
end
|
34
39
|
|
35
40
|
def reset!
|
@@ -40,11 +45,9 @@ module LSH
|
|
40
45
|
def clear_data!
|
41
46
|
keys = @redis.keys("lsh:bucket:*")
|
42
47
|
@redis.del(keys) unless keys.empty?
|
43
|
-
keys = @redis.keys("lsh:vector_to_id:*")
|
44
|
-
@redis.del(keys) unless keys.empty?
|
45
|
-
keys = @redis.keys("lsh:id_to_vector:*")
|
46
|
-
@redis.del(keys) unless keys.empty?
|
47
48
|
delete_dat_files_in_dir(@data_dir)
|
49
|
+
@redis.set("lsh:max_vector_id", 0)
|
50
|
+
@vector_cache = {}
|
48
51
|
end
|
49
52
|
|
50
53
|
def clear_projections!
|
@@ -109,37 +112,36 @@ module LSH
|
|
109
112
|
@redis.incr "lsh:buckets"
|
110
113
|
end
|
111
114
|
|
112
|
-
def
|
113
|
-
|
114
|
-
vector.save(path) unless File.exists?(path)
|
115
|
-
end
|
116
|
-
|
117
|
-
def load_vector(hash)
|
118
|
-
vector = MathUtil.zeros(1, parameters[:dim])
|
119
|
-
vector.load(File.join(@data_dir, hash+'.dat'))
|
120
|
-
vector
|
115
|
+
def generate_id
|
116
|
+
(@redis.incr "lsh:max_vector_id").to_s
|
121
117
|
end
|
122
118
|
|
123
|
-
def
|
124
|
-
|
119
|
+
def save_vector(vector, vector_id)
|
120
|
+
path = File.join(@data_dir, vector_id+'.dat')
|
121
|
+
raise "File #{path} already exists" if File.exists?(path)
|
122
|
+
vector.save(path)
|
123
|
+
@vector_cache[vector_id] = vector if @cache_vectors
|
125
124
|
end
|
126
125
|
|
127
|
-
def
|
128
|
-
@
|
126
|
+
def load_vector(vector_id)
|
127
|
+
@vector_cache[vector_id] || (
|
128
|
+
vector = MathUtil.zeros(1, parameters[:dim])
|
129
|
+
vector.load(File.join(@data_dir, vector_id+'.dat'))
|
130
|
+
@vector_cache[vector_id] = vector if @cache_vectors
|
131
|
+
vector
|
132
|
+
)
|
129
133
|
end
|
130
134
|
|
131
|
-
def
|
132
|
-
|
133
|
-
@redis.set "lsh:id_to_vector:#{id}", vector_hash.to_s
|
135
|
+
def add_vector(vector, vector_id)
|
136
|
+
save_vector(vector, vector_id) # Writing vector to disk if not already there
|
134
137
|
end
|
135
138
|
|
136
|
-
def
|
137
|
-
@redis.
|
139
|
+
def add_vector_id_to_bucket(bucket, hash, vector_id)
|
140
|
+
@redis.sadd "#{bucket}:#{hash}", vector_id
|
138
141
|
end
|
139
142
|
|
140
|
-
def id_to_vector(
|
141
|
-
|
142
|
-
load_vector(vector_hash)
|
143
|
+
def id_to_vector(vector_id)
|
144
|
+
load_vector(vector_id)
|
143
145
|
end
|
144
146
|
|
145
147
|
def find_bucket(i)
|
@@ -147,21 +149,16 @@ module LSH
|
|
147
149
|
end
|
148
150
|
|
149
151
|
def query_buckets(hashes)
|
150
|
-
|
151
|
-
hashes.each_with_index do |hash, i|
|
152
|
+
keys = hashes.each_with_index.map do |hash, i|
|
152
153
|
bucket = find_bucket(i)
|
153
|
-
|
154
|
-
if vector_hashes_in_bucket
|
155
|
-
vector_hashes_in_bucket.each do |vector_hash|
|
156
|
-
results_hashes[vector_hash] = true
|
157
|
-
end
|
158
|
-
end
|
154
|
+
"#{bucket}:#{hash}"
|
159
155
|
end
|
160
|
-
|
156
|
+
result_ids = @redis.sunion(keys)
|
157
|
+
|
158
|
+
result_ids.map do |vector_id|
|
161
159
|
{
|
162
|
-
:data => load_vector(
|
163
|
-
:
|
164
|
-
:id => vector_hash_to_id(vector_hash)
|
160
|
+
:data => load_vector(vector_id),
|
161
|
+
:id => vector_id
|
165
162
|
}
|
166
163
|
end
|
167
164
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lsh
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
prerelease:
|
6
6
|
platform: java
|
7
7
|
authors:
|
@@ -83,6 +83,78 @@ dependencies:
|
|
83
83
|
none: false
|
84
84
|
prerelease: false
|
85
85
|
type: :runtime
|
86
|
+
- !ruby/object:Gem::Dependency
|
87
|
+
name: mock_redis
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: !binary |-
|
93
|
+
MA==
|
94
|
+
none: false
|
95
|
+
requirement: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: !binary |-
|
100
|
+
MA==
|
101
|
+
none: false
|
102
|
+
prerelease: false
|
103
|
+
type: :development
|
104
|
+
- !ruby/object:Gem::Dependency
|
105
|
+
name: rack-test
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: !binary |-
|
111
|
+
MA==
|
112
|
+
none: false
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: !binary |-
|
118
|
+
MA==
|
119
|
+
none: false
|
120
|
+
prerelease: false
|
121
|
+
type: :development
|
122
|
+
- !ruby/object:Gem::Dependency
|
123
|
+
name: rake
|
124
|
+
version_requirements: !ruby/object:Gem::Requirement
|
125
|
+
requirements:
|
126
|
+
- - ">="
|
127
|
+
- !ruby/object:Gem::Version
|
128
|
+
version: !binary |-
|
129
|
+
MA==
|
130
|
+
none: false
|
131
|
+
requirement: !ruby/object:Gem::Requirement
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: !binary |-
|
136
|
+
MA==
|
137
|
+
none: false
|
138
|
+
prerelease: false
|
139
|
+
type: :development
|
140
|
+
- !ruby/object:Gem::Dependency
|
141
|
+
name: mocha
|
142
|
+
version_requirements: !ruby/object:Gem::Requirement
|
143
|
+
requirements:
|
144
|
+
- - ">="
|
145
|
+
- !ruby/object:Gem::Version
|
146
|
+
version: !binary |-
|
147
|
+
MA==
|
148
|
+
none: false
|
149
|
+
requirement: !ruby/object:Gem::Requirement
|
150
|
+
requirements:
|
151
|
+
- - ">="
|
152
|
+
- !ruby/object:Gem::Version
|
153
|
+
version: !binary |-
|
154
|
+
MA==
|
155
|
+
none: false
|
156
|
+
prerelease: false
|
157
|
+
type: :development
|
86
158
|
description: An implementation of LSH in Ruby, using JBLAS for JRuby and GSL for MRI
|
87
159
|
email: yves.raimond@bbc.co.uk
|
88
160
|
executables: []
|