lsh 0.0.5-java → 0.1.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/lsh.rb +3 -1
- data/lib/lsh/index.rb +33 -24
- data/lib/lsh/math_util_gsl.rb +29 -1
- data/lib/lsh/math_util_jblas.rb +26 -1
- data/lib/lsh/storage/memory.rb +60 -0
- data/lib/lsh/storage/redis_backend.rb +86 -0
- metadata +40 -2
data/lib/lsh.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# ruby-lsh
|
2
2
|
#
|
3
|
-
# Copyright (c)
|
3
|
+
# Copyright (c) 2012 British Broadcasting Corporation
|
4
4
|
#
|
5
5
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
6
6
|
# you may not use this file except in compliance with the License.
|
@@ -15,6 +15,8 @@
|
|
15
15
|
# limitations under the License.
|
16
16
|
|
17
17
|
require_relative 'lsh/index.rb'
|
18
|
+
require_relative 'lsh/storage/memory.rb'
|
19
|
+
require_relative 'lsh/storage/redis_backend.rb'
|
18
20
|
if RUBY_PLATFORM == 'java'
|
19
21
|
require_relative 'lsh/math_util_jblas.rb'
|
20
22
|
else
|
data/lib/lsh/index.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# ruby-lsh
|
2
2
|
#
|
3
|
-
# Copyright (c)
|
3
|
+
# Copyright (c) 2012 British Broadcasting Corporation
|
4
4
|
#
|
5
5
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
6
6
|
# you may not use this file except in compliance with the License.
|
@@ -18,26 +18,31 @@ module LSH
|
|
18
18
|
|
19
19
|
class Index
|
20
20
|
|
21
|
-
attr_reader :projections, :buckets
|
21
|
+
attr_reader :projections, :buckets, :storage
|
22
|
+
|
23
|
+
def initialize(parameters = {}, storage = LSH::Storage::Memory.new)
|
24
|
+
@storage = storage
|
25
|
+
unless storage.has_index?
|
26
|
+
storage.parameters = parameters
|
27
|
+
# Initializing projections and buckets
|
28
|
+
storage.projections = generate_projections(
|
29
|
+
parameters[:dim],
|
30
|
+
parameters[:number_of_random_vectors],
|
31
|
+
parameters[:number_of_independent_projections]
|
32
|
+
)
|
33
|
+
parameters[:number_of_independent_projections].times { |i| storage.create_new_bucket }
|
34
|
+
end
|
35
|
+
end
|
22
36
|
|
23
|
-
def
|
24
|
-
|
25
|
-
@dim = dim
|
26
|
-
@number_of_random_vectors = k
|
27
|
-
@number_of_independent_projections = l
|
28
|
-
@projections = generate_projections(dim, k, l)
|
29
|
-
@buckets = []
|
30
|
-
l.times { |i| @buckets << {} }
|
37
|
+
def self.load(storage)
|
38
|
+
Index.new(storage.parameters, storage) if storage.has_index?
|
31
39
|
end
|
32
40
|
|
33
41
|
def add(vector)
|
34
42
|
hashes(vector).each_with_index do |hash, i|
|
35
43
|
hash_i = array_to_hash(hash)
|
36
|
-
|
37
|
-
|
38
|
-
else
|
39
|
-
@buckets[i][hash_i] = [vector]
|
40
|
-
end
|
44
|
+
bucket = storage.find_bucket(i)
|
45
|
+
storage.add_vector_to_bucket(bucket, hash_i, vector)
|
41
46
|
end
|
42
47
|
end
|
43
48
|
|
@@ -45,22 +50,25 @@ module LSH
|
|
45
50
|
results = []
|
46
51
|
hashes(vector).each_with_index do |hash, i|
|
47
52
|
hash_i = array_to_hash(hash)
|
48
|
-
bucket =
|
53
|
+
bucket = storage.find_bucket(i)
|
54
|
+
# Multiprobe LSH
|
49
55
|
# Take query hash, move it around at radius r, hash it and use the result as a query
|
50
56
|
# TODO: only works for binary LSH atm
|
51
|
-
|
57
|
+
bucket_results = storage.query_bucket(bucket, hash_i)
|
58
|
+
results += bucket_results if bucket_results
|
52
59
|
if multiprobe_radius > 0
|
53
60
|
(1..multiprobe_radius).to_a.each do |radius|
|
54
|
-
(0..(
|
61
|
+
(0..(storage.parameters[:number_of_random_vectors] - 1)).to_a.combination(radius).each do |flips|
|
55
62
|
probe = hash.clone
|
56
63
|
flips.each { |d| probe[d] = (probe[d] == 1) ? 0 : 1 }
|
57
64
|
probe_hash = array_to_hash(probe)
|
58
|
-
|
65
|
+
probe_bucket_results = storage.query_bucket(bucket, probe_hash)
|
66
|
+
results += probe_bucket_results if probe_bucket_results
|
59
67
|
end
|
60
68
|
end
|
61
69
|
end
|
62
70
|
end
|
63
|
-
results.uniq
|
71
|
+
results = MathUtil.uniq(results)
|
64
72
|
order_vectors_by_similarity(vector, results)
|
65
73
|
end
|
66
74
|
|
@@ -70,7 +78,7 @@ module LSH
|
|
70
78
|
|
71
79
|
def hashes(vector)
|
72
80
|
hashes = []
|
73
|
-
|
81
|
+
storage.projections.each do |projection|
|
74
82
|
hashes << hash(vector, projection)
|
75
83
|
end
|
76
84
|
hashes
|
@@ -80,7 +88,8 @@ module LSH
|
|
80
88
|
hash = []
|
81
89
|
projection.each do |random_vector|
|
82
90
|
dot_product = similarity(vector, random_vector)
|
83
|
-
|
91
|
+
window = storage.parameters[:window]
|
92
|
+
if window == Float::INFINITY # Binary LSH
|
84
93
|
if dot_product >= 0
|
85
94
|
hash << 1
|
86
95
|
else
|
@@ -88,7 +97,7 @@ module LSH
|
|
88
97
|
end
|
89
98
|
else
|
90
99
|
b = bias ? MathUtil.random_uniform : 0.0
|
91
|
-
hash << (b + dot_product /
|
100
|
+
hash << (b + dot_product / window).floor
|
92
101
|
end
|
93
102
|
end
|
94
103
|
hash
|
@@ -137,7 +146,7 @@ module LSH
|
|
137
146
|
end
|
138
147
|
|
139
148
|
# Human-readable one-line summary of the index configuration.
#
# The configuration is read from the storage backend. The parameters are
# held there as a Hash keyed by symbol (the rest of this class reads them
# as storage.parameters[:window], parameters[:dim], ...), so they have to
# be looked up with [] -- calling .dim / .window on the Hash raises
# NoMethodError.
#
# @return [String]
def inspect
  params = storage.parameters
  "#<LSH index; dimension: #{params[:dim]}; window size: #{params[:window]}; " \
    "#{params[:number_of_random_vectors]} random vectors; " \
    "#{params[:number_of_independent_projections]} independent projections>"
end
|
142
151
|
|
143
152
|
end
|
data/lib/lsh/math_util_gsl.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# ruby-lsh
|
2
2
|
#
|
3
|
-
# Copyright (c)
|
3
|
+
# Copyright (c) 2012 British Broadcasting Corporation
|
4
4
|
#
|
5
5
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
6
6
|
# you may not use this file except in compliance with the License.
|
@@ -15,6 +15,7 @@
|
|
15
15
|
# limitations under the License.
|
16
16
|
|
17
17
|
require 'gsl'
|
18
|
+
require 'json'
|
18
19
|
|
19
20
|
module LSH
|
20
21
|
|
@@ -42,6 +43,33 @@ module LSH
|
|
42
43
|
v.norm
|
43
44
|
end
|
44
45
|
|
46
|
+
def self.uniq(vs)
|
47
|
+
# Can't use uniq as
|
48
|
+
# [ v, JSON.parse(v.to_json) ].uniq.size == 2 with GSL
|
49
|
+
results = []
|
50
|
+
vs.each { |v| results << v unless results.member? v }
|
51
|
+
results
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
55
|
+
|
56
|
+
end
|
57
|
+
|
58
|
+
module GSL
|
59
|
+
|
60
|
+
class Vector
|
61
|
+
|
62
|
+
def to_json(*a)
|
63
|
+
{
|
64
|
+
'json_class' => self.class.name,
|
65
|
+
'data' => to_a,
|
66
|
+
}.to_json(*a)
|
67
|
+
end
|
68
|
+
|
69
|
+
def self.json_create(o)
|
70
|
+
alloc(*o['data'])
|
71
|
+
end
|
72
|
+
|
45
73
|
end
|
46
74
|
|
47
75
|
end
|
data/lib/lsh/math_util_jblas.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# ruby-lsh
|
2
2
|
#
|
3
|
-
# Copyright (c)
|
3
|
+
# Copyright (c) 2012 British Broadcasting Corporation
|
4
4
|
#
|
5
5
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
6
6
|
# you may not use this file except in compliance with the License.
|
@@ -40,6 +40,31 @@ module LSH
|
|
40
40
|
v.norm2
|
41
41
|
end
|
42
42
|
|
43
|
+
def self.uniq(vs)
|
44
|
+
vs.uniq
|
45
|
+
end
|
46
|
+
|
47
|
+
end
|
48
|
+
|
49
|
+
end
|
50
|
+
|
51
|
+
|
52
|
+
module JBLAS
|
53
|
+
|
54
|
+
class DoubleMatrix
|
55
|
+
|
56
|
+
def to_json(*a)
|
57
|
+
{
|
58
|
+
'json_class' => 'JBLAS::DoubleMatrix',
|
59
|
+
'data' => to_a,
|
60
|
+
}.to_json(*a)
|
61
|
+
end
|
62
|
+
|
63
|
+
def self.json_create(o)
|
64
|
+
from_array(o['data']).t
|
65
|
+
end
|
66
|
+
|
43
67
|
end
|
44
68
|
|
45
69
|
end
|
70
|
+
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# ruby-lsh
|
2
|
+
#
|
3
|
+
# Copyright (c) 2012 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
module LSH

  module Storage

    # In-process storage backend: projections, parameters and buckets are
    # kept in plain Ruby objects held by this instance.
    #
    # Each bucket is a Hash mapping a hash value to the Array of vectors
    # that were added under that value.
    class Memory

      attr_accessor :projections
      attr_accessor :parameters
      attr_reader :buckets

      # Whether projections, parameters and at least the bucket list have
      # all been set up.
      #
      # @return [Boolean] a strict true/false rather than whichever object
      #   happened to be evaluated last
      def has_index?
        !!(projections && parameters && @buckets)
      end

      # Drops the bucket list. Projections and parameters are left as they
      # are; has_index? is false afterwards because @buckets is nil.
      def reset!
        @buckets = nil
      end

      # Appends one empty bucket, creating the bucket list on first use.
      #
      # @return [Array<Hash>] the bucket list
      def create_new_bucket
        @buckets ||= []
        @buckets << {}
      end

      # Stores +vector+ under +hash+ in the given bucket.
      #
      # @param bucket [Hash] a bucket as returned by find_bucket
      # @param hash [Object] key the vector is filed under
      # @param vector [Object] value to store
      # @return [Array] the vectors now stored under +hash+
      def add_vector_to_bucket(bucket, hash, vector)
        if bucket.has_key? hash
          bucket[hash] << vector
        else
          bucket[hash] = [vector]
        end
      end

      # Looks up the i-th bucket.
      #
      # @param i [Integer] bucket index
      # @return [Hash, nil] the bucket, or nil when no bucket exists at
      #   that index -- including before any bucket was created or after
      #   reset!, where @buckets itself is nil
      def find_bucket(i)
        @buckets && @buckets[i]
      end

      # @param bucket [Hash] a bucket as returned by find_bucket
      # @param hash [Object] key to look up
      # @return [Array, nil] vectors stored under +hash+, nil when none
      def query_bucket(bucket, hash)
        bucket[hash]
      end

    end

  end

end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# ruby-lsh
|
2
|
+
#
|
3
|
+
# Copyright (c) 2012 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require 'redis'
|
18
|
+
require 'json'
|
19
|
+
|
20
|
+
module LSH

  module Storage

    # Storage backend that keeps an LSH index in Redis.
    #
    # Keys used by this class:
    # * "projections" and "parameters" -- JSON strings
    # * "buckets"                      -- counter of created buckets
    # * "<bucket>:<hash>"              -- a set of JSON-serialised vectors,
    #   where <bucket> is the "bucket:<i>" name returned by find_bucket
    #
    # Projections and parameters are memoised after the first read; the
    # memo is dropped whenever they are written or the store is reset.
    class RedisBackend

      attr_reader :redis

      # @param params [Hash] the :redis entry is handed to Redis.new
      def initialize(params = { :redis => { :host => '127.0.0.1', :port => 6379 } })
        @redis = Redis.new(params[:redis])
      end

      # Flushes Redis and forgets the memoised projections/parameters so
      # that later reads do not return data that no longer exists.
      #
      # NOTE: this issues FLUSHALL, which is not limited to the keys
      # written by this class.
      def reset!
        reply = @redis.flushall
        @projections = nil
        @parms = nil
        reply
      end

      # Whether projections, parameters and at least one bucket are stored.
      #
      # @return [Boolean]
      def has_index?
        !!(projections && parameters && bucket_count > 0)
      end

      # Serialises the projections to JSON and stores them.
      def projections=(projections)
        @projections = nil
        @redis.set "projections", projections.to_json
      end

      # @return [Object, nil] the stored projections, nil when none stored
      def projections
        @projections ||= begin
          raw = @redis.get "projections"
          # create_additions is requested explicitly: vectors are written
          # with a 'json_class' entry and rebuilt through json_create, and
          # JSON.parse does not do that by default in newer json releases.
          # NOTE(review): this instantiates classes named in the stored
          # data -- only point this backend at a Redis you trust.
          raw && JSON.parse(raw, :create_additions => true)
        end
      end

      # Serialises the parameters to JSON and stores them.
      #
      # An infinite :window is written as the string 'Infinity' (the
      # reader below converts it back). The substitution is made on a
      # copy so the caller's hash keeps its Float value.
      def parameters=(parms)
        stored = parms.dup
        stored[:window] = 'Infinity' if stored[:window] == Float::INFINITY
        @parms = nil
        @redis.set "parameters", stored.to_json
      end

      # @return [Hash, nil] the stored parameters with symbol keys and
      #   :window restored to Float::INFINITY where applicable; nil when
      #   nothing has been stored
      def parameters
        @parms ||= begin
          raw = @redis.get "parameters"
          if raw
            parms = {}
            JSON.parse(raw).each { |key, value| parms[key.to_sym] = value }
            parms[:window] = Float::INFINITY if parms[:window] == 'Infinity'
            parms
          end
        end
      end

      # Registers one more bucket by incrementing the "buckets" counter.
      def create_new_bucket
        @redis.incr "buckets"
      end

      # Adds the JSON form of +vector+ to the set "<bucket>:<hash>".
      def add_vector_to_bucket(bucket, hash, vector)
        @redis.sadd "#{bucket}:#{hash}", vector.to_json
      end

      # @param i [Integer] bucket index
      # @return [String, nil] the bucket name "bucket:<i>", or nil when
      #   fewer than i + 1 buckets have been created
      def find_bucket(i)
        "bucket:#{i}" if bucket_count > i
      end

      # @return [Array] the vectors stored in the set "<bucket>:<hash>",
      #   decoded from JSON (see the create_additions note on projections)
      def query_bucket(bucket, hash)
        @redis.smembers("#{bucket}:#{hash}").map do |vector_json|
          JSON.parse(vector_json, :create_additions => true)
        end
      end

      private

      # Number of buckets created so far. The counter comes back from
      # Redis as a String, or nil when it was never set; to_i turns both
      # into an Integer so it can be compared numerically.
      def bucket_count
        @redis.get("buckets").to_i
      end

    end

  end

end
|
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: lsh
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.0
|
5
|
+
version: 0.1.0
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Yves Raimond
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-12-
|
12
|
+
date: 2012-12-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: jblas-ruby
|
@@ -29,6 +29,42 @@ dependencies:
|
|
29
29
|
none: false
|
30
30
|
prerelease: false
|
31
31
|
type: :runtime
|
32
|
+
- !ruby/object:Gem::Dependency
|
33
|
+
name: json-jruby
|
34
|
+
version_requirements: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: !binary |-
|
39
|
+
MA==
|
40
|
+
none: false
|
41
|
+
requirement: !ruby/object:Gem::Requirement
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: !binary |-
|
46
|
+
MA==
|
47
|
+
none: false
|
48
|
+
prerelease: false
|
49
|
+
type: :runtime
|
50
|
+
- !ruby/object:Gem::Dependency
|
51
|
+
name: redis
|
52
|
+
version_requirements: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: !binary |-
|
57
|
+
MA==
|
58
|
+
none: false
|
59
|
+
requirement: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: !binary |-
|
64
|
+
MA==
|
65
|
+
none: false
|
66
|
+
prerelease: false
|
67
|
+
type: :runtime
|
32
68
|
description: An implementation of LSH in Ruby, using JBLAS for JRuby and GSL for MRI
|
33
69
|
email: yves.raimond@bbc.co.uk
|
34
70
|
executables: []
|
@@ -39,6 +75,8 @@ files:
|
|
39
75
|
- lib/lsh/index.rb
|
40
76
|
- lib/lsh/math_util_gsl.rb
|
41
77
|
- lib/lsh/math_util_jblas.rb
|
78
|
+
- lib/lsh/storage/memory.rb
|
79
|
+
- lib/lsh/storage/redis_backend.rb
|
42
80
|
homepage: https://github.com/bbcrd/ruby-lsh
|
43
81
|
licenses: []
|
44
82
|
post_install_message:
|