lsh 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/lsh/index.rb ADDED
@@ -0,0 +1,142 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2011 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
module LSH

  # Locality-sensitive hash index.
  #
  # The index keeps +l+ independent hash tables. Each table owns a
  # "projection" made of +k+ random vectors; a vector's hash in a table is the
  # array of per-random-vector values computed by #hash. Vectors whose hash
  # arrays are equal share a bucket in that table.
  #
  # Two modes are selected by the window +w+:
  # * +w+ == Float::INFINITY (default): binary LSH, each hash component is 1
  #   when the dot product with the random vector is >= 0 and 0 otherwise.
  # * any other +w+: each component is floor(offset + dot_product / w).
  #
  # All vector arithmetic is delegated to MathUtil, so the concrete vector type
  # is whatever the loaded MathUtil backend produces and accepts.
  #
  # NOTE: #hash below replaces Object#hash with a method that requires
  # arguments, so Index instances themselves cannot be used as Hash keys.
  class Index

    attr_reader :projections, :buckets

    # dim:: number of components of the indexed vectors
    # k::   number of random vectors per projection (hash length)
    # w::   window size; Float::INFINITY selects binary LSH
    # l::   number of independent projections / hash tables
    def initialize(dim, k, w = Float::INFINITY, l = 150)
      @math = MathUtil.new
      @window = w
      @dim = dim
      @number_of_random_vectors = k
      @number_of_independent_projections = l
      @projections = generate_projections(dim, k, l)
      @buckets = Array.new(l) { {} }
    end

    # Stores +vector+ in the matching bucket of every hash table.
    # Returns the array of hashes computed for the vector.
    def add(vector)
      hashes(vector).each_with_index do |hash, i|
        (@buckets[i][array_to_hash(hash)] ||= []) << vector
      end
    end

    # Returns the stored vectors that collide with +vector+ in at least one
    # table, without duplicates, ordered by decreasing similarity to +vector+.
    #
    # With multiprobe_radius > 0, buckets whose hash differs from the query
    # hash in up to that many components are probed as well.
    # TODO: multiprobe only works for binary LSH atm
    def query(vector, multiprobe_radius = 0)
      results = []
      hashes(vector).each_with_index do |hash, i|
        bucket = @buckets[i]
        each_probe(hash, multiprobe_radius) do |probe|
          found = bucket[array_to_hash(probe)]
          # concat appends in place instead of copying the accumulated
          # results on every hit; the bucket arrays are never modified.
          results.concat(found) if found
        end
      end
      results.uniq!
      order_vectors_by_similarity(vector, results)
    end

    # Returns a new array holding +vectors+ sorted from most to least similar
    # to +vector+, where similarity is #similarity (a dot product).
    def order_vectors_by_similarity(vector, vectors)
      vectors.sort_by { |candidate| similarity(vector, candidate) }.reverse
    end

    # Returns one hash array per projection, in projection order.
    def hashes(vector)
      @projections.map { |projection| hash(vector, projection) }
    end

    # Hashes +vector+ against a single +projection+ (an enumerable of random
    # vectors) and returns an array with one integer per random vector.
    #
    # In non-binary mode a uniform random offset is added before flooring when
    # +bias+ is true. The offset is drawn once per (projection, position) and
    # then reused: drawing a new offset on each call would let the same vector
    # land in different buckets on add and on query.
    def hash(vector, projection, bias = true)
      binary = @window == Float::INFINITY # Binary LSH
      offsets = bias_offsets_for(projection) if bias && !binary
      projection.each_with_index.map do |random_vector, position|
        dot_product = similarity(vector, random_vector)
        if binary
          dot_product >= 0 ? 1 : 0
        else
          offset = offsets ? (offsets[position] ||= @math.random_uniform) : 0.0
          (offset + dot_product / @window).floor
        end
      end
    end

    # Collapses a hash array into the single value used as bucket key.
    # Uses Array#hash. A hand-rolled 28 bit variant was considered, see
    # http://stackoverflow.com/questions/2909106/python-whats-a-correct-and-good-way-to-implement-hash#2909572
    # but was never enabled (unchecked for non-binary LSH).
    def array_to_hash(array)
      array.hash
    end

    # Builds +l+ projections of +k+ random vectors each.
    def generate_projections(dim, k, l)
      Array.new(l) { generate_projection(dim, k) }
    end

    # Builds one projection: an array of +k+ random vectors of size +dim+.
    def generate_projection(dim, k)
      Array.new(k) { random_vector(dim) }
    end

    # Returns a random vector divided by its norm.
    def random_vector_unit(dim)
      r = random_vector(dim)
      r / @math.norm(r)
    end

    # Returns a random vector as produced by MathUtil#random_gaussian_vector.
    def random_vector(dim)
      @math.random_gaussian_vector(dim)
    end

    # Similarity between two vectors: their dot product.
    def similarity(v1, v2)
      @math.dot(v1, v2)
    end

    private

    # Yields the query hash itself, then every copy of it with 1 up to
    # +multiprobe_radius+ components flipped between 0 and 1.
    def each_probe(hash, multiprobe_radius)
      yield hash
      return unless multiprobe_radius > 0

      positions = (0...@number_of_random_vectors).to_a
      1.upto(multiprobe_radius) do |radius|
        positions.combination(radius) do |flips|
          probe = hash.clone
          flips.each { |d| probe[d] = probe[d] == 1 ? 0 : 1 }
          yield probe
        end
      end
    end

    # Returns the (lazily filled) list of bias offsets for +projection+.
    # Projections are looked up by object identity so that large arrays of
    # vectors are never hashed or compared element by element.
    def bias_offsets_for(projection)
      @bias_offsets ||= {}.compare_by_identity
      @bias_offsets[projection] ||= []
    end

  end

end
@@ -0,0 +1,46 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2011 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ require 'gsl'
18
+
19
module LSH

  # Math backend built on the GSL bindings (expects 'gsl' to be loaded).
  # Wraps one GSL random number generator plus the few vector operations the
  # index needs.
  class MathUtil

    # Exclusive upper bound for automatically chosen seeds. Kept within the
    # positive 32 bit signed range so the value is a plain small integer.
    SEED_LIMIT = 2**31 - 1

    # seed:: value passed to the generator's +set+. Defaults to a random
    #        integer below SEED_LIMIT; the previous range of 1000 values made
    #        it likely that two instances shared the same random stream.
    #        Pass an explicit seed for reproducible results.
    def initialize(seed = rand(SEED_LIMIT))
      @gsl_random = GSL::Rng.alloc
      @gsl_random.set(seed) # Overriding seed
    end

    # Returns the next value of the generator's +uniform+ method.
    def random_uniform
      @gsl_random.uniform
    end

    # Returns the result of +gaussian(1, dim)+ on the generator.
    # NOTE(review): presumably a GSL vector of +dim+ samples with standard
    # deviation 1 - confirm against the rb-gsl documentation.
    def random_gaussian_vector(dim)
      @gsl_random.gaussian(1, dim)
    end

    # Dot product, computed as +v1+ times the +col+ form of +v2+.
    def dot(v1, v2)
      v1 * v2.col
    end

    # Returns +v.norm+.
    def norm(v)
      v.norm
    end

  end

end
@@ -0,0 +1,41 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2011 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ require 'jblas'
18
+
19
module LSH

  # Math backend built on jblas (expects 'jblas' to be loaded). Offers the
  # same four operations as the GSL backend.
  class MathUtil

    # Returns element [0, 0] of the matrix produced by JBLAS.rand.
    def random_uniform
      sample = JBLAS.rand
      sample[0, 0]
    end

    # Returns JBLAS.randn(dim, 1).
    # NOTE(review): presumably a dim x 1 column of standard normal samples -
    # confirm against the jblas-ruby documentation.
    def random_gaussian_vector(dim)
      JBLAS.randn(dim, 1)
    end

    # Dot product: transposes +v1+, multiplies by +v2+ and extracts
    # element [0, 0] of the product.
    def dot(v1, v2)
      product = v1.t * v2
      product[0, 0]
    end

    # Returns +v.norm2+.
    def norm(v)
      v.norm2
    end

  end

end
data/lib/lsh.rb ADDED
@@ -0,0 +1,22 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2011 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
# Load the index, then exactly one MathUtil backend: jblas when running on
# the 'java' platform, GSL everywhere else.
require_relative 'lsh/index.rb'

case RUBY_PLATFORM
when 'java'
  require_relative 'lsh/math_util_jblas.rb'
else
  require_relative 'lsh/math_util_gsl.rb'
end
metadata ADDED
@@ -0,0 +1,69 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lsh
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: ruby
7
+ authors:
8
+ - Yves Raimond
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-12-13 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: jblas-ruby
16
+ version_requirements: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ">="
19
+ - !ruby/object:Gem::Version
20
+ version: !binary |-
21
+ MA==
22
+ none: false
23
+ requirement: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ version: !binary |-
28
+ MA==
29
+ none: false
30
+ prerelease: false
31
+ type: :runtime
32
+ description: An implementation of LSH in Ruby, using GSL
33
+ email: yves.raimond@bbc.co.uk
34
+ executables: []
35
+ extensions: []
36
+ extra_rdoc_files: []
37
+ files:
38
+ - lib/lsh.rb
39
+ - lib/lsh/index.rb
40
+ - lib/lsh/math_util_gsl.rb
41
+ - lib/lsh/math_util_jblas.rb
42
+ homepage:
43
+ licenses: []
44
+ post_install_message:
45
+ rdoc_options: []
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: !binary |-
53
+ MA==
54
+ none: false
55
+ required_rubygems_version: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: !binary |-
60
+ MA==
61
+ none: false
62
+ requirements: []
63
+ rubyforge_project:
64
+ rubygems_version: 1.8.24
65
+ signing_key:
66
+ specification_version: 3
67
+ summary: Locality Sensitive Hashing gem
68
+ test_files: []
69
+ has_rdoc: false