lsh 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/lsh/index.rb ADDED
@@ -0,0 +1,142 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2011 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ module LSH
18
+
19
# Locality-sensitive hash index made of l independent hash tables. Each table
# is keyed by the signature a vector receives from one projection, i.e. from
# k random vectors.
#
# With the default infinite window every signature component is 1 or 0
# depending on the sign of a dot product (binary LSH). With a finite window w
# each component is floor(b + dot / w), where b is an offset obtained from
# @math.random_uniform. The offsets are drawn once per projection and reused
# on every call, so a given vector always lands in the same bucket; drawing
# them anew on each call would make add and query disagree.
class Index

  attr_reader :projections, :buckets

  # dim - dimension handed to the random vector generator
  # k   - number of random vectors per projection (signature length)
  # w   - quantisation window; Float::INFINITY selects binary LSH
  # l   - number of independent projections, one hash table each
  def initialize(dim, k, w = Float::INFINITY, l = 150)
    @math = MathUtil.new
    @window = w
    @dim = dim
    @number_of_random_vectors = k
    @number_of_independent_projections = l
    @projections = generate_projections(dim, k, l)
    # projection object => its fixed offsets, filled lazily by biases_for
    @biases = {}.compare_by_identity
    @buckets = Array.new(l) { {} }
  end

  # Stores vector in the matching bucket of every hash table.
  def add(vector)
    hashes(vector).each_with_index do |signature, i|
      (@buckets[i][array_to_hash(signature)] ||= []) << vector
    end
  end

  # Returns the distinct stored vectors that share a bucket with vector in at
  # least one table, most similar first.
  #
  # When multiprobe_radius is positive, every signature obtained by flipping
  # up to that many components of the query signature is looked up as well.
  # TODO: flipping between 0 and 1 only makes sense for binary LSH.
  def query(vector, multiprobe_radius = 0)
    results = []
    positions = (0...@number_of_random_vectors).to_a
    hashes(vector).each_with_index do |signature, i|
      bucket = @buckets[i]
      exact = bucket[array_to_hash(signature)]
      results.concat(exact) if exact
      next unless multiprobe_radius > 0
      (1..multiprobe_radius).each do |radius|
        positions.combination(radius).each do |flips|
          probe = signature.clone
          flips.each { |d| probe[d] = (probe[d] == 1) ? 0 : 1 }
          neighbours = bucket[array_to_hash(probe)]
          results.concat(neighbours) if neighbours
        end
      end
    end
    results.uniq!
    order_vectors_by_similarity(vector, results)
  end

  # Sorts vectors by decreasing similarity to vector.
  def order_vectors_by_similarity(vector, vectors)
    vectors.sort_by { |candidate| similarity(vector, candidate) }.reverse
  end

  # Signatures of vector, one per projection, in projection order.
  def hashes(vector)
    @projections.map { |projection| hash(vector, projection) }
  end

  # Signature of vector under one projection, as an array of integers.
  #
  # bias only matters for a finite window: when truthy, the offsets remembered
  # for this projection are added before flooring; when falsy no offset is
  # added.
  #
  # NOTE: this two-argument method replaces Object#hash for Index instances,
  # so an Index itself cannot be used as a Hash key.
  def hash(vector, projection, bias = true)
    binary = (@window == Float::INFINITY)
    offsets = biases_for(projection) if bias && !binary
    projection.each_with_index.map do |random_vector, i|
      dot_product = similarity(vector, random_vector)
      if binary # Binary LSH
        dot_product >= 0 ? 1 : 0
      else
        b = offsets ? offsets[i] : 0.0
        (b + dot_product / @window).floor
      end
    end
  end

  # Collapses a signature into the single value used as bucket key.
  # TODO: Check it works for non-binary LSH
  def array_to_hash(array)
    array.hash
  end

  # Builds l projections of k random vectors each.
  def generate_projections(dim, k, l)
    Array.new(l) { generate_projection(dim, k) }
  end

  # Builds one projection: k random vectors of dimension dim.
  def generate_projection(dim, k)
    Array.new(k) { random_vector(dim) }
  end

  # Random vector divided by its norm.
  # Assumes the backend's vector type supports division by a scalar.
  def random_vector_unit(dim)
    r = random_vector(dim)
    r / @math.norm(r)
  end

  def random_vector(dim)
    @math.random_gaussian_vector(dim)
  end

  # Similarity measure used for hashing and ranking: the dot product.
  def similarity(v1, v2)
    @math.dot(v1, v2)
  end

  private

  # Offsets for projection, one per random vector. They are drawn on first use
  # and remembered afterwards, keyed by the identity of the projection object
  # so that the vectors themselves never need to be hashed or compared.
  def biases_for(projection)
    @biases ||= {}.compare_by_identity
    @biases[projection] ||= projection.map { @math.random_uniform }
  end

end
141
+
142
+ end
data/lib/lsh/math_util_gsl.rb ADDED
@@ -0,0 +1,46 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2011 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ require 'gsl'
18
+
19
+ module LSH
20
+
21
# Numeric helpers for the LSH index, delegating to the GSL bindings.
class MathUtil

  # Allocates a GSL random number generator and seeds it with an integer
  # picked by Kernel#rand from 0...1000.
  def initialize
    generator = GSL::Rng.alloc
    generator.set(rand(1000)) # Overriding seed
    @gsl_random = generator
  end

  # One draw from the generator's uniform method.
  def random_uniform
    @gsl_random.uniform
  end

  # Result of the generator's gaussian(1, dim) call.
  # Presumably dim samples with sigma 1 - confirm against the GSL bindings.
  def random_gaussian_vector(dim)
    sigma = 1
    @gsl_random.gaussian(sigma, dim)
  end

  # Product of v1 with v2 turned into a column via #col.
  def dot(v1, v2)
    column = v2.col
    v1 * column
  end

  # Delegates to the vector's own norm.
  def norm(v)
    v.norm
  end

end
45
+
46
+ end
data/lib/lsh/math_util_jblas.rb ADDED
@@ -0,0 +1,41 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2011 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ require 'jblas'
18
+
19
+ module LSH
20
+
21
# Numeric helpers for the LSH index, delegating to JBLAS.
class MathUtil

  # Entry [0,0] of whatever JBLAS.rand returns when called without arguments.
  def random_uniform
    sample = JBLAS.rand
    sample[0,0]
  end

  # Result of JBLAS.randn(dim, 1).
  # Presumably a dim x 1 column of normal samples - confirm against JBLAS.
  def random_gaussian_vector(dim)
    columns = 1
    JBLAS.randn(dim, columns)
  end

  # Entry [0,0] of the product of v1 transposed with v2.
  def dot(v1, v2)
    product = v1.t * v2
    product[0,0]
  end

  # Delegates to the vector's norm2.
  def norm(v)
    v.norm2
  end

end
40
+
41
+ end
data/lib/lsh.rb ADDED
@@ -0,0 +1,22 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2011 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ require_relative 'lsh/index.rb'
18
+ if RUBY_PLATFORM == 'java'
19
+ require_relative 'lsh/math_util_jblas.rb'
20
+ else
21
+ require_relative 'lsh/math_util_gsl.rb'
22
+ end
metadata ADDED
@@ -0,0 +1,69 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lsh
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: ruby
7
+ authors:
8
+ - Yves Raimond
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-12-13 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: jblas-ruby
16
+ version_requirements: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ">="
19
+ - !ruby/object:Gem::Version
20
+ version: !binary |-
21
+ MA==
22
+ none: false
23
+ requirement: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ version: !binary |-
28
+ MA==
29
+ none: false
30
+ prerelease: false
31
+ type: :runtime
32
+ description: An implementation of LSH in Ruby, using GSL
33
+ email: yves.raimond@bbc.co.uk
34
+ executables: []
35
+ extensions: []
36
+ extra_rdoc_files: []
37
+ files:
38
+ - lib/lsh.rb
39
+ - lib/lsh/index.rb
40
+ - lib/lsh/math_util_gsl.rb
41
+ - lib/lsh/math_util_jblas.rb
42
+ homepage:
43
+ licenses: []
44
+ post_install_message:
45
+ rdoc_options: []
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: !binary |-
53
+ MA==
54
+ none: false
55
+ required_rubygems_version: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: !binary |-
60
+ MA==
61
+ none: false
62
+ requirements: []
63
+ rubyforge_project:
64
+ rubygems_version: 1.8.24
65
+ signing_key:
66
+ specification_version: 3
67
+ summary: Locality Sensitive Hashing gem
68
+ test_files: []
69
+ has_rdoc: false