lsh 0.0.4-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/lsh.rb ADDED
@@ -0,0 +1,22 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2011 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
# Load the index implementation, then the MathUtil backend for the running
# platform: the JBLAS-backed one when RUBY_PLATFORM is 'java', the GSL-backed
# one otherwise.
require_relative 'lsh/index.rb'

case RUBY_PLATFORM
when 'java' then require_relative 'lsh/math_util_jblas.rb'
else require_relative 'lsh/math_util_gsl.rb'
end
data/lib/lsh/index.rb ADDED
@@ -0,0 +1,141 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2011 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
module LSH

  # In-memory locality-sensitive hashing index.
  #
  # The index holds `l` independent projections, each made of `k` random
  # vectors, plus one bucket table per projection. The signature of a vector
  # under a projection has one entry per random vector: a 0/1 sign bit when the
  # window is infinite (binary LSH), or the floored, offset quotient of the dot
  # product by the window otherwise.
  #
  # Vectors are whatever objects the loaded MathUtil backend produces and
  # accepts; this class only ever hands them to MathUtil.
  #
  # NOTE: #hash below requires arguments and therefore replaces Object#hash for
  # Index instances, so an Index cannot itself be used as a Hash key.
  class Index

    attr_reader :projections, :buckets

    # dim - dimension handed to MathUtil when drawing random vectors
    # k   - number of random vectors per projection (signature length)
    # w   - window size; Float::INFINITY selects binary LSH
    # l   - number of independent projections (and bucket tables)
    def initialize(dim, k, w = Float::INFINITY, l = 150)
      @window = w
      @dim = dim
      @number_of_random_vectors = k
      @number_of_independent_projections = l
      @projections = generate_projections(dim, k, l)
      @buckets = Array.new(l) { {} }
    end

    # Stores +vector+ in the matching bucket of every projection.
    # Returns the list of signatures computed for the vector.
    def add(vector)
      hashes(vector).each_with_index do |signature, i|
        (@buckets[i][array_to_hash(signature)] ||= []) << vector
      end
    end

    # Collects the vectors sharing a bucket with +vector+ in any projection and
    # returns them without duplicates, most similar first.
    #
    # When +multiprobe_radius+ is positive, every signature obtained by flipping
    # up to that many entries of the query signature is probed as well.
    # TODO: multiprobe only works for binary LSH atm (entries are flipped
    # between 0 and 1).
    def query(vector, multiprobe_radius = 0)
      results = []
      positions = (0...@number_of_random_vectors).to_a
      hashes(vector).each_with_index do |signature, i|
        bucket = @buckets[i]
        exact = bucket[array_to_hash(signature)]
        results.concat(exact) if exact
        1.upto(multiprobe_radius) do |radius|
          positions.combination(radius).each do |flips|
            probe = signature.dup
            flips.each { |d| probe[d] = (probe[d] == 1) ? 0 : 1 }
            neighbours = bucket[array_to_hash(probe)]
            results.concat(neighbours) if neighbours
          end
        end
      end
      order_vectors_by_similarity(vector, results.uniq)
    end

    # Returns +vectors+ sorted by decreasing similarity to +vector+.
    def order_vectors_by_similarity(vector, vectors)
      vectors.sort_by { |candidate| similarity(vector, candidate) }.reverse
    end

    # Signature of +vector+ under each projection, in projection order.
    def hashes(vector)
      @projections.map { |projection| hash(vector, projection) }
    end

    # Signature of +vector+ under one projection: an Array with one Integer per
    # random vector in +projection+.
    #
    # With a finite window each entry is floor(offset + dot / window). The
    # offset belongs to the random vector: it is drawn once and then reused, so
    # hashing the same vector twice gives the same signature (drawing a fresh
    # offset per call would send #add and #query to unrelated buckets).
    # Passing bias = false uses an offset of 0.0.
    def hash(vector, projection, bias = true)
      projection.map do |random_vector|
        dot_product = similarity(vector, random_vector)
        if @window == Float::INFINITY # Binary LSH
          dot_product >= 0 ? 1 : 0
        else
          offset = bias ? bias_for(random_vector) : 0.0
          (offset + dot_product / @window).floor
        end
      end
    end

    # Bucket key for a signature. Uses Array#hash, which is fine for the
    # in-process tables kept here; Ruby does not promise the value is the same
    # in another process.
    def array_to_hash(array)
      array.hash
    end

    # Builds +l+ projections of +k+ random vectors each.
    def generate_projections(dim, k, l)
      Array.new(l) { generate_projection(dim, k) }
    end

    # Builds a single projection: an Array of +k+ random vectors.
    def generate_projection(dim, k)
      Array.new(k) { random_vector(dim) }
    end

    # A random vector divided by its norm as computed by MathUtil.
    def random_vector_unit(dim)
      r = random_vector(dim)
      r / MathUtil.norm(r)
    end

    # A random vector from MathUtil.random_gaussian_vector.
    def random_vector(dim)
      MathUtil.random_gaussian_vector(dim)
    end

    # Similarity measure used for hashing and ranking: MathUtil.dot.
    def similarity(v1, v2)
      MathUtil.dot(v1, v2)
    end

    private

    # Offset attached to +random_vector+, drawn from MathUtil.random_uniform the
    # first time it is needed. Keyed by object identity so the vector's own
    # #hash / #eql? are never consulted.
    def bias_for(random_vector)
      @biases ||= {}.compare_by_identity
      @biases.fetch(random_vector) { @biases[random_vector] = MathUtil.random_uniform }
    end

  end

end
@@ -0,0 +1,47 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2011 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ require 'gsl'
18
+
19
module LSH

  # Numeric helpers backed by the GSL bindings (loaded by lib/lsh.rb when
  # RUBY_PLATFORM is not 'java').
  class MathUtil

    # Generator allocated once, when this class body is evaluated; it backs
    # random_uniform and random_gaussian_vector.
    @@gsl_random = GSL::Rng.alloc

    class << self

      # Result of calling `uniform` on the shared generator.
      def random_uniform
        @@gsl_random.uniform
      end

      # Result of `gaussian(1, dim)` on the shared generator.
      # NOTE(review): presumably `dim` samples with sigma 1 - confirm against
      # the GSL binding's documentation.
      def random_gaussian_vector(dim)
        @@gsl_random.gaussian(1, dim)
      end

      # Result of GSL::Matrix.randn(k, l); does not go through the shared
      # generator above.
      def random_gaussian_matrix(k, l)
        GSL::Matrix.randn(k, l)
      end

      # Multiplies v1 by the `col` form of v2.
      def dot(v1, v2)
        column = v2.col
        v1 * column
      end

      # Delegates to the vector's own `norm`.
      def norm(v)
        v.norm
      end

    end

  end

end
@@ -0,0 +1,45 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2011 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ require 'jblas'
18
+
19
module LSH

  # Numeric helpers backed by JBLAS (loaded by lib/lsh.rb when RUBY_PLATFORM
  # is 'java').
  class MathUtil

    class << self

      # Element [0, 0] of whatever JBLAS.rand returns when called without
      # arguments.
      def random_uniform
        sample = JBLAS.rand
        sample[0, 0]
      end

      # Result of JBLAS.randn(1, dim).
      def random_gaussian_vector(dim)
        JBLAS.randn(1, dim)
      end

      # Result of JBLAS.randn(k, l).
      def random_gaussian_matrix(k, l)
        JBLAS.randn(k, l)
      end

      # Multiplies v1 by the transpose of v2 and returns element [0, 0] of the
      # product.
      def dot(v1, v2)
        product = v1 * v2.t
        product[0, 0]
      end

      # Delegates to the vector's own `norm2`.
      def norm(v)
        v.norm2
      end

    end

  end

end
metadata ADDED
@@ -0,0 +1,69 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lsh
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.4
6
+ platform: java
7
+ authors:
8
+ - Yves Raimond
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-12-17 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: jblas-ruby
16
+ version_requirements: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ">="
19
+ - !ruby/object:Gem::Version
20
+ version: !binary |-
21
+ MA==
22
+ none: false
23
+ requirement: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ version: !binary |-
28
+ MA==
29
+ none: false
30
+ prerelease: false
31
+ type: :runtime
32
+ description: An implementation of LSH in Ruby, using JBLAS for JRuby and GSL for MRI
33
+ email: yves.raimond@bbc.co.uk
34
+ executables: []
35
+ extensions: []
36
+ extra_rdoc_files: []
37
+ files:
38
+ - lib/lsh.rb
39
+ - lib/lsh/index.rb
40
+ - lib/lsh/math_util_gsl.rb
41
+ - lib/lsh/math_util_jblas.rb
42
+ homepage: https://github.com/bbcrd/ruby-lsh
43
+ licenses: []
44
+ post_install_message:
45
+ rdoc_options: []
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: !binary |-
53
+ MA==
54
+ none: false
55
+ required_rubygems_version: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: !binary |-
60
+ MA==
61
+ none: false
62
+ requirements: []
63
+ rubyforge_project:
64
+ rubygems_version: 1.8.24
65
+ signing_key:
66
+ specification_version: 3
67
+ summary: Locality Sensitive Hashing gem
68
+ test_files: []
69
+ has_rdoc: false