lsh 0.0.4-java

Sign up to get free protection for your applications and to get access to all the features.
data/lib/lsh.rb ADDED
@@ -0,0 +1,22 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2011 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ require_relative 'lsh/index.rb'
18
+ if RUBY_PLATFORM == 'java'
19
+ require_relative 'lsh/math_util_jblas.rb'
20
+ else
21
+ require_relative 'lsh/math_util_gsl.rb'
22
+ end
data/lib/lsh/index.rb ADDED
@@ -0,0 +1,141 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2011 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
module LSH

  # Locality Sensitive Hashing index.
  #
  # The index keeps +l+ independent hash tables. Each table owns a projection
  # made of +k+ random vectors; a vector's signature in a table is obtained
  # from its dot product with each of those random vectors. Two modes exist:
  #
  # * binary LSH (window == Float::INFINITY, the default): every signature
  #   component is 1 when the dot product is >= 0 and 0 otherwise;
  # * windowed LSH (finite window): every component is
  #   floor(offset + dot_product / window).
  #
  # All linear algebra is delegated to LSH::MathUtil, so vectors must be of
  # whatever type the loaded MathUtil backend understands.
  #
  # NOTE: the public #hash method below replaces Object#hash with a method
  # taking mandatory arguments, so Index instances must not be used as Hash
  # keys or passed to Array#uniq.
  class Index

    attr_reader :projections, :buckets

    # dim - dimension passed to MathUtil when drawing random vectors
    # k   - number of random vectors per projection (signature length)
    # w   - window size; Float::INFINITY selects binary LSH
    # l   - number of independent projections (hash tables)
    def initialize(dim, k, w = Float::INFINITY, l = 150)
      @window = w
      @dim = dim
      @number_of_random_vectors = k
      @number_of_independent_projections = l
      @projections = generate_projections(dim, k, l)
      @buckets = Array.new(l) { {} }
    end

    # Stores +vector+ in the matching bucket of every hash table.
    def add(vector)
      hashes(vector).each_with_index do |signature, i|
        (@buckets[i][array_to_hash(signature)] ||= []) << vector
      end
    end

    # Returns the stored vectors sharing a bucket with +vector+ in at least
    # one table, without duplicates, most similar first.
    #
    # When +multiprobe_radius+ is > 0, buckets whose signature differs from
    # the query signature in up to that many positions are searched as well.
    # TODO: multi-probe only works for binary LSH at the moment (probing
    # rewrites signature components to 0/1).
    def query(vector, multiprobe_radius = 0)
      results = []
      positions = (0...@number_of_random_vectors).to_a
      hashes(vector).each_with_index do |signature, i|
        bucket = @buckets[i]
        exact = bucket[array_to_hash(signature)]
        results.concat(exact) if exact
        next unless multiprobe_radius > 0
        (1..multiprobe_radius).each do |radius|
          positions.combination(radius) do |flips|
            probe = signature.dup
            flips.each { |d| probe[d] = probe[d] == 1 ? 0 : 1 }
            neighbours = bucket[array_to_hash(probe)]
            results.concat(neighbours) if neighbours
          end
        end
      end
      results.uniq!
      order_vectors_by_similarity(vector, results)
    end

    # Sorts +vectors+ by decreasing similarity with +vector+.
    def order_vectors_by_similarity(vector, vectors)
      scored = vectors.map { |candidate| [candidate, similarity(vector, candidate)] }
      scored.sort_by { |_, score| score }.reverse.map { |pair| pair[0] }
    end

    # Signature of +vector+ under every projection, in projection order.
    def hashes(vector)
      @projections.map { |projection| hash(vector, projection) }
    end

    # Signature (Array of Integers) of +vector+ under one +projection+.
    #
    # In windowed mode a random offset is added to each component when +bias+
    # is true. Each offset is drawn once per random vector and then reused:
    # drawing a fresh one on every call would send the same vector to a
    # different bucket each time it is hashed, so a vector that was added
    # could not be found again by a query.
    def hash(vector, projection, bias = true)
      binary = @window == Float::INFINITY
      offsets = bias && !binary ? bias_offsets_for(projection) : nil
      projection.each_with_index.map do |random_vector, i|
        dot_product = similarity(vector, random_vector)
        if binary
          dot_product >= 0 ? 1 : 0
        else
          offset = offsets ? (offsets[i] ||= MathUtil.random_uniform) : 0.0
          (offset + dot_product / @window).floor
        end
      end
    end

    # Collapses a signature array into a single bucket key.
    #
    # A hand-rolled 28 bit alternative was previously sketched here, see
    # http://stackoverflow.com/questions/2909106/python-whats-a-correct-and-good-way-to-implement-hash#2909572
    # (TODO from the original author: check it works for non-binary LSH).
    def array_to_hash(array)
      array.hash
    end

    # Builds +l+ projections of +k+ random vectors each.
    def generate_projections(dim, k, l)
      Array.new(l) { generate_projection(dim, k) }
    end

    # Builds one projection: an Array of +k+ random vectors.
    def generate_projection(dim, k)
      Array.new(k) { random_vector(dim) }
    end

    # Random vector divided by its norm, as computed by MathUtil.norm.
    def random_vector_unit(dim)
      r = random_vector(dim)
      r / MathUtil.norm(r)
    end

    # Random vector as produced by MathUtil.random_gaussian_vector.
    def random_vector(dim)
      MathUtil.random_gaussian_vector(dim)
    end

    # Similarity measure used for hashing and ranking: the dot product.
    def similarity(v1, v2)
      MathUtil.dot(v1, v2)
    end

    private

    # Offsets already drawn for +projection+, one slot per random vector.
    # The store is keyed by object identity so that looking a projection up
    # never has to compare (or hash) the vectors it contains.
    def bias_offsets_for(projection)
      @bias_offsets ||= {}.compare_by_identity
      @bias_offsets[projection] ||= []
    end

  end

end
@@ -0,0 +1,47 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2011 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ require 'gsl'
18
+
19
module LSH

  # Math backend used on non-JRuby platforms; every operation is forwarded to
  # the GSL bindings loaded by the require above.
  class MathUtil

    # Single GSL random number generator shared by the scalar and vector
    # helpers below.
    @@gsl_random = GSL::Rng.alloc

    class << self

      # One sample from the shared generator's +uniform+ method.
      def random_uniform
        @@gsl_random.uniform
      end

      # Result of +gaussian(1, dim)+ on the shared generator.
      # NOTE(review): presumably a GSL::Vector of +dim+ samples with sigma 1 -
      # confirm against the rb-gsl documentation.
      def random_gaussian_vector(dim)
        @@gsl_random.gaussian(1, dim)
      end

      # Result of GSL::Matrix.randn(k, l); note that this does not go through
      # the shared generator above.
      def random_gaussian_matrix(k, l)
        GSL::Matrix.randn(k, l)
      end

      # Product of +v1+ with +v2+ turned into a column (+v2.col+).
      # NOTE(review): assumes both arguments are GSL row vectors so that the
      # product is their dot product - verify against callers.
      def dot(v1, v2)
        column = v2.col
        v1 * column
      end

      # Norm of +v+ as reported by its own +norm+ method.
      def norm(v)
        v.norm
      end

    end

  end

end
@@ -0,0 +1,45 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2011 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ require 'jblas'
18
+
19
module LSH

  # Math backend used when RUBY_PLATFORM is 'java'; every operation is
  # forwarded to the JBLAS bindings loaded by the require above.
  class MathUtil

    class << self

      # Element [0,0] of the matrix returned by JBLAS.rand.
      def random_uniform
        sample = JBLAS.rand
        sample[0, 0]
      end

      # Result of JBLAS.randn(1, dim).
      # NOTE(review): presumably a 1 x dim matrix of Gaussian samples -
      # confirm against the jblas-ruby documentation.
      def random_gaussian_vector(dim)
        JBLAS.randn(1, dim)
      end

      # Result of JBLAS.randn(k, l).
      def random_gaussian_matrix(k, l)
        JBLAS.randn(k, l)
      end

      # Element [0,0] of +v1+ multiplied by the transpose of +v2+.
      # NOTE(review): assumes both arguments are 1 x n matrices, which makes
      # the product a 1 x 1 matrix holding their dot product - verify against
      # callers.
      def dot(v1, v2)
        product = v1 * v2.t
        product[0, 0]
      end

      # Norm of +v+ as reported by its +norm2+ method.
      def norm(v)
        v.norm2
      end

    end

  end

end
metadata ADDED
@@ -0,0 +1,69 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lsh
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.4
6
+ platform: java
7
+ authors:
8
+ - Yves Raimond
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-12-17 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: jblas-ruby
16
+ version_requirements: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ">="
19
+ - !ruby/object:Gem::Version
20
+ version: !binary |-
21
+ MA==
22
+ none: false
23
+ requirement: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ version: !binary |-
28
+ MA==
29
+ none: false
30
+ prerelease: false
31
+ type: :runtime
32
+ description: An implementation of LSH in Ruby, using JBLAS for JRuby and GSL for MRI
33
+ email: yves.raimond@bbc.co.uk
34
+ executables: []
35
+ extensions: []
36
+ extra_rdoc_files: []
37
+ files:
38
+ - lib/lsh.rb
39
+ - lib/lsh/index.rb
40
+ - lib/lsh/math_util_gsl.rb
41
+ - lib/lsh/math_util_jblas.rb
42
+ homepage: https://github.com/bbcrd/ruby-lsh
43
+ licenses: []
44
+ post_install_message:
45
+ rdoc_options: []
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: !binary |-
53
+ MA==
54
+ none: false
55
+ required_rubygems_version: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: !binary |-
60
+ MA==
61
+ none: false
62
+ requirements: []
63
+ rubyforge_project:
64
+ rubygems_version: 1.8.24
65
+ signing_key:
66
+ specification_version: 3
67
+ summary: Locality Sensitive Hashing gem
68
+ test_files: []
69
+ has_rdoc: false