lsh 0.0.5 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/lsh.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  # ruby-lsh
2
2
  #
3
- # Copyright (c) 2011 British Broadcasting Corporation
3
+ # Copyright (c) 2012 British Broadcasting Corporation
4
4
  #
5
5
  # Licensed under the Apache License, Version 2.0 (the "License");
6
6
  # you may not use this file except in compliance with the License.
@@ -15,6 +15,8 @@
15
15
  # limitations under the License.
16
16
 
17
17
  require_relative 'lsh/index.rb'
18
+ require_relative 'lsh/storage/memory.rb'
19
+ require_relative 'lsh/storage/redis_backend.rb'
18
20
  if RUBY_PLATFORM == 'java'
19
21
  require_relative 'lsh/math_util_jblas.rb'
20
22
  else
@@ -1,6 +1,6 @@
1
1
  # ruby-lsh
2
2
  #
3
- # Copyright (c) 2011 British Broadcasting Corporation
3
+ # Copyright (c) 2012 British Broadcasting Corporation
4
4
  #
5
5
  # Licensed under the Apache License, Version 2.0 (the "License");
6
6
  # you may not use this file except in compliance with the License.
@@ -18,26 +18,31 @@ module LSH
18
18
 
19
19
  class Index
20
20
 
21
- attr_reader :projections, :buckets
21
+ attr_reader :projections, :buckets, :storage
22
+
23
+ def initialize(parameters = {}, storage = LSH::Storage::Memory.new)
24
+ @storage = storage
25
+ unless storage.has_index?
26
+ storage.parameters = parameters
27
+ # Initializing projections and buckets
28
+ storage.projections = generate_projections(
29
+ parameters[:dim],
30
+ parameters[:number_of_random_vectors],
31
+ parameters[:number_of_independent_projections]
32
+ )
33
+ parameters[:number_of_independent_projections].times { |i| storage.create_new_bucket }
34
+ end
35
+ end
22
36
 
23
- def initialize(dim, k, w = Float::INFINITY, l = 150)
24
- @window = w
25
- @dim = dim
26
- @number_of_random_vectors = k
27
- @number_of_independent_projections = l
28
- @projections = generate_projections(dim, k, l)
29
- @buckets = []
30
- l.times { |i| @buckets << {} }
37
+ def self.load(storage)
38
+ Index.new(storage.parameters, storage) if storage.has_index?
31
39
  end
32
40
 
33
41
  def add(vector)
34
42
  hashes(vector).each_with_index do |hash, i|
35
43
  hash_i = array_to_hash(hash)
36
- if @buckets[i].has_key? hash_i
37
- @buckets[i][hash_i] << vector
38
- else
39
- @buckets[i][hash_i] = [vector]
40
- end
44
+ bucket = storage.find_bucket(i)
45
+ storage.add_vector_to_bucket(bucket, hash_i, vector)
41
46
  end
42
47
  end
43
48
 
@@ -45,22 +50,25 @@ module LSH
45
50
  results = []
46
51
  hashes(vector).each_with_index do |hash, i|
47
52
  hash_i = array_to_hash(hash)
48
- bucket = @buckets[i]
53
+ bucket = storage.find_bucket(i)
54
+ # Multiprobe LSH
49
55
  # Take query hash, move it around at radius r, hash it and use the result as a query
50
56
  # TODO: only works for binary LSH atm
51
- results += bucket[hash_i] if bucket[hash_i]
57
+ bucket_results = storage.query_bucket(bucket, hash_i)
58
+ results += bucket_results if bucket_results
52
59
  if multiprobe_radius > 0
53
60
  (1..multiprobe_radius).to_a.each do |radius|
54
- (0..(@number_of_random_vectors - 1)).to_a.combination(radius).each do |flips|
61
+ (0..(storage.parameters[:number_of_random_vectors] - 1)).to_a.combination(radius).each do |flips|
55
62
  probe = hash.clone
56
63
  flips.each { |d| probe[d] = (probe[d] == 1) ? 0 : 1 }
57
64
  probe_hash = array_to_hash(probe)
58
- results += bucket[probe_hash] if bucket.has_key?(probe_hash)
65
+ probe_bucket_results = storage.query_bucket(bucket, probe_hash)
66
+ results += probe_bucket_results if probe_bucket_results
59
67
  end
60
68
  end
61
69
  end
62
70
  end
63
- results.uniq!
71
+ results = MathUtil.uniq(results)
64
72
  order_vectors_by_similarity(vector, results)
65
73
  end
66
74
 
@@ -70,7 +78,7 @@ module LSH
70
78
 
71
79
  def hashes(vector)
72
80
  hashes = []
73
- @projections.each do |projection|
81
+ storage.projections.each do |projection|
74
82
  hashes << hash(vector, projection)
75
83
  end
76
84
  hashes
@@ -80,7 +88,8 @@ module LSH
80
88
  hash = []
81
89
  projection.each do |random_vector|
82
90
  dot_product = similarity(vector, random_vector)
83
- if @window == Float::INFINITY # Binary LSH
91
+ window = storage.parameters[:window]
92
+ if window == Float::INFINITY # Binary LSH
84
93
  if dot_product >= 0
85
94
  hash << 1
86
95
  else
@@ -88,7 +97,7 @@ module LSH
88
97
  end
89
98
  else
90
99
  b = bias ? MathUtil.random_uniform : 0.0
91
- hash << (b + dot_product / @window).floor
100
+ hash << (b + dot_product / window).floor
92
101
  end
93
102
  end
94
103
  hash
@@ -137,7 +146,7 @@ module LSH
137
146
  end
138
147
 
139
148
  def inspect
140
- "LSH index; dimension: #{@dim}; window size: #{@window}; #{@number_of_random_vectors} random vectors; #{@number_of_independent_projections} independent projections"
149
+ "#<LSH index; dimension: #{storage.parameters.dim}; window size: #{storage.parameters.window}; #{storage.parameters.number_of_random_vectors} random vectors; #{storage.parameters.number_of_independent_projections} independent projections>"
141
150
  end
142
151
 
143
152
  end
@@ -1,6 +1,6 @@
1
1
  # ruby-lsh
2
2
  #
3
- # Copyright (c) 2011 British Broadcasting Corporation
3
+ # Copyright (c) 2012 British Broadcasting Corporation
4
4
  #
5
5
  # Licensed under the Apache License, Version 2.0 (the "License");
6
6
  # you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
15
15
  # limitations under the License.
16
16
 
17
17
  require 'gsl'
18
+ require 'json'
18
19
 
19
20
  module LSH
20
21
 
@@ -42,6 +43,33 @@ module LSH
42
43
  v.norm
43
44
  end
44
45
 
46
+ def self.uniq(vs)
47
+ # Can't use uniq as
48
+ # [ v, JSON.parse(v.to_json) ].uniq.size == 2 with GSL
49
+ results = []
50
+ vs.each { |v| results << v unless results.member? v }
51
+ results
52
+ end
53
+
54
+ end
55
+
56
+ end
57
+
58
+ module GSL
59
+
60
+ class Vector
61
+
62
+ def to_json(*a)
63
+ {
64
+ 'json_class' => self.class.name,
65
+ 'data' => to_a,
66
+ }.to_json(*a)
67
+ end
68
+
69
+ def self.json_create(o)
70
+ alloc(*o['data'])
71
+ end
72
+
45
73
  end
46
74
 
47
75
  end
@@ -1,6 +1,6 @@
1
1
  # ruby-lsh
2
2
  #
3
- # Copyright (c) 2011 British Broadcasting Corporation
3
+ # Copyright (c) 2012 British Broadcasting Corporation
4
4
  #
5
5
  # Licensed under the Apache License, Version 2.0 (the "License");
6
6
  # you may not use this file except in compliance with the License.
@@ -40,6 +40,31 @@ module LSH
40
40
  v.norm2
41
41
  end
42
42
 
43
+ def self.uniq(vs)
44
+ vs.uniq
45
+ end
46
+
47
+ end
48
+
49
+ end
50
+
51
+
52
+ module JBLAS
53
+
54
+ class DoubleMatrix
55
+
56
+ def to_json(*a)
57
+ {
58
+ 'json_class' => 'JBLAS::DoubleMatrix',
59
+ 'data' => to_a,
60
+ }.to_json(*a)
61
+ end
62
+
63
+ def self.json_create(o)
64
+ from_array(o['data']).t
65
+ end
66
+
43
67
  end
44
68
 
45
69
  end
70
+
@@ -0,0 +1,60 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2012 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ module LSH
18
+
19
+ module Storage
20
+
21
+ class Memory
22
+
23
+ attr_accessor :projections
24
+ attr_accessor :parameters
25
+ attr_reader :buckets
26
+
27
+ def has_index?
28
+ projections and parameters and @buckets
29
+ end
30
+
31
+ def reset!
32
+ @buckets = nil
33
+ end
34
+
35
+ def create_new_bucket
36
+ @buckets ||= []
37
+ @buckets << {}
38
+ end
39
+
40
+ def add_vector_to_bucket(bucket, hash, vector)
41
+ if bucket.has_key? hash
42
+ bucket[hash] << vector
43
+ else
44
+ bucket[hash] = [vector]
45
+ end
46
+ end
47
+
48
+ def find_bucket(i)
49
+ @buckets[i]
50
+ end
51
+
52
+ def query_bucket(bucket, hash)
53
+ bucket[hash]
54
+ end
55
+
56
+ end
57
+
58
+ end
59
+
60
+ end
@@ -0,0 +1,86 @@
1
+ # ruby-lsh
2
+ #
3
+ # Copyright (c) 2012 British Broadcasting Corporation
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ require 'redis'
18
+ require 'json'
19
+
20
+ module LSH
21
+
22
+ module Storage
23
+
24
+ class RedisBackend
25
+
26
+ attr_reader :redis
27
+
28
+ def initialize(params = { :redis => { :host => '127.0.0.1', :port => 6379 } })
29
+ @redis = Redis.new(params[:redis])
30
+ end
31
+
32
+ def reset!
33
+ @redis.flushall
34
+ end
35
+
36
+ def has_index?
37
+ projections and parameters and @redis.get("buckets") > 0
38
+ end
39
+
40
+ def projections=(projections)
41
+ @redis.set "projections", projections.to_json
42
+ end
43
+
44
+ def projections
45
+ begin
46
+ @projections ||= JSON.parse(@redis.get "projections")
47
+ rescue TypeError
48
+ nil
49
+ end
50
+ end
51
+
52
+ def parameters=(parms)
53
+ parms[:window] = 'Infinity' if parms[:window] == Float::INFINITY
54
+ @redis.set "parameters", parms.to_json
55
+ end
56
+
57
+ def parameters
58
+ @parms ||= (
59
+ parms = JSON.parse(@redis.get "parameters")
60
+ parms.keys.each { |k| parms[k.to_sym] = parms[k]; parms.delete(k) }
61
+ parms[:window] = Float::INFINITY if parms[:window] == 'Infinity'
62
+ parms
63
+ )
64
+ end
65
+
66
+ def create_new_bucket
67
+ @redis.incr "buckets"
68
+ end
69
+
70
+ def add_vector_to_bucket(bucket, hash, vector)
71
+ @redis.sadd "#{bucket}:#{hash}", vector.to_json
72
+ end
73
+
74
+ def find_bucket(i)
75
+ "bucket:#{i}" if @redis.get("buckets").to_i > i
76
+ end
77
+
78
+ def query_bucket(bucket, hash)
79
+ @redis.smembers("#{bucket}:#{hash}").map { |vector_json| JSON.parse(vector_json) }
80
+ end
81
+
82
+ end
83
+
84
+ end
85
+
86
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lsh
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-17 00:00:00.000000000 Z
12
+ date: 2012-12-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: gsl
@@ -27,6 +27,38 @@ dependencies:
27
27
  - - ! '>='
28
28
  - !ruby/object:Gem::Version
29
29
  version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: json
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: redis
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
30
62
  description: An implementation of LSH in Ruby, using JBLAS for JRuby and GSL for MRI
31
63
  email: yves.raimond@bbc.co.uk
32
64
  executables: []
@@ -37,6 +69,8 @@ files:
37
69
  - lib/lsh/index.rb
38
70
  - lib/lsh/math_util_gsl.rb
39
71
  - lib/lsh/math_util_jblas.rb
72
+ - lib/lsh/storage/memory.rb
73
+ - lib/lsh/storage/redis_backend.rb
40
74
  homepage: https://github.com/bbcrd/ruby-lsh
41
75
  licenses: []
42
76
  post_install_message: