lsh 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -38,7 +38,8 @@ module LSH
38
38
  Index.new(storage.parameters, storage) if storage.has_index?
39
39
  end
40
40
 
41
- def add(vector)
41
+ def add(vector, id = nil)
42
+ storage.add_vector_id(vector, id) if id
42
43
  hashes(vector).each_with_index do |hash, i|
43
44
  hash_i = array_to_hash(hash)
44
45
  bucket = storage.find_bucket(i)
@@ -46,6 +47,14 @@ module LSH
46
47
  end
47
48
  end
48
49
 
50
+ def vector_to_id(vector)
51
+ storage.vector_to_id(vector)
52
+ end
53
+
54
+ def id_to_vector(id)
55
+ storage.id_to_vector(id)
56
+ end
57
+
49
58
  def query(vector, multiprobe_radius = 0)
50
59
  hash_arrays = hashes(vector)
51
60
  hashes = hash_arrays.map { |a| array_to_hash(a) }
@@ -64,6 +73,18 @@ module LSH
64
73
  order_vectors_by_similarity(vector, results)
65
74
  end
66
75
 
76
+ def query_ids(id, multiprobe_radius = 0)
77
+ vector = id_to_vector(id)
78
+ query_ids_by_vector(vector, multiprobe_radius)
79
+ end
80
+
81
+ def query_ids_by_vector(vector, multiprobe_radius = 0)
82
+ vectors = query(vector, multiprobe_radius)
83
+ results = []
84
+ vectors.each { |v| results << vector_to_id(v) }
85
+ results
86
+ end
87
+
67
88
  def multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
68
89
  mp_arrays = []
69
90
  (1..multiprobe_radius).to_a.each do |radius|
@@ -45,6 +45,21 @@ module LSH
45
45
  end
46
46
  end
47
47
 
48
+ def add_vector_id(vector, id)
49
+ @vector_to_id ||= {}
50
+ @vector_to_id[vector.hash] = id
51
+ @id_to_vector ||= {}
52
+ @id_to_vector[id] = vector
53
+ end
54
+
55
+ def vector_to_id(vector)
56
+ @vector_to_id[vector.hash] if @vector_to_id
57
+ end
58
+
59
+ def id_to_vector(id)
60
+ @id_to_vector[id] if @id_to_vector
61
+ end
62
+
48
63
  def find_bucket(i)
49
64
  @buckets[i]
50
65
  end
@@ -112,11 +112,37 @@ module LSH
112
112
  @redis.incr "lsh:buckets"
113
113
  end
114
114
 
115
+ def save_vector(vector)
116
+ path = File.join(@data_dir, vector.hash.to_s+'.dat')
117
+ vector.save(path) unless File.exists?(path)
118
+ end
119
+
120
+ def load_vector(hash)
121
+ vector = MathUtil.zeros(parameters[:dim])
122
+ vector.load(File.join(@data_dir, hash+'.dat'))
123
+ vector
124
+ end
125
+
115
126
  def add_vector_to_bucket(bucket, hash, vector)
116
- vector.save(File.join(@data_dir, vector.hash.to_s+'.dat')) # Writing vector to disk
127
+ save_vector(vector) # Writing vector to disk if not already there
117
128
  @redis.sadd "#{bucket}:#{hash}", vector.hash.to_s # Only storing vector's hash in Redis
118
129
  end
119
130
 
131
+ def add_vector_id(vector, id)
132
+ save_vector(vector) # Writing vector to disk if not already there
133
+ @redis.set "lsh:vector_to_id:#{vector.hash}", id
134
+ @redis.set "lsh:id_to_vector:#{id}", vector.hash.to_s
135
+ end
136
+
137
+ def vector_to_id(vector)
138
+ @redis.get "lsh:vector_to_id:#{vector.hash}"
139
+ end
140
+
141
+ def id_to_vector(id)
142
+ vector_hash = @redis.get "lsh:id_to_vector:#{id}"
143
+ load_vector(vector_hash)
144
+ end
145
+
120
146
  def find_bucket(i)
121
147
  "lsh:bucket:#{i}"
122
148
  end
@@ -133,8 +159,7 @@ module LSH
133
159
  vector_hashes.uniq!
134
160
  results = []
135
161
  vector_hashes.each do |vector_hash|
136
- vector = MathUtil.zeros(parameters[:dim])
137
- vector.load(File.join(@data_dir, vector_hash+'.dat'))
162
+ vector = load_vector(vector_hash)
138
163
  results << vector
139
164
  end
140
165
  results
@@ -24,7 +24,13 @@ module LSH
24
24
  if mime_type == 'application/json'
25
25
  t0 = Time.now
26
26
  vector = JSON.parse(params[:data])
27
- results = index.query(vector)
27
+ result_vectors = index.query(vector, params[:radius] || 0)
28
+ results = []
29
+ if params[:include] == 'id'
30
+ result_vectors.each { |v| results << { :id => index.vector_to_id(v), :data => v } }
31
+ else
32
+ result_vectors.each { |v| results << { :data => v } }
33
+ end
28
34
  content_type :json
29
35
  { "time" => Time.now - t0, "results" => results }.to_json
30
36
  else
@@ -32,13 +38,36 @@ module LSH
32
38
  end
33
39
  end
34
40
 
41
+ post '/query-ids' do
42
+ if params[:data] # We're querying with a vector
43
+ mime_type = (params[:mime_type] || 'application/json')
44
+ if mime_type == 'application/json'
45
+ t0 = Time.now
46
+ vector = JSON.parse(params[:data])
47
+ results = index.query_ids_by_vector(vector, params[:radius] || 0)
48
+ content_type :json
49
+ { "time" => Time.now - t0, "results" => results }.to_json
50
+ else
51
+ raise "Unrecognised mime-type"
52
+ end
53
+ elsif params[:id] # We're querying with an id
54
+ raise "Unknown id" unless index.id_to_vector(params[:id])
55
+ t0 = Time.now
56
+ results = index.query_ids(params[:id], params[:radius] || 0)
57
+ content_type :json
58
+ { "time" => Time.now - t0, "results" => results }.to_json
59
+ else
60
+ raise "Missing query"
61
+ end
62
+ end
63
+
35
64
  post '/index' do
36
65
  raise "Missing data" unless params[:data]
37
66
  mime_type = (params[:mime_type] || 'application/json')
38
67
  if mime_type == 'application/json'
39
68
  t0 = Time.now
40
69
  vector = JSON.parse(params[:data])
41
- index.add(vector)
70
+ index.add(vector, params[:id])
42
71
  content_type :json
43
72
  { "time" => Time.now - t0, "status" => "indexed" }.to_json
44
73
  else
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lsh
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-03 00:00:00.000000000 Z
12
+ date: 2013-01-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: gsl