lsh 0.4.2-java → 0.5.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/lsh/index.rb +23 -15
- data/lib/lsh/math_util_gsl.rb +0 -8
- data/lib/lsh/math_util_jblas.rb +0 -4
- data/lib/lsh/storage/memory.rb +32 -15
- data/lib/lsh/storage/redis_backend.rb +32 -26
- data/lib/lsh/web.rb +1 -7
- metadata +1 -1
    
        data/lib/lsh/index.rb
    CHANGED
    
    | @@ -39,16 +39,18 @@ module LSH | |
| 39 39 | 
             
                end
         | 
| 40 40 |  | 
| 41 41 | 
             
                def add(vector, id = nil)
         | 
| 42 | 
            -
                   | 
| 42 | 
            +
                  vector_hash = vector.hash
         | 
| 43 | 
            +
                  storage.add_vector(vector, vector_hash)
         | 
| 44 | 
            +
                  storage.add_vector_id(vector_hash, id) if id
         | 
| 43 45 | 
             
                  hashes(vector).each_with_index do |hash, i|
         | 
| 44 46 | 
             
                    hash_i = array_to_hash(hash)
         | 
| 45 47 | 
             
                    bucket = storage.find_bucket(i)
         | 
| 46 | 
            -
                    storage. | 
| 48 | 
            +
                    storage.add_vector_hash_to_bucket(bucket, hash_i, vector_hash)
         | 
| 47 49 | 
             
                  end
         | 
| 48 50 | 
             
                end
         | 
| 49 51 |  | 
| 50 | 
            -
                def  | 
| 51 | 
            -
                  storage. | 
| 52 | 
            +
                def vector_hash_to_id(vector_hash)
         | 
| 53 | 
            +
                  storage.vector_hash_to_id(vector_hash)
         | 
| 52 54 | 
             
                end
         | 
| 53 55 |  | 
| 54 56 | 
             
                def id_to_vector(id)
         | 
| @@ -63,14 +65,15 @@ module LSH | |
| 63 65 | 
             
                  # Take query hashes, move them around at radius r, and use them to do another query
         | 
| 64 66 | 
             
                  # TODO: only works for binary LSH atm
         | 
| 65 67 | 
             
                  if multiprobe_radius > 0
         | 
| 68 | 
            +
                    raise Exception.new("Non-zero multiprobe radius only implemented for binary LSH") unless hashes_are_binary?
         | 
| 66 69 | 
             
                    mp_arrays = multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
         | 
| 67 70 | 
             
                    mp_arrays.each do |probes_arrays|
         | 
| 68 71 | 
             
                      probes_hashes = probes_arrays.map { |a| array_to_hash(a) }
         | 
| 69 72 | 
             
                      results += storage.query_buckets(probes_hashes)
         | 
| 70 73 | 
             
                    end
         | 
| 74 | 
            +
                    results.uniq! { |result| result[:hash] }
         | 
| 71 75 | 
             
                  end
         | 
| 72 | 
            -
                   | 
| 73 | 
            -
                  order_vectors_by_similarity(vector, results)
         | 
| 76 | 
            +
                  order_results_by_similarity(vector, results)
         | 
| 74 77 | 
             
                end
         | 
| 75 78 |  | 
| 76 79 | 
             
                def query_ids(id, multiprobe_radius = 0)
         | 
| @@ -79,10 +82,8 @@ module LSH | |
| 79 82 | 
             
                end
         | 
| 80 83 |  | 
| 81 84 | 
             
                def query_ids_by_vector(vector, multiprobe_radius = 0)
         | 
| 82 | 
            -
                   | 
| 83 | 
            -
                  results  | 
| 84 | 
            -
                  vectors.each { |v| results << vector_to_id(v) }
         | 
| 85 | 
            -
                  results
         | 
| 85 | 
            +
                  results = query(vector, multiprobe_radius)
         | 
| 86 | 
            +
                  results.map { |result| vector_hash_to_id(result[:hash]) }
         | 
| 86 87 | 
             
                end
         | 
| 87 88 |  | 
| 88 89 | 
             
                def multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
         | 
| @@ -97,8 +98,11 @@ module LSH | |
| 97 98 | 
             
                  mp_arrays
         | 
| 98 99 | 
             
                end
         | 
| 99 100 |  | 
| 100 | 
            -
                def  | 
| 101 | 
            -
                   | 
| 101 | 
            +
                def order_results_by_similarity(vector, results)
         | 
| 102 | 
            +
                  # Faster than vectors.sort - we precompute all similarities to vector
         | 
| 103 | 
            +
                  # and order using those
         | 
| 104 | 
            +
                  similarities = results.map { |result| [ result[:hash], result[:id], result[:data], similarity(vector, result[:data]) ] }
         | 
| 105 | 
            +
                  similarities.sort_by { |hash, id, vector, sim| sim } .reverse .map { |vs| { :hash => vs[0], :id => vs[1], :data => vs[2] } }
         | 
| 102 106 | 
             
                end
         | 
| 103 107 |  | 
| 104 108 | 
             
                def hashes(vector)
         | 
| @@ -108,7 +112,7 @@ module LSH | |
| 108 112 | 
             
                  end
         | 
| 109 113 | 
             
                  hashes
         | 
| 110 114 | 
             
                end
         | 
| 111 | 
            -
             | 
| 115 | 
            +
             
         | 
| 112 116 | 
             
                def hash(vector, projection, bias = true)
         | 
| 113 117 | 
             
                  hash = []
         | 
| 114 118 | 
             
                  dot_products = (projection * vector.transpose).column(0).to_a
         | 
| @@ -128,6 +132,10 @@ module LSH | |
| 128 132 | 
             
                  hash
         | 
| 129 133 | 
             
                end
         | 
| 130 134 |  | 
| 135 | 
            +
                def hashes_are_binary?
         | 
| 136 | 
            +
                  storage.parameters[:window] == Float::INFINITY
         | 
| 137 | 
            +
                end
         | 
| 138 | 
            +
             | 
| 131 139 | 
             
                def random_vector(dim)
         | 
| 132 140 | 
             
                  MathUtil.random_gaussian_matrix(1, dim)
         | 
| 133 141 | 
             
                end
         | 
| @@ -150,7 +158,7 @@ module LSH | |
| 150 158 | 
             
                  #value
         | 
| 151 159 | 
             
                end
         | 
| 152 160 |  | 
| 153 | 
            -
             | 
| 161 | 
            +
                 def generate_projections(dim, k, l)
         | 
| 154 162 | 
             
                  projections = []
         | 
| 155 163 | 
             
                  l.times do |i|
         | 
| 156 164 | 
             
                    projections << generate_projection(dim, k)
         | 
| @@ -160,7 +168,7 @@ module LSH | |
| 160 168 |  | 
| 161 169 | 
             
                def generate_projection(dim, k)
         | 
| 162 170 | 
             
                  MathUtil.random_gaussian_matrix(k, dim)
         | 
| 163 | 
            -
             | 
| 171 | 
            +
                 end
         | 
| 164 172 |  | 
| 165 173 | 
             
                def similarity(v1, v2)
         | 
| 166 174 | 
             
                  MathUtil.dot(v1, v2)
         | 
    
        data/lib/lsh/math_util_gsl.rb
    CHANGED
    
    | @@ -51,14 +51,6 @@ module LSH | |
| 51 51 | 
             
                  v.norm
         | 
| 52 52 | 
             
                end
         | 
| 53 53 |  | 
| 54 | 
            -
                def self.uniq(vs)
         | 
| 55 | 
            -
                  # Can't use uniq as
         | 
| 56 | 
            -
                  # [ v, JSON.parse(v.to_json, :create_additions => true) ].uniq.size == 2 with GSL
         | 
| 57 | 
            -
                  results = []
         | 
| 58 | 
            -
                  vs.each { |v| results << v unless results.member? v }
         | 
| 59 | 
            -
                  results
         | 
| 60 | 
            -
                end
         | 
| 61 | 
            -
             | 
| 62 54 | 
             
              end
         | 
| 63 55 |  | 
| 64 56 | 
             
            end
         | 
    
        data/lib/lsh/math_util_jblas.rb
    CHANGED
    
    
    
        data/lib/lsh/storage/memory.rb
    CHANGED
    
    | @@ -30,6 +30,9 @@ module LSH | |
| 30 30 |  | 
| 31 31 | 
             
                  def reset!
         | 
| 32 32 | 
             
                    @buckets = nil
         | 
| 33 | 
            +
                    @vectors = nil
         | 
| 34 | 
            +
                    @vector_hash_to_id = nil
         | 
| 35 | 
            +
                    @id_to_vector = nil
         | 
| 33 36 | 
             
                  end
         | 
| 34 37 |  | 
| 35 38 | 
             
                  def create_new_bucket
         | 
| @@ -37,27 +40,32 @@ module LSH | |
| 37 40 | 
             
                    @buckets << {}
         | 
| 38 41 | 
             
                  end
         | 
| 39 42 |  | 
| 40 | 
            -
                  def  | 
| 43 | 
            +
                  def add_vector(vector, vector_hash)
         | 
| 44 | 
            +
                    @vectors ||= {}
         | 
| 45 | 
            +
                    @vectors[vector_hash] = vector
         | 
| 46 | 
            +
                  end
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                  def add_vector_hash_to_bucket(bucket, hash, vector_hash)
         | 
| 41 49 | 
             
                    if bucket.has_key? hash
         | 
| 42 | 
            -
                      bucket[hash] <<  | 
| 50 | 
            +
                      bucket[hash] << vector_hash
         | 
| 43 51 | 
             
                    else
         | 
| 44 | 
            -
                      bucket[hash] = [ | 
| 52 | 
            +
                      bucket[hash] = [vector_hash]
         | 
| 45 53 | 
             
                    end
         | 
| 46 54 | 
             
                  end
         | 
| 47 55 |  | 
| 48 | 
            -
                  def add_vector_id( | 
| 49 | 
            -
                    @ | 
| 50 | 
            -
                    @ | 
| 56 | 
            +
                  def add_vector_id(vector_hash, id)
         | 
| 57 | 
            +
                    @vector_hash_to_id ||= {}
         | 
| 58 | 
            +
                    @vector_hash_to_id[vector_hash] = id
         | 
| 51 59 | 
             
                    @id_to_vector ||= {}
         | 
| 52 | 
            -
                    @id_to_vector[id] =  | 
| 60 | 
            +
                    @id_to_vector[id] = vector_hash
         | 
| 53 61 | 
             
                  end
         | 
| 54 62 |  | 
| 55 | 
            -
                  def  | 
| 56 | 
            -
                    @ | 
| 63 | 
            +
                  def vector_hash_to_id(vector_hash)
         | 
| 64 | 
            +
                    @vector_hash_to_id[vector_hash] if @vector_hash_to_id
         | 
| 57 65 | 
             
                  end
         | 
| 58 66 |  | 
| 59 67 | 
             
                  def id_to_vector(id)
         | 
| 60 | 
            -
                    @id_to_vector[id] if @id_to_vector
         | 
| 68 | 
            +
                    @vectors[@id_to_vector[id]] if @id_to_vector
         | 
| 61 69 | 
             
                  end
         | 
| 62 70 |  | 
| 63 71 | 
             
                  def find_bucket(i)
         | 
| @@ -65,13 +73,22 @@ module LSH | |
| 65 73 | 
             
                  end
         | 
| 66 74 |  | 
| 67 75 | 
             
                  def query_buckets(hashes)
         | 
| 68 | 
            -
                     | 
| 76 | 
            +
                    results_hashes = {}
         | 
| 69 77 | 
             
                    hashes.each_with_index do |hash, i|
         | 
| 70 | 
            -
                       | 
| 71 | 
            -
                       | 
| 72 | 
            -
             | 
| 78 | 
            +
                      vectors_hashes_in_bucket = @buckets[i][hash]
         | 
| 79 | 
            +
                      if vectors_hashes_in_bucket
         | 
| 80 | 
            +
                        vectors_hashes_in_bucket.each do |vector_hash|
         | 
| 81 | 
            +
                          results_hashes[vector_hash] = true
         | 
| 82 | 
            +
                        end
         | 
| 83 | 
            +
                      end
         | 
| 84 | 
            +
                    end
         | 
| 85 | 
            +
                    results_hashes.keys.map do |vector_hash|
         | 
| 86 | 
            +
                      { 
         | 
| 87 | 
            +
                        :data => @vectors[vector_hash], 
         | 
| 88 | 
            +
                        :hash => vector_hash, 
         | 
| 89 | 
            +
                        :id => vector_hash_to_id(vector_hash)
         | 
| 90 | 
            +
                      }
         | 
| 73 91 | 
             
                    end
         | 
| 74 | 
            -
                    results
         | 
| 75 92 | 
             
                  end
         | 
| 76 93 |  | 
| 77 94 | 
             
                end
         | 
| @@ -28,10 +28,8 @@ module LSH | |
| 28 28 | 
             
                  def initialize(params = { :redis => { :host => '127.0.0.1', :port => 6379 }, :data_dir => 'data' })
         | 
| 29 29 | 
             
                    @redis = Redis.new(params[:redis])
         | 
| 30 30 | 
             
                    @data_dir = params[:data_dir]
         | 
| 31 | 
            -
                    unless File.exists?(@data_dir)
         | 
| 32 | 
            -
             | 
| 33 | 
            -
                      Dir.mkdir(File.join(@data_dir, 'projections'))
         | 
| 34 | 
            -
                    end
         | 
| 31 | 
            +
                    Dir.mkdir(@data_dir) unless File.exists?(@data_dir)
         | 
| 32 | 
            +
                    Dir.mkdir(File.join(@data_dir, 'projections')) unless File.exists?(File.join(@data_dir, 'projections'))
         | 
| 35 33 | 
             
                  end
         | 
| 36 34 |  | 
| 37 35 | 
             
                  def reset!
         | 
| @@ -42,6 +40,10 @@ module LSH | |
| 42 40 | 
             
                  def clear_data!
         | 
| 43 41 | 
             
                    keys = @redis.keys("lsh:bucket:*")
         | 
| 44 42 | 
             
                    @redis.del(keys) unless keys.empty?
         | 
| 43 | 
            +
                    keys = @redis.keys("lsh:vector_to_id:*")
         | 
| 44 | 
            +
                    @redis.del(keys) unless keys.empty?
         | 
| 45 | 
            +
                    keys = @redis.keys("lsh:id_to_vector:*")
         | 
| 46 | 
            +
                    @redis.del(keys) unless keys.empty?
         | 
| 45 47 | 
             
                    delete_dat_files_in_dir(@data_dir)
         | 
| 46 48 | 
             
                  end
         | 
| 47 49 |  | 
| @@ -107,8 +109,8 @@ module LSH | |
| 107 109 | 
             
                    @redis.incr "lsh:buckets"
         | 
| 108 110 | 
             
                  end
         | 
| 109 111 |  | 
| 110 | 
            -
                  def save_vector(vector)
         | 
| 111 | 
            -
                    path = File.join(@data_dir,  | 
| 112 | 
            +
                  def save_vector(vector, vector_hash)
         | 
| 113 | 
            +
                    path = File.join(@data_dir, vector_hash.to_s+'.dat')
         | 
| 112 114 | 
             
                    vector.save(path) unless File.exists?(path)
         | 
| 113 115 | 
             
                  end
         | 
| 114 116 |  | 
| @@ -118,19 +120,21 @@ module LSH | |
| 118 120 | 
             
                    vector
         | 
| 119 121 | 
             
                  end
         | 
| 120 122 |  | 
| 121 | 
            -
                  def  | 
| 122 | 
            -
                    save_vector(vector) # Writing vector to disk if not already there
         | 
| 123 | 
            -
                    @redis.sadd "#{bucket}:#{hash}", vector.hash.to_s # Only storing vector's hash in Redis
         | 
| 123 | 
            +
                  def add_vector(vector, vector_hash)
         | 
| 124 | 
            +
                    save_vector(vector, vector_hash) # Writing vector to disk if not already there
         | 
| 124 125 | 
             
                  end
         | 
| 125 126 |  | 
| 126 | 
            -
                  def  | 
| 127 | 
            -
                     | 
| 128 | 
            -
                    @redis.set "lsh:vector_to_id:#{vector.hash}", id
         | 
| 129 | 
            -
                    @redis.set "lsh:id_to_vector:#{id}", vector.hash.to_s
         | 
| 127 | 
            +
                  def add_vector_hash_to_bucket(bucket, hash, vector_hash)
         | 
| 128 | 
            +
                    @redis.sadd "#{bucket}:#{hash}", vector_hash.to_s # Only storing vector's hash in Redis
         | 
| 130 129 | 
             
                  end
         | 
| 131 130 |  | 
| 132 | 
            -
                  def  | 
| 133 | 
            -
                    @redis. | 
| 131 | 
            +
                  def add_vector_id(vector_hash, id)
         | 
| 132 | 
            +
                    @redis.set "lsh:vector_to_id:#{vector_hash}", id
         | 
| 133 | 
            +
                    @redis.set "lsh:id_to_vector:#{id}", vector_hash.to_s
         | 
| 134 | 
            +
                  end
         | 
| 135 | 
            +
             | 
| 136 | 
            +
                  def vector_hash_to_id(vector_hash)
         | 
| 137 | 
            +
                    @redis.get "lsh:vector_to_id:#{vector_hash}"
         | 
| 134 138 | 
             
                  end
         | 
| 135 139 |  | 
| 136 140 | 
             
                  def id_to_vector(id)
         | 
| @@ -143,21 +147,23 @@ module LSH | |
| 143 147 | 
             
                  end
         | 
| 144 148 |  | 
| 145 149 | 
             
                  def query_buckets(hashes)
         | 
| 146 | 
            -
                     | 
| 150 | 
            +
                    results_hashes = {}
         | 
| 147 151 | 
             
                    hashes.each_with_index do |hash, i|
         | 
| 148 152 | 
             
                      bucket = find_bucket(i)
         | 
| 149 | 
            -
                       | 
| 150 | 
            -
                       | 
| 153 | 
            +
                      vector_hashes_in_bucket = @redis.smembers("#{bucket}:#{hash}")
         | 
| 154 | 
            +
                      if vector_hashes_in_bucket
         | 
| 155 | 
            +
                        vector_hashes_in_bucket.each do |vector_hash|
         | 
| 156 | 
            +
                          results_hashes[vector_hash] = true
         | 
| 157 | 
            +
                        end
         | 
| 158 | 
            +
                      end
         | 
| 151 159 | 
             
                    end
         | 
| 152 | 
            -
                     | 
| 153 | 
            -
             | 
| 154 | 
            -
             | 
| 155 | 
            -
             | 
| 156 | 
            -
             | 
| 157 | 
            -
                       | 
| 158 | 
            -
                      results << vector
         | 
| 160 | 
            +
                    results_hashes.keys.map do |vector_hash|
         | 
| 161 | 
            +
                      {
         | 
| 162 | 
            +
                        :data => load_vector(vector_hash),
         | 
| 163 | 
            +
                        :hash => vector_hash.to_i,
         | 
| 164 | 
            +
                        :id => vector_hash_to_id(vector_hash)
         | 
| 165 | 
            +
                      }
         | 
| 159 166 | 
             
                    end
         | 
| 160 | 
            -
                    results
         | 
| 161 167 | 
             
                  end
         | 
| 162 168 |  | 
| 163 169 | 
             
                end
         | 
    
        data/lib/lsh/web.rb
    CHANGED
    
    | @@ -24,13 +24,7 @@ module LSH | |
| 24 24 | 
             
                  if mime_type == 'application/json'
         | 
| 25 25 | 
             
                    t0 = Time.now
         | 
| 26 26 | 
             
                    vector = JSON.parse(params[:data], :create_additions => true)
         | 
| 27 | 
            -
                     | 
| 28 | 
            -
                    results = []
         | 
| 29 | 
            -
                    if params[:include] == 'id'
         | 
| 30 | 
            -
                      result_vectors.each { |v| results << { :id => index.vector_to_id(v), :data => v } }
         | 
| 31 | 
            -
                    else
         | 
| 32 | 
            -
                      result_vectors.each { |v| results << { :data => v } }
         | 
| 33 | 
            -
                    end
         | 
| 27 | 
            +
                    results = index.query(vector, params[:radius] || 0)
         | 
| 34 28 | 
             
                    content_type :json
         | 
| 35 29 | 
             
                    { "time" => Time.now - t0, "results" => results }.to_json
         | 
| 36 30 | 
             
                  else
         |