hanny 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 76848de111777349352ddeb7aa269ad3694504661ac2fe1c45729ebcd0f27414
4
+ data.tar.gz: a911da20689134ebecdd01b88827a2d2cc65149b497103f32f528fa754590c15
5
+ SHA512:
6
+ metadata.gz: 6c1e4fde8dc03f439454a476d16be7b48f384cb0adf24a17fce47e09387d4583e31ba66b3a9b7f6d9fb5ff6e386e9282fba68f05c9ff0cb74f88013e19d65b91
7
+ data.tar.gz: 30441f1aef6a05bc0a609d4b2176f4e04d7d0d9882579de8909aae9076dbdf68670eb4275da1a863dffa13454997597e83d229a1371f6a51bee0f58aa3740abd
@@ -0,0 +1 @@
1
+ service_name: travis-ci
@@ -0,0 +1,15 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
12
+
13
+ *.swp
14
+ .DS_Store
15
+ .ruby-version
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
@@ -0,0 +1,39 @@
1
+ AllCops:
2
+ TargetRubyVersion: 2.1
3
+ DisplayCopNames: true
4
+ DisplayStyleGuide: true
5
+
6
+ Documentation:
7
+ Enabled: false
8
+
9
+ Metrics/LineLength:
10
+ Max: 140
11
+ IgnoredPatterns: ['(\A|\s)#']
12
+
13
+ Metrics/ModuleLength:
14
+ Max: 200
15
+
16
+ Metrics/ClassLength:
17
+ Max: 200
18
+
19
+ Metrics/MethodLength:
20
+ Max: 40
21
+
22
+ Metrics/AbcSize:
23
+ Max: 60
24
+
25
+ Metrics/BlockLength:
26
+ Exclude:
27
+ - 'spec/**/*'
28
+
29
+ ParameterLists:
30
+ Max: 10
31
+
32
+ Security/MarshalLoad:
33
+ Enabled: false
34
+
35
+ Naming/UncommunicativeMethodParamName:
36
+ Enabled: false
37
+
38
+ Style/FormatStringToken:
39
+ Enabled: false
@@ -0,0 +1,11 @@
1
+ sudo: false
2
+ os: linux
3
+ dist: trusty
4
+ language: ruby
5
+ rvm:
6
+ - 2.2
7
+ - 2.3
8
+ - 2.4
9
+ - 2.5
10
+ before_install:
11
+ - gem install --no-document bundler -v '~> 1.16'
@@ -0,0 +1,74 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to making participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, gender identity and expression, level of experience,
9
+ nationality, personal appearance, race, religion, or sexual identity and
10
+ orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies both within project spaces and in public spaces
49
+ when an individual is representing the project or its community. Examples of
50
+ representing a project or community include using an official project e-mail
51
+ address, posting via an official social media account, or acting as an appointed
52
+ representative at an online or offline event. Representation of a project may be
53
+ further defined and clarified by project maintainers.
54
+
55
+ ## Enforcement
56
+
57
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
58
+ reported by contacting the project team at yoshoku@outlook.com. All
59
+ complaints will be reviewed and investigated and will result in a response that
60
+ is deemed necessary and appropriate to the circumstances. The project team is
61
+ obligated to maintain confidentiality with regard to the reporter of an incident.
62
+ Further details of specific enforcement policies may be posted separately.
63
+
64
+ Project maintainers who do not follow or enforce the Code of Conduct in good
65
+ faith may face temporary or permanent repercussions as determined by other
66
+ members of the project's leadership.
67
+
68
+ ## Attribution
69
+
70
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71
+ available at [http://contributor-covenant.org/version/1/4][version]
72
+
73
+ [homepage]: http://contributor-covenant.org
74
+ [version]: http://contributor-covenant.org/version/1/4/
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in hanny.gemspec
6
+ gemspec
@@ -0,0 +1,23 @@
1
+ Copyright (c) 2017 yoshoku
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ * Redistributions of source code must retain the above copyright notice, this
8
+ list of conditions and the following disclaimer.
9
+
10
+ * Redistributions in binary form must reproduce the above copyright notice,
11
+ this list of conditions and the following disclaimer in the documentation
12
+ and/or other materials provided with the distribution.
13
+
14
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,150 @@
1
+ # Hanny
2
+
3
+ [![Build Status](https://travis-ci.org/yoshoku/Hanny.svg?branch=master)](https://travis-ci.org/yoshoku/Hanny)
4
+ [![Coverage Status](https://coveralls.io/repos/github/yoshoku/Hanny/badge.svg?branch=master)](https://coveralls.io/github/yoshoku/Hanny?branch=master)
5
+ [![Gem Version](https://badge.fury.io/rb/hanny.svg)](https://badge.fury.io/rb/hanny)
6
+ [![BSD 2-Clause License](https://img.shields.io/badge/License-BSD%202--Clause-orange.svg)](https://github.com/yoshoku/Hanny/blob/master/LICENSE.txt)
7
+
8
+ Hanny is a Hash-based Approximate Nearest Neighbor (ANN) search library in Ruby.
9
+ Hash-based ANN converts vector data into binary codes and builds a hash table by using the binary codes as hash keys.
10
+ To build the hash table, Hanny uses Locality Sensitive Hashing (LSH) of approximating cosine similarity.
11
+ It is known that if the code length is sufficiently long (ex. greater than 128-bit), LSH can obtain high search performance.
12
+ In the experiment, Hanny achieved about twenty times faster search speed than the brute-force search by Euclidean distance.
13
+
14
+ ## Installation
15
+
16
+ Add this line to your application's Gemfile:
17
+
18
+ ```ruby
19
+ gem 'hanny'
20
+ ```
21
+
22
+ And then execute:
23
+
24
+ $ bundle
25
+
26
+ Or install it yourself as:
27
+
28
+ $ gem install hanny
29
+
30
+ ## Usage
31
+
32
+ ```ruby
33
+ require 'hanny'
34
+
35
+ # Prepare vector data for search targets and queries with Numo::DFloat (shape: [n_samples, n_features]).
36
+ targets = Numo::DFloat.new(5000, 512).rand
37
+ queries = Numo::DFloat.new(10, 512).rand
38
+
39
+ # Build a search index with 256-bit binary code.
40
+ index = Hanny::LSHIndex.new(code_length: 256)
41
+ index.build_index(targets)
42
+
43
+ # Obtain the Array<Integer> that has the data indices of 10-nearest neighbors for each query.
44
+ candidates = index.search_knn(queries, n_neighbors: 10)
45
+
46
+ # Obtain the Array<Integer> that has the data indices within Hamming radius of 4 for each query.
47
+ candidates = index.search_radius(queries, radius: 4)
48
+
49
+ # Calculate pairwise euclidean distances between the query and its neighbors.
50
+ query_id = 0
51
+ distances = Hanny::Utils.euclidean_distance(queries[query_id, true], targets[candidates[query_id], true])
52
+
53
+ # Add new data to the search index.
54
+ appended_data_ids = index.append_data(new_data)
55
+
56
+ # Remove the data from the search index.
57
+ removed_data_ids = index.remove_data([0, 1, 2])
58
+
59
+ # Save and load the search index with Marshal.
60
+ File.open('index.dat', 'wb') { |f| f.write(Marshal.dump(index)) }
61
+ index = Marshal.load(File.binread('index.dat'))
62
+ ```
63
+
64
+ ## Experiment
65
+
66
+ I confirmed the search speed of Hanny's LSH with the [MNIST](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#mnist) data set.
67
+ The experiment was carried out on a MacBook (Early 2016, Core m3 1.1 GHz CPU and 8 GB memory).
68
+
69
+ Code:
70
+ ```ruby
71
+ require 'benchmark'
72
+ require 'svmkit'
73
+ require 'hanny'
74
+
75
+ # Load MNIST data set.
76
+ samples, labels = SVMKit::Dataset.load_libsvm_file('mnist')
77
+ samples = Numo::DFloat.cast(samples)
78
+ queries = samples[0..5, true]
79
+ targets = samples[6..-1, true]
80
+ qlabels = labels[0..5]
81
+ tlabels = labels[6..-1]
82
+
83
+ # Build LSH search index.
84
+ index = Hanny::LSHIndex.new(code_length: 128, random_seed: 1)
85
+ index.build_index(targets)
86
+
87
+ # Run a benchmark test for finding 5-nearest neighbors.
88
+ n_queries = queries.shape[0]
89
+ n_neighbors = 5
90
+ Benchmark.bm 50 do |r|
91
+ r.report 'LSH' do
92
+ candidates = index.search_knn(queries, n_neighbors: n_neighbors)
93
+ n_queries.times do |m|
94
+ STDERR.write("\nquery label: %d, neighbors label: " % qlabels[m])
95
+ candidates[m].each { |n| STDERR.write("%d, " % tlabels[n]) }
96
+ end
97
+ STDERR.write("\n")
98
+ end
99
+ r.report 'Brute-force' do
100
+ distance_mat = Hanny::Utils.euclidean_distance(queries, targets)
101
+ candidates = Array.new(n_queries) do |n|
102
+ distance_mat[n, true].to_a.map.with_index.sort_by(&:first).map(&:last)[0...n_neighbors]
103
+ end
104
+ n_queries.times do |m|
105
+ STDERR.write("\nquery label: %d, neighbors label: " % qlabels[m])
106
+ candidates[m].each { |n| STDERR.write("%d, " % tlabels[n]) }
107
+ end
108
+ STDERR.write("\n")
109
+ end
110
+ end
111
+ ```
112
+
113
+ Result:
114
+ ```bash
115
+ user system total real
116
+ LSH
117
+ query label: 5, neighbors label: 5, 5, 5, 5, 5,
118
+ query label: 0, neighbors label: 0, 0, 0, 0, 0,
119
+ query label: 4, neighbors label: 4, 4, 4, 4, 4,
120
+ query label: 1, neighbors label: 1, 1, 1, 1, 1,
121
+ query label: 9, neighbors label: 9, 9, 9, 9, 9,
122
+ query label: 2, neighbors label: 2, 2, 2, 2, 2,
123
+ 0.290000 0.010000 0.300000 ( 0.307445)
124
+ Brute-force
125
+ query label: 5, neighbors label: 5, 5, 5, 3, 5,
126
+ query label: 0, neighbors label: 0, 0, 0, 0, 0,
127
+ query label: 4, neighbors label: 4, 4, 4, 4, 4,
128
+ query label: 1, neighbors label: 1, 1, 1, 1, 1,
129
+ query label: 9, neighbors label: 9, 9, 9, 9, 9,
130
+ query label: 2, neighbors label: 2, 2, 2, 2, 2,
131
+ 6.350000 0.280000 6.630000 ( 6.682365)
132
+ ```
133
+
134
+ ## Development
135
+
136
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
137
+
138
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
139
+
140
+ ## Contributing
141
+
142
+ Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/Hanny. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
143
+
144
+ ## License
145
+
146
+ The gem is available as open source under the terms of the [BSD 2-clause License](https://opensource.org/licenses/BSD-2-Clause).
147
+
148
+ ## Code of Conduct
149
+
150
+ Everyone interacting in the Hanny project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/Hanny/blob/master/CODE_OF_CONDUCT.md).
@@ -0,0 +1,6 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task default: :spec
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require 'hanny'
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require 'irb'
14
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,38 @@
1
+
2
+ lib = File.expand_path('lib', __dir__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'hanny/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'hanny'
8
+ spec.version = Hanny::VERSION
9
+ spec.authors = ['yoshoku']
10
+ spec.email = ['yoshoku@outlook.com']
11
+
12
+ spec.summary = 'Hanny is a Hash-based Approximate Nearest Neighbor search library in Ruby.'
13
+ spec.description = <<MSG
14
+ Hanny is a Hash-based Approximate Nearest Neighbor (ANN) search library in Ruby.
15
+ Hash-based ANN converts vector data into binary codes and builds a hash table by using the binary codes as hash keys.
16
+ To build the hash table, Hanny uses Locality Sensitive Hashing (LSH) of approximating cosine similarity.
17
+ It is known that if the code length is sufficiently long (ex. greater than 128-bit), LSH can obtain high search performance.
18
+ In the experiment, Hanny achieved about twenty times faster search speed than the brute-force search by Euclidean distance.
19
+ MSG
20
+ spec.homepage = 'https://github.com/yoshoku/hanny'
21
+ spec.license = 'BSD-2-Clause'
22
+
23
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
24
+ f.match(%r{^(test|spec|features)/})
25
+ end
26
+ spec.bindir = 'exe'
27
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
+ spec.require_paths = ['lib']
29
+
30
+ spec.required_ruby_version = '>= 2.1'
31
+
32
+ spec.add_runtime_dependency 'numo-narray', '>= 0.9.0'
33
+
34
+ spec.add_development_dependency 'bundler', '~> 1.16'
35
+ spec.add_development_dependency 'coveralls', '~> 0.8'
36
+ spec.add_development_dependency 'rake', '~> 10.0'
37
+ spec.add_development_dependency 'rspec', '~> 3.0'
38
+ end
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'zlib'
4
+ require 'numo/narray'
5
+
6
+ require 'hanny/version'
7
+ require 'hanny/utils'
8
+ require 'hanny/lsh_index'
@@ -0,0 +1,273 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Hanny
4
+ # LSHIndex is a class that builds a search index with Locality Sensitive Hashing (LSH) [1].
5
+ # It is known that if the code length is sufficiently long (ex. greater than 128-bit),
6
+ # LSH can obtain higher search performance than many popular hashing methods [2].
7
+ # In search process, LSHIndex obtains search results by sorting the data stored in hash table with Hamming distances
8
+ # between query binary code and binary hash keys.
9
+ #
10
+ # @example
11
+ # # Prepare vector data for search targets and queries with Numo::DFloat (shape: [n_samples, n_features]).
12
+ # targets = Numo::DFloat.new(5000, 512).rand
13
+ # queries = Numo::DFloat.new(10, 512).rand
14
+ #
15
+ # # Build a search index with 256-bit binary code via LSH.
16
+   #   # Although LSHIndex works without setting random_seed, setting it is recommended for reproducibility.
17
+ # index = Hanny::LSHIndex.new(code_length: 256, random_seed: 1)
18
+ # index.build_index(targets)
19
+ #
20
+ # # Obtain the Array<Integer> that has the data indices of 10-neighbors for each query.
21
+ # candidates = index.search_knn(queries, n_neighbors: 10)
22
+ #
23
+ # # Save and load the search index with Marshal.
24
+ # File.open('index.dat', 'wb') { |f| f.write(Marshal.dump(index)) }
25
+ # index = Marshal.load(File.binread('index.dat'))
26
+ #
27
+ # *References:*
28
+ # 1. Moses S. Charikar, "Similarity Estimation Techniques from Rounding Algorithms," Proc. of the 34-th Annual ACM Symposium on Theory of Computing, pp. 380--388, (2002).
29
+ # 1. Deng Cai, "A Revisit of Hashing Algorithms for Approximate Nearest Neighbor Search," CoRR abs/1612.07545 (2016).
30
+ class LSHIndex
31
+ # Return the code length of hash key.
32
+ # @return [Integer]
33
+ attr_reader :code_length
34
+
35
+ # Return the number of samples of indexed data.
36
+ # @return [Integer]
37
+ attr_reader :n_samples
38
+
39
+ # Return the number of features of indexed data.
40
+ # @return [Integer]
41
+ attr_reader :n_features
42
+
43
+ # Return the number of hash keys.
44
+ # @return [Integer]
45
+ attr_reader :n_keys
46
+
47
+ # Return the hash table.
48
+ # @return [Hash]
49
+ attr_reader :hash_table
50
+
51
+ # Return the binary hash codes.
52
+ # @return [Numo::Bit]
53
+ attr_reader :hash_codes
54
+
55
+ # Return the seed to initialize random number generator.
56
+ # @return [Integer]
57
+ attr_reader :random_seed
58
+
59
+ # Return the random generator to generate random matrix.
60
+ # @return [Random]
61
+ attr_reader :rng
62
+
63
+ # Create a new nearest neighbor index.
64
+ # @param code_length [Integer] The length of binary code for hash key.
65
+ # @param random_seed [Integer/NilClass] The seed value using to initialize the random generator.
66
+ def initialize(code_length: 256, random_seed: nil)
67
+ @code_length = code_length
68
+ @n_samples = nil
69
+ @n_features = nil
70
+ @n_keys = nil
71
+ @last_id = nil
72
+ @weight_mat = nil
73
+ @hash_table = nil
74
+ @hash_codes = nil
75
+ @random_seed = random_seed
76
+ @random_seed ||= srand
77
+ @rng = Random.new(@random_seed)
78
+ end
79
+
80
+ # Convert data into binary codes.
81
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be converted to binary codes.
82
+ # @return [Numo::Bit] The binary codes converted from given data.
83
+ def hash_function(x)
84
+ x.dot(@weight_mat).ge(0.0)
85
+ end
86
+
87
+ # Build a search index.
88
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The dataset for building search index.
89
+     # @return [LSHIndex] The search index itself that has constructed the hash table.
90
+ def build_index(x)
91
+ # Initialize some variables.
92
+ @n_samples, @n_features = x.shape
93
+ @hash_table = {}
94
+ @hash_codes = []
95
+ @weight_mat = Utils.rand_normal([@n_features, @code_length], @rng)
96
+ # Convert samples to binary codes.
97
+ bin_x = hash_function(x)
98
+ # Store samples to binary hash table.
99
+ @n_samples.times do |m|
100
+ bin_code = bin_x[m, true]
101
+ hash_key = symbolized_hash_key(bin_code)
102
+ unless @hash_table.key?(hash_key)
103
+ @hash_codes.push(bin_code.to_a)
104
+ @hash_table[hash_key] = []
105
+ end
106
+ @hash_table[hash_key].push(m)
107
+ end
108
+ @hash_codes = Numo::Bit.cast(@hash_codes)
109
+ # Update some variables.
110
+ @n_keys = @hash_codes.shape[0]
111
+ @last_id = @n_samples
112
+ self
113
+ end
114
+
115
+ # Append new data to the search index.
116
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The dataset to append to search index.
117
+ # @return [Array<Integer>] The indices of appended data in search index
118
+ def append_data(x)
119
+ # Initialize some variables.
120
+ n_new_samples, = x.shape
121
+ bin_x = hash_function(x)
122
+ added_data_ids = []
123
+ # Store samples to binary hash table.
124
+ new_codes = []
125
+ n_new_samples.times do |m|
126
+ bin_code = bin_x[m, true]
127
+ hash_key = symbolized_hash_key(bin_code)
128
+ unless @hash_table.key?(hash_key)
129
+ new_codes.push(bin_code.to_a)
130
+ @hash_table[hash_key] = []
131
+ end
132
+ new_data_id = @last_id + m
133
+ @hash_table[hash_key].push(new_data_id)
134
+ added_data_ids.push(new_data_id)
135
+ end
136
+ # Update hash codes.
137
+ unless new_codes.empty?
138
+ new_codes = Numo::Bit.cast(new_codes)
139
+ @hash_codes = @hash_codes.concatenate(new_codes)
140
+ @n_keys = @hash_codes.shape[0]
141
+ end
142
+ @last_id += n_new_samples
143
+ @n_samples += n_new_samples
144
+ added_data_ids
145
+ end
146
+
147
+ # Remove data from the search index.
148
+ # The indices of removed data will never be assigned unless the search index is rebuilt.
149
+ # @param data_ids [Array<Integer>] The data indices to be removed.
150
+ # @return [Array<Integer>] The indices of removed data in search index
151
+ def remove_data(data_ids)
152
+ removed_data_ids = []
153
+ data_ids.each do |query_id|
154
+ # Remove data id from hash table.
155
+ hash_key = @hash_table.keys.select { |k| @hash_table[k].include?(query_id) }.first
156
+ next if hash_key.nil?
157
+ @hash_table[hash_key].delete(query_id)
158
+ removed_data_ids.push(query_id)
159
+ # Remove the hash key if there is no data.
160
+ next unless @hash_table[hash_key].empty?
161
+ target_id = distances_to_hash_codes(decoded_hash_key(hash_key)).index(0)
162
+ @hash_codes = @hash_codes.delete(target_id, 0)
163
+ end
164
+ @n_samples -= removed_data_ids.size
165
+ removed_data_ids
166
+ end
167
+
168
+ # Perform k-nearest neighbor search.
169
+ # @param q [Numo::DFloat] (shape: [n_queries, n_features]) The data for search queries.
170
+ # @param n_neighbors [Integer] The number of neighbors.
171
+     # @return [Array<Array<Integer>>] The data indices of search result for each query.
172
+ def search_knn(q, n_neighbors: 10)
173
+ # Initialize some variables.
174
+ n_queries, = q.shape
175
+ candidates = Array.new(n_queries) { [] }
176
+ # Binarize queries.
177
+ bin_q = hash_function(q)
178
+ # Find k-nearest neighbors for each query.
179
+ n_queries.times do |m|
180
+ sort_with_index(distances_to_hash_codes(bin_q[m, true])).each do |_, n|
181
+ candidates[m] = candidates[m] | @hash_table[symbolized_hash_key(@hash_codes[n, true])]
182
+ break if candidates[m].size >= n_neighbors
183
+ end
184
+ candidates[m] = candidates[m].shift(n_neighbors)
185
+ end
186
+ candidates
187
+ end
188
+
189
+ # Perform hamming radius nearest neighbor search.
190
+ # @param q [Numo::DFloat] (shape: [n_queries, n_features]) The data for search queries.
191
+ # @param radius [Float] The hamming radius for search range.
192
+     # @return [Array<Array<Integer>>] The data indices of search result for each query.
193
+ def search_radius(q, radius: 1)
194
+ # Initialize some variables.
195
+ n_queries, = q.shape
196
+ candidates = Array.new(n_queries) { [] }
197
+ # Binarize queries.
198
+ bin_q = hash_function(q)
199
+       # Collect the data within the Hamming radius for each query.
200
+ n_queries.times do |m|
201
+ sort_with_index(distances_to_hash_codes(bin_q[m, true])).each do |d, n|
202
+ break if d > radius
203
+ candidates[m] = candidates[m] | @hash_table[symbolized_hash_key(@hash_codes[n, true])]
204
+ end
205
+ end
206
+ candidates
207
+ end
208
+
209
+ # Dump marshal data.
210
+ # @return [Hash] The marshal data for search index.
211
+ def marshal_dump
212
+ { code_length: @code_length,
213
+ n_samples: @n_samples,
214
+ n_features: @n_features,
215
+ n_keys: @n_keys,
216
+ last_id: @last_id,
217
+ weight_mat: @weight_mat,
218
+ bias_vec: @bias_vec,
219
+ hash_table: @hash_table,
220
+ hash_codes: @hash_codes,
221
+ random_seed: @random_seed,
222
+ rng: @rng }
223
+ end
224
+
225
+ # Load marshal data.
226
+ # @return [nil]
227
+ def marshal_load(obj)
228
+ @code_length = obj[:code_length]
229
+ @n_samples = obj[:n_samples]
230
+ @n_features = obj[:n_features]
231
+ @n_keys = obj[:n_keys]
232
+ @last_id = obj[:last_id]
233
+ @weight_mat = obj[:weight_mat]
234
+ @bias_vec = obj[:bias_vec]
235
+ @hash_table = obj[:hash_table]
236
+ @hash_codes = obj[:hash_codes]
237
+ @random_seed = obj[:random_seed]
238
+ @rng = obj[:rng]
239
+ nil
240
+ end
241
+
242
+ private
243
+
244
+ # Convert binary code to symbol as hash key.
245
+ # @param bin_code [Numo::Bit]
246
+ # @return [Symbol]
247
+ def symbolized_hash_key(bin_code)
248
+ Zlib::Deflate.deflate(bin_code.to_a.join, Zlib::BEST_SPEED).to_sym
249
+ end
250
+
251
+ # Calculate hamming distances between binary code and binary hash keys.
252
+ # @param bin_code [Numo::Bit]
253
+ # @return [Array<Float>]
254
+ def distances_to_hash_codes(bin_code)
255
+ (bin_code ^ @hash_codes).count(1).to_a
256
+ end
257
+
258
+ # Sort array elements with indices.
259
+ # @param arr [Array<Float>]
260
+ # @return [Array<Float, Integer>]
261
+ def sort_with_index(arr)
262
+ arr.map.with_index.sort_by(&:first)
263
+ end
264
+
265
+ # Convert hash key symbol to binary code.
266
+ # @param hash_key [Symbol]
267
+ # @return [Numo::Bit]
268
+ def decoded_hash_key(hash_key)
269
+ bin_code = Zlib::Inflate.inflate(hash_key.to_s).split('').map(&:to_i)
270
+ Numo::Bit[*bin_code]
271
+ end
272
+ end
273
+ end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Hanny
4
+ # This module consists of utility methods.
5
+ module Utils
6
+ class << self
7
+ # Calculate pairwise euclidean distances between x and y.
8
+ # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
9
+ # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
10
+ # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
11
+ def euclidean_distance(x, y = nil)
12
+ y = x if y.nil?
13
+ x = Numo::DFloat[x] if x.shape[1].nil?
14
+ y = Numo::DFloat[y] if y.shape[1].nil?
15
+ sum_x_vec = (x**2).sum(1)
16
+ sum_y_vec = (y**2).sum(1)
17
+ dot_xy_mat = x.dot(y.transpose)
18
+ distance_matrix = dot_xy_mat * -2.0 +
19
+ sum_x_vec.tile(y.shape[0], 1).transpose +
20
+ sum_y_vec.tile(x.shape[0], 1)
21
+ Numo::NMath.sqrt(distance_matrix.abs)
22
+ end
23
+
24
+ # Generate a uniform random matrix with random number generator.
25
+ # @param shape [Array<Integer>] The size of random matrix.
26
+ # @param rng [Random] The random number generator
27
+ # @return [Numo::DFloat] The generated uniform random matrix.
28
+ def rand_uniform(shape, rng)
29
+ rnd_vals = Array.new(shape.inject(:*)) { rng.rand }
30
+ Numo::DFloat.asarray(rnd_vals).reshape(shape[0], shape[1])
31
+ end
32
+
33
+ # Generate a normal random matrix with random number generator.
34
+ # @param shape [Array<Integer>] The size of random matrix.
35
+ # @param rng [Random] The random number generator
36
+ # @return [Numo::DFloat] The generated normal random matrix.
37
+ def rand_normal(shape, rng, mu = 0.0, sigma = 1.0)
38
+ a = rand_uniform(shape, rng)
39
+ b = rand_uniform(shape, rng)
40
+ (Numo::NMath.sqrt(Numo::NMath.log(a) * -2.0) * Numo::NMath.sin(b * 2.0 * Math::PI)) * sigma + mu
41
+ end
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Hanny is a hash-based approximate nearest neighbor search library.
4
+ module Hanny
5
+ # @!visibility private
6
+ VERSION = '0.1.0'.freeze
7
+ end
metadata ADDED
@@ -0,0 +1,136 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hanny
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - yoshoku
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-05-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: numo-narray
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.9.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.9.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.16'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.16'
41
+ - !ruby/object:Gem::Dependency
42
+ name: coveralls
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.8'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.8'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '10.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '10.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.0'
83
+ description: |
84
+ Hanny is a Hash-based Approximate Nearest Neighbor (ANN) search library in Ruby.
85
+ Hash-based ANN converts vector data into binary codes and builds a hash table by using the binary codes as hash keys.
86
+ To build the hash table, Hanny uses Locality Sensitive Hashing (LSH) of approximating cosine similarity.
87
+ It is known that if the code length is sufficiently long (ex. greater than 128-bit), LSH can obtain high search performance.
88
+ In the experiment, Hanny achieved about twenty times faster search speed than the brute-force search by Euclidean distance.
89
+ email:
90
+ - yoshoku@outlook.com
91
+ executables: []
92
+ extensions: []
93
+ extra_rdoc_files: []
94
+ files:
95
+ - ".coveralls.yml"
96
+ - ".gitignore"
97
+ - ".rspec"
98
+ - ".rubocop.yml"
99
+ - ".travis.yml"
100
+ - CODE_OF_CONDUCT.md
101
+ - Gemfile
102
+ - LICENSE.txt
103
+ - README.md
104
+ - Rakefile
105
+ - bin/console
106
+ - bin/setup
107
+ - hanny.gemspec
108
+ - lib/hanny.rb
109
+ - lib/hanny/lsh_index.rb
110
+ - lib/hanny/utils.rb
111
+ - lib/hanny/version.rb
112
+ homepage: https://github.com/yoshoku/hanny
113
+ licenses:
114
+ - BSD-2-Clause
115
+ metadata: {}
116
+ post_install_message:
117
+ rdoc_options: []
118
+ require_paths:
119
+ - lib
120
+ required_ruby_version: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '2.1'
125
+ required_rubygems_version: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - ">="
128
+ - !ruby/object:Gem::Version
129
+ version: '0'
130
+ requirements: []
131
+ rubyforge_project:
132
+ rubygems_version: 2.7.6
133
+ signing_key:
134
+ specification_version: 4
135
+ summary: Hanny is a Hash-based Approximate Nearest Neighbor search library in Ruby.
136
+ test_files: []