annoy-rb 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,106 @@
1
+ #ifndef KISSRANDOM_H
2
+ #define KISSRANDOM_H
3
+
4
+ #if defined(_MSC_VER) && _MSC_VER == 1500
5
+ typedef unsigned __int32 uint32_t;
6
+ typedef unsigned __int64 uint64_t;
7
+ #else
8
+ #include <stdint.h>
9
+ #endif
10
+
11
+ // KISS = "keep it simple, stupid", but high quality random number generator
12
+ // http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf -> "Use a good RNG and build it into your code"
13
+ // http://mathforum.org/kb/message.jspa?messageID=6627731
14
+ // https://de.wikipedia.org/wiki/KISS_(Zufallszahlengenerator)
15
+
16
+ // 32 bit KISS
17
// 32-bit KISS generator. Each call advances three independent sub-generators
// (linear congruential, xorshift, multiply-with-carry) and adds their outputs.
struct Kiss32Random {
  uint32_t x;  // linear congruential state; the only word touched by seeding
  uint32_t y;  // xorshift state
  uint32_t z;  // multiply-with-carry value
  uint32_t c;  // multiply-with-carry carry

  // seed must be != 0 (not checked here)
  Kiss32Random(uint32_t seed = 123456789)
      : x(seed), y(362436000), z(521288629), c(7654321) {}

  // Advances the state and returns the next 32-bit draw (x + y + z, wrapping).
  uint32_t kiss() {
    // Linear congruential step, wrapping modulo 2^32.
    x = x * 69069 + 12345;

    // Xorshift step.
    y ^= y << 13;
    y ^= y >> 17;
    y ^= y << 5;

    // Multiply-with-carry step: the 64-bit product is split into the new
    // carry (upper 32 bits) and the new value (lower 32 bits).
    const uint64_t product = 698769069ULL * z + c;
    c = static_cast<uint32_t>(product >> 32);
    z = static_cast<uint32_t>(product);

    return x + y + z;
  }

  // Draws a random 0 or 1 (lowest bit of the next draw).
  inline int flip() {
    return static_cast<int>(kiss() & 1);
  }

  // Draws a random integer between 0 and n-1, where n is at most the number
  // of data points you have. n must not be 0 (it is used as a divisor).
  inline size_t index(size_t n) {
    const size_t draw = kiss();
    return draw % n;
  }

  // Replaces the linear congruential state with seed; y, z and c keep their
  // current values.
  inline void set_seed(uint32_t seed) {
    x = seed;
  }
};
59
+
60
+ // 64 bit KISS. Use this if you have more than about 2^24 data points ("big data" ;) )
61
// 64-bit KISS generator. Each call advances three independent sub-generators
// (linear congruential on z, xorshift on y, multiply-with-carry on x/c) and
// adds their outputs.
struct Kiss64Random {
  uint64_t x;  // multiply-with-carry value; the only word touched by seeding
  uint64_t y;  // xorshift state
  uint64_t z;  // linear congruential state
  uint64_t c;  // multiply-with-carry carry

  // seed must be != 0 (not checked here)
  Kiss64Random(uint64_t seed = 1234567890987654321ULL) {
    x = seed;
    y = 362436362436362436ULL;
    z = 1066149217761810ULL;
    c = 123456123456123456ULL;
  }

  // Advances the state and returns the next 64-bit draw (x + y + z, wrapping).
  uint64_t kiss() {
    // Linear congruence generator (wraps modulo 2^64)
    z = 6906969069ULL * z + 1234567;

    // Xor shift
    y ^= (y << 13);
    y ^= (y >> 17);
    y ^= (y << 43);

    // Multiply-with-carry, done in 64-bit pieces. Equivalent to:
    //   uint128_t t = (2^58 + 1) * x + c; c = t >> 64; x = (uint64_t) t
    uint64_t t = (x << 58) + c;
    c = (x >> 6);   // upper bits of x * 2^58
    x += t;
    c += (x < t);   // x wrapped around during the addition: propagate the carry

    return x + y + z;
  }

  // Draws a random 0 or 1 (lowest bit of the next draw).
  inline int flip() {
    return kiss() & 1;
  }

  // Draws a random integer between 0 and n-1, where n is at most the number
  // of data points you have. n must not be 0 (it is used as a divisor).
  inline size_t index(size_t n) {
    return kiss() % n;
  }

  // Replaces the multiply-with-carry value x with seed; y, z and c keep their
  // current values. The parameter is 64 bits wide, matching x and the
  // constructor, so that set_seed(s) reproduces the x state of
  // Kiss64Random(s) for every s instead of truncating s to 32 bits.
  // Narrower unsigned arguments convert implicitly.
  inline void set_seed(uint64_t seed) {
    x = seed;
  }
};
104
+
105
+ #endif
106
+ // vim: tabstop=2 shiftwidth=2
@@ -0,0 +1,174 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'annoy/version'
4
+ require 'annoy/annoy'
5
+
6
module Annoy
  # AnnoyIndex provides k-nearest neighbors search.
  # Its methods mirror Annoy's Python API (https://github.com/spotify/annoy#full-python-api).
  #
  # @example
  #   require 'annoy'
  #
  #   index = AnnoyIndex.new(n_features: 100, metric: 'euclidean')
  #
  #   5000.times do |item_id|
  #     item_vec = Array.new(100) { rand - 0.5 }
  #     index.add_item(item_id, item_vec)
  #   end
  #
  #   index.build(10)
  #
  #   index.get_nns_by_item(0, 100)
  #
  class AnnoyIndex
    # The number of features (dimensions) of an indexed item.
    # @return [Integer]
    attr_reader :n_features

    # The distance metric used by the index.
    # @return [String]
    attr_reader :metric

    # Create a new search index.
    #
    # @param n_features [Integer] The number of features (dimensions) of stored vector.
    #   Any Numeric is accepted and converted with #to_i.
    # @param metric [String] The distance metric between vectors
    #   ('angular', 'dot', 'hamming', 'euclidean', or 'manhattan').
    # @raise [ArgumentError] If n_features is not Numeric or metric is not one of the names above.
    def initialize(n_features:, metric: 'angular')
      unless n_features.is_a?(Numeric)
        raise ArgumentError, 'Expect n_features to be Integer.'
      end

      @n_features = n_features.to_i
      @metric = metric

      # Pick the metric-specific index class first, then instantiate it once.
      backend = case @metric
                when 'angular'   then AnnoyIndexAngular
                when 'dot'       then AnnoyIndexDotProduct
                when 'hamming'   then AnnoyIndexHamming
                when 'euclidean' then AnnoyIndexEuclidean
                when 'manhattan' then AnnoyIndexManhattan
                else raise ArgumentError, "No such metric: #{@metric}."
                end
      @index = backend.new(@n_features)
    end

    # Add an item to be indexed.
    #
    # @param i [Integer] The ID of item.
    # @param v [Array] The vector of item.
    # @return [Boolean]
    def add_item(i, v)
      @index.add_item(i, v)
    end

    # Build a forest of index trees. After building, no more items can be added.
    #
    # @param n_trees [Integer] The number of trees. More trees give higher search precision.
    # @return [Boolean]
    def build(n_trees)
      @index.build(n_trees)
    end

    # Save the search index to disk. After saving, no more items can be added.
    #
    # @param filename [String] The filename of search index.
    # @param prefault [Boolean] Passed through unchanged to the underlying index's save.
    # @return [Boolean]
    def save(filename, prefault: false)
      @index.save(filename, prefault)
    end

    # Load a search index from disk.
    #
    # @param filename [String] The filename of search index.
    # @param prefault [Boolean] The flag indicating whether to pre-read the entire file into memory.
    # @return [Boolean]
    def load(filename, prefault: false)
      @index.load(filename, prefault)
    end

    # Unload the search index.
    #
    # @return [Boolean]
    def unload
      @index.unload
    end

    # Search the n closest items to an already indexed item.
    #
    # @param i [Integer] The ID of query item.
    # @param n [Integer] The number of nearest neighbors.
    # @param search_k [Integer] The maximum number of nodes inspected during the search.
    #   If -1 is given, it sets to n * n_trees.
    # @param include_distances [Boolean] The flag indicating whether to return all corresponding distances.
    # @return [Array<Integer> or Array<Array<Integer>, Array<Float>>]
    def get_nns_by_item(i, n, search_k: -1, include_distances: false)
      @index.get_nns_by_item(i, n, search_k, include_distances)
    end

    # Search the n closest items to a query vector.
    #
    # @param v [Array] The vector of query item.
    # @param n [Integer] The number of nearest neighbors.
    # @param search_k [Integer] The maximum number of nodes inspected during the search.
    #   If -1 is given, it sets to n * n_trees.
    # @param include_distances [Boolean] The flag indicating whether to return all corresponding distances.
    # @return [Array<Integer> or Array<Array<Integer>, Array<Float>>]
    def get_nns_by_vector(v, n, search_k: -1, include_distances: false)
      @index.get_nns_by_vector(v, n, search_k, include_distances)
    end

    # Return the vector of an indexed item.
    #
    # @param i [Integer] The ID of item.
    # @return [Array]
    def get_item(i)
      @index.get_item(i)
    end

    # Calculate the distance between two indexed items.
    #
    # @param i [Integer] The ID of item.
    # @param j [Integer] The ID of item.
    # @return [Float or Integer]
    def get_distance(i, j)
      @index.get_distance(i, j)
    end

    # Return the number of items in the search index.
    #
    # @return [Integer]
    def n_items
      @index.get_n_items
    end

    # Return the number of trees in the search index.
    #
    # @return [Integer]
    def n_trees
      @index.get_n_trees
    end

    # Prepare annoy to build the index in the specified file instead of RAM
    # (call this method before adding items; no need to save after building).
    #
    # @param filename [String] The filename of search index.
    # @return [Boolean]
    def on_disk_build(filename)
      @index.on_disk_build(filename)
    end

    # Set to verbose mode.
    #
    # @param flag [Boolean]
    def verbose(flag)
      @index.verbose(flag)
    end

    # Set seed for the random number generator.
    #
    # @param s [Integer]
    def seed(s)
      @index.set_seed(s)
    end
  end
end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Annoy.rb is a Ruby wrapper for Annoy (Approximate Nearest Neighbors Oh Yeah).
4
# Namespace of Annoy.rb, a Ruby wrapper for Annoy (Approximate Nearest Neighbors Oh Yeah).
module Annoy
  # Version string of the Annoy.rb release in use (frozen).
  VERSION = '0.1.0'.freeze
end
metadata ADDED
@@ -0,0 +1,65 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: annoy-rb
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - yoshoku
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2020-08-08 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Annoy.rb is a Ruby binding for the Annoy (Approximate Nearest Neighbors
14
+ Oh Yeah).
15
+ email:
16
+ - yoshoku@outlook.com
17
+ executables: []
18
+ extensions:
19
+ - ext/annoy/extconf.rb
20
+ extra_rdoc_files: []
21
+ files:
22
+ - ".gitignore"
23
+ - ".rspec"
24
+ - ".travis.yml"
25
+ - CHANGELOG.md
26
+ - CODE_OF_CONDUCT.md
27
+ - Gemfile
28
+ - LICENSE.txt
29
+ - README.md
30
+ - Rakefile
31
+ - annoy-rb.gemspec
32
+ - ext/annoy/annoy.cpp
33
+ - ext/annoy/annoy.hpp
34
+ - ext/annoy/extconf.rb
35
+ - ext/annoy/src/annoylib.h
36
+ - ext/annoy/src/kissrandom.h
37
+ - lib/annoy.rb
38
+ - lib/annoy/version.rb
39
+ homepage: https://github.com/yoshoku/annoy.rb
40
+ licenses:
41
+ - Apache-2.0
42
+ metadata:
43
+ homepage_uri: https://github.com/yoshoku/annoy.rb
44
+ source_code_uri: https://github.com/yoshoku/annoy.rb
45
+ changelog_uri: https://github.com/yoshoku/annoy.rb/blob/master/CHANGELOG.md
46
+ post_install_message:
47
+ rdoc_options: []
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ required_rubygems_version: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: '0'
60
+ requirements: []
61
+ rubygems_version: 3.1.2
62
+ signing_key:
63
+ specification_version: 4
64
+ summary: Ruby binding for the Annoy (Approximate Nearest Neighbors Oh Yeah).
65
+ test_files: []