annoy-rb 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +21 -0
- data/.rspec +3 -0
- data/.travis.yml +12 -0
- data/CHANGELOG.md +2 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +177 -0
- data/README.md +58 -0
- data/Rakefile +14 -0
- data/annoy-rb.gemspec +27 -0
- data/ext/annoy/annoy.cpp +30 -0
- data/ext/annoy/annoy.hpp +300 -0
- data/ext/annoy/extconf.rb +9 -0
- data/ext/annoy/src/annoylib.h +1334 -0
- data/ext/annoy/src/kissrandom.h +106 -0
- data/lib/annoy.rb +174 -0
- data/lib/annoy/version.rb +7 -0
- metadata +65 -0
@@ -0,0 +1,106 @@
|
|
1
|
+
#ifndef KISSRANDOM_H
|
2
|
+
#define KISSRANDOM_H
|
3
|
+
|
4
|
+
#if defined(_MSC_VER) && _MSC_VER == 1500
|
5
|
+
typedef unsigned __int32 uint32_t;
|
6
|
+
typedef unsigned __int64 uint64_t;
|
7
|
+
#else
|
8
|
+
#include <stdint.h>
|
9
|
+
#endif
|
10
|
+
|
11
|
+
// KISS = "keep it simple, stupid", but high quality random number generator
|
12
|
+
// http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf -> "Use a good RNG and build it into your code"
|
13
|
+
// http://mathforum.org/kb/message.jspa?messageID=6627731
|
14
|
+
// https://de.wikipedia.org/wiki/KISS_(Zufallszahlengenerator)
|
15
|
+
|
16
|
+
// 32 bit KISS
//
// Combines three small generators: a linear congruential generator (x), a
// xorshift generator (y) and a multiply-with-carry generator (z with carry c).
// Each call to kiss() advances all three and returns the sum of their states.
struct Kiss32Random {
  uint32_t x;  // linear congruential state; this is the word the seed is stored in
  uint32_t y;  // xorshift state
  uint32_t z;  // multiply-with-carry state
  uint32_t c;  // multiply-with-carry carry

  // seed must be != 0
  Kiss32Random(uint32_t seed = 123456789) {
    set_seed(seed);
  }

  // Advance the generator and return the next 32-bit value.
  uint32_t kiss() {
    // Linear congruence generator
    x = 69069 * x + 12345;

    // Xor shift
    y ^= y << 13;
    y ^= y >> 17;
    y ^= y << 5;

    // Multiply-with-carry: the 64-bit product is split into the new carry
    // (high 32 bits) and the new state (low 32 bits).
    uint64_t t = 698769069ULL * z + c;
    c = t >> 32;
    z = (uint32_t) t;

    return x + y + z;
  }

  // Draw random 0 or 1
  inline int flip() {
    return kiss() & 1;
  }

  // Draw random integer between 0 and n-1 where n is at most the number of data points you have.
  // n must be non-zero (the result is computed as kiss() % n).
  inline size_t index(size_t n) {
    return kiss() % n;
  }

  // Re-seed the generator.
  //
  // All four state words are reset, not just x, so that calling set_seed(s) on
  // a generator that has already produced values yields the same sequence as a
  // freshly constructed Kiss32Random(s).
  inline void set_seed(uint32_t seed) {
    x = seed;
    y = 362436000;
    z = 521288629;
    c = 7654321;
  }
};
|
59
|
+
|
60
|
+
// 64 bit KISS. Use this if you have more than about 2^24 data points ("big data" ;) )
//
// Same construction as the 32 bit variant, on 64-bit words: a linear
// congruential generator (z), a xorshift generator (y) and a
// multiply-with-carry generator (x with carry c). Each call to kiss()
// advances all three and returns the sum of their states.
struct Kiss64Random {
  uint64_t x;  // multiply-with-carry state; this is the word the seed is stored in
  uint64_t y;  // xorshift state
  uint64_t z;  // linear congruential state
  uint64_t c;  // multiply-with-carry carry

  // seed must be != 0
  Kiss64Random(uint64_t seed = 1234567890987654321ULL) {
    set_seed(seed);
  }

  // Advance the generator and return the next 64-bit value.
  uint64_t kiss() {
    // Linear congruence generator
    z = 6906969069LL*z+1234567;

    // Xor shift
    y ^= (y<<13);
    y ^= (y>>17);
    y ^= (y<<43);

    // Multiply-with-carry (uint128_t t = (2^58 + 1) * x + c; c = t >> 64; x = (uint64_t) t)
    // Done without a 128-bit type: (x<<58)+c is the low word of 2^58*x + c,
    // x>>6 is its high word, and (x<t) recovers the carry out of the addition.
    uint64_t t = (x<<58)+c;
    c = (x>>6);
    x += t;
    c += (x<t);

    return x + y + z;
  }

  // Draw random 0 or 1
  inline int flip() {
    return kiss() & 1;
  }

  // Draw random integer between 0 and n-1 where n is at most the number of data points you have.
  // n must be non-zero (the result is computed as kiss() % n).
  inline size_t index(size_t n) {
    return kiss() % n;
  }

  // Re-seed the generator.
  //
  // The seed is taken as a full 64-bit value (it previously was narrowed to
  // 32 bits, discarding the upper half of the seed). All four state words are
  // reset, so set_seed(s) on a used generator yields the same sequence as a
  // freshly constructed Kiss64Random(s).
  inline void set_seed(uint64_t seed) {
    x = seed;
    y = 362436362436362436ULL;
    z = 1066149217761810ULL;
    c = 123456123456123456ULL;
  }
};
|
104
|
+
|
105
|
+
#endif
|
106
|
+
// vim: tabstop=2 shiftwidth=2
|
data/lib/annoy.rb
ADDED
@@ -0,0 +1,174 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'annoy/version'
|
4
|
+
require 'annoy/annoy'
|
5
|
+
|
6
|
+
module Annoy
  # AnnoyIndex provides k-nearest neighbors search by delegating every operation
  # to a metric-specific index object chosen at construction time.
  # The method set follows Annoy's Python API (https://github.com/spotify/annoy#full-python-api).
  #
  # @example
  #   require 'annoy'
  #
  #   index = Annoy::AnnoyIndex.new(n_features: 100, metric: 'euclidean')
  #
  #   5000.times do |item_id|
  #     item_vec = Array.new(100) { rand - 0.5 }
  #     index.add_item(item_id, item_vec)
  #   end
  #
  #   index.build(10)
  #
  #   index.get_nns_by_item(0, 100)
  #
  class AnnoyIndex
    # The number of features (dimensions) of each indexed item.
    # @return [Integer]
    attr_reader :n_features

    # The distance metric this index was created with.
    # @return [String]
    attr_reader :metric

    # Create a new search index.
    #
    # @param n_features [Integer] The number of features (dimensions) of stored vector.
    #   Any Numeric is accepted and converted with #to_i.
    # @param metric [String] The distance metric between vectors ('angular', 'dot', 'hamming', 'euclidean', or 'manhattan').
    # @raise [ArgumentError] If n_features is not Numeric or the metric name is unknown.
    def initialize(n_features:, metric: 'angular')
      unless n_features.is_a?(Numeric)
        raise ArgumentError, 'Expect n_features to be Integer.'
      end

      @n_features = n_features.to_i
      @metric = metric

      # Pick the backend class first, then instantiate it once.
      backend = case metric
                when 'angular' then AnnoyIndexAngular
                when 'dot' then AnnoyIndexDotProduct
                when 'hamming' then AnnoyIndexHamming
                when 'euclidean' then AnnoyIndexEuclidean
                when 'manhattan' then AnnoyIndexManhattan
                else raise ArgumentError, "No such metric: #{metric}."
                end
      @index = backend.new(@n_features)
    end

    # Register an item with the index.
    #
    # @param i [Integer] The ID of item.
    # @param v [Array] The vector of item.
    # @return [Boolean]
    def add_item(i, v)
      @index.add_item(i, v)
    end

    # Build a forest of index trees. Once built, no more items can be added.
    #
    # @param n_trees [Integer] The number of trees. More trees gives higher search precision.
    # @return [Boolean]
    def build(n_trees)
      @index.build(n_trees)
    end

    # Write the search index to disk. Once saved, no more items can be added.
    #
    # @param filename [String] The filename of search index.
    # @param prefault [Boolean] Passed through unchanged to the underlying index's save.
    # @return [Boolean]
    def save(filename, prefault: false)
      @index.save(filename, prefault)
    end

    # Read a search index from disk.
    #
    # @param filename [String] The filename of search index.
    # @param prefault [Boolean] The flag indicating whether to pre-read the entire file into memory.
    # @return [Boolean]
    def load(filename, prefault: false)
      @index.load(filename, prefault)
    end

    # Unload the search index.
    #
    # @return [Boolean]
    def unload
      @index.unload
    end

    # Search the n closest items to an already indexed item.
    #
    # @param i [Integer] The ID of query item.
    # @param n [Integer] The number of nearest neighbors.
    # @param search_k [Integer] The maximum number of nodes inspected during the search. If -1 is given, it sets to n * n_trees.
    # @param include_distances [Boolean] The flag indicating whether to return all corresponding distances.
    # @return [Array<Integer> or Array<Array<Integer>, Array<Float>>]
    def get_nns_by_item(i, n, search_k: -1, include_distances: false)
      @index.get_nns_by_item(i, n, search_k, include_distances)
    end

    # Search the n closest items to an arbitrary query vector.
    #
    # @param v [Array] The vector of query item.
    # @param n [Integer] The number of nearest neighbors.
    # @param search_k [Integer] The maximum number of nodes inspected during the search. If -1 is given, it sets to n * n_trees.
    # @param include_distances [Boolean] The flag indicating whether to return all corresponding distances.
    # @return [Array<Integer> or Array<Array<Integer>, Array<Float>>]
    def get_nns_by_vector(v, n, search_k: -1, include_distances: false)
      @index.get_nns_by_vector(v, n, search_k, include_distances)
    end

    # Fetch the stored vector of an item.
    #
    # @param i [Integer] The ID of item.
    # @return [Array]
    def get_item(i)
      @index.get_item(i)
    end

    # Compute the distance between two indexed items.
    #
    # @param i [Integer] The ID of item.
    # @param j [Integer] The ID of item.
    # @return [Float or Integer]
    def get_distance(i, j)
      @index.get_distance(i, j)
    end

    # The number of items in the search index.
    # @return [Integer]
    def n_items
      @index.get_n_items
    end

    # The number of trees in the search index.
    # @return [Integer]
    def n_trees
      @index.get_n_trees
    end

    # Prepare annoy to build the index in the specified file instead of RAM.
    # (call this method before adding items, no need to save after building).
    #
    # @param filename [String] The filename of search index.
    # @return [Boolean]
    def on_disk_build(filename)
      @index.on_disk_build(filename)
    end

    # Switch verbose mode on or off.
    #
    # @param flag [Boolean]
    def verbose(flag)
      @index.verbose(flag)
    end

    # Set seed for the random number generator.
    #
    # @param s [Integer]
    def seed(s)
      @index.set_seed(s)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: annoy-rb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- yoshoku
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-08-08 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Annoy.rb is a Ruby binding for the Annoy (Approximate Nearest Neighbors
|
14
|
+
Oh Yeah).
|
15
|
+
email:
|
16
|
+
- yoshoku@outlook.com
|
17
|
+
executables: []
|
18
|
+
extensions:
|
19
|
+
- ext/annoy/extconf.rb
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- ".gitignore"
|
23
|
+
- ".rspec"
|
24
|
+
- ".travis.yml"
|
25
|
+
- CHANGELOG.md
|
26
|
+
- CODE_OF_CONDUCT.md
|
27
|
+
- Gemfile
|
28
|
+
- LICENSE.txt
|
29
|
+
- README.md
|
30
|
+
- Rakefile
|
31
|
+
- annoy-rb.gemspec
|
32
|
+
- ext/annoy/annoy.cpp
|
33
|
+
- ext/annoy/annoy.hpp
|
34
|
+
- ext/annoy/extconf.rb
|
35
|
+
- ext/annoy/src/annoylib.h
|
36
|
+
- ext/annoy/src/kissrandom.h
|
37
|
+
- lib/annoy.rb
|
38
|
+
- lib/annoy/version.rb
|
39
|
+
homepage: https://github.com/yoshoku/annoy.rb
|
40
|
+
licenses:
|
41
|
+
- Apache-2.0
|
42
|
+
metadata:
|
43
|
+
homepage_uri: https://github.com/yoshoku/annoy.rb
|
44
|
+
source_code_uri: https://github.com/yoshoku/annoy.rb
|
45
|
+
changelog_uri: https://github.com/yoshoku/annoy.rb/blob/master/CHANGELOG.md
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '0'
|
60
|
+
requirements: []
|
61
|
+
rubygems_version: 3.1.2
|
62
|
+
signing_key:
|
63
|
+
specification_version: 4
|
64
|
+
summary: Ruby binding for the Annoy (Approximate Nearest Neighbors Oh Yeah).
|
65
|
+
test_files: []
|