annoy-rb 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +21 -0
- data/.rspec +3 -0
- data/.travis.yml +12 -0
- data/CHANGELOG.md +2 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +177 -0
- data/README.md +58 -0
- data/Rakefile +14 -0
- data/annoy-rb.gemspec +27 -0
- data/ext/annoy/annoy.cpp +30 -0
- data/ext/annoy/annoy.hpp +300 -0
- data/ext/annoy/extconf.rb +9 -0
- data/ext/annoy/src/annoylib.h +1334 -0
- data/ext/annoy/src/kissrandom.h +106 -0
- data/lib/annoy.rb +174 -0
- data/lib/annoy/version.rb +7 -0
- metadata +65 -0
@@ -0,0 +1,106 @@
|
|
1
|
+
#ifndef KISSRANDOM_H
|
2
|
+
#define KISSRANDOM_H
|
3
|
+
|
4
|
+
#if defined(_MSC_VER) && _MSC_VER == 1500
|
5
|
+
typedef unsigned __int32 uint32_t;
|
6
|
+
typedef unsigned __int64 uint64_t;
|
7
|
+
#else
|
8
|
+
#include <stdint.h>
|
9
|
+
#endif
|
10
|
+
|
11
|
+
// KISS = "keep it simple, stupid", but high quality random number generator
|
12
|
+
// http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf -> "Use a good RNG and build it into your code"
|
13
|
+
// http://mathforum.org/kb/message.jspa?messageID=6627731
|
14
|
+
// https://de.wikipedia.org/wiki/KISS_(Zufallszahlengenerator)
|
15
|
+
|
16
|
+
// 32 bit KISS
|
17
|
+
struct Kiss32Random {
  // State words: x feeds the linear congruential step, y the xorshift step,
  // z and c are the value and carry of the multiply-with-carry step.
  uint32_t x;
  uint32_t y;
  uint32_t z;
  uint32_t c;

  // seed must be != 0
  // Only x takes the seed; y, z and c always start from fixed constants.
  Kiss32Random(uint32_t seed = 123456789)
    : x(seed), y(362436000), z(521288629), c(7654321) {}

  // Advance all three sub-generators once and return the sum of their
  // state words (wrapping modulo 2^32).
  uint32_t kiss() {
    // Linear congruential step on x.
    x = 69069 * x + 12345;

    // Xorshift step on y (shifts 13 / 17 / 5).
    y ^= y << 13;
    y ^= y >> 17;
    y ^= y << 5;

    // Multiply-with-carry step: the 64-bit product is split into the new
    // carry (high half) and the new z (low half).
    const uint64_t wide = 698769069ULL * z + c;
    c = static_cast<uint32_t>(wide >> 32);
    z = static_cast<uint32_t>(wide);

    return x + y + z;
  }

  // Draw random 0 or 1 (lowest bit of the next kiss() value).
  int flip() {
    return static_cast<int>(kiss() & 1);
  }

  // Draw random integer between 0 and n-1 where n is at most the number of
  // data points you have. n must be non-zero (it is used as a modulus).
  size_t index(size_t n) {
    return kiss() % n;
  }

  // Overwrite x with the given seed; y, z and c are left as they are.
  void set_seed(uint32_t seed) {
    x = seed;
  }
};
|
59
|
+
|
60
|
+
// 64 bit KISS. Use this if you have more than about 2^24 data points ("big data" ;) )
|
61
|
+
struct Kiss64Random {
  // State words: x and c are the value and carry of the multiply-with-carry
  // step, y feeds the xorshift step, z the linear congruential step.
  uint64_t x;
  uint64_t y;
  uint64_t z;
  uint64_t c;

  // seed must be != 0
  // Only x takes the seed; y, z and c always start from fixed constants.
  Kiss64Random(uint64_t seed = 1234567890987654321ULL) {
    x = seed;
    y = 362436362436362436ULL;
    z = 1066149217761810ULL;
    c = 123456123456123456ULL;
  }

  // Advance all three sub-generators once and return the sum of their
  // state words (wrapping modulo 2^64).
  uint64_t kiss() {
    // Linear congruence generator
    z = 6906969069LL*z+1234567;

    // Xor shift
    y ^= (y<<13);
    y ^= (y>>17);
    y ^= (y<<43);

    // Multiply-with-carry (uint128_t t = (2^58 + 1) * x + c; c = t >> 64; x = (uint64_t) t)
    // Done in 64-bit pieces: t is the low word of 2^58 * x + c, x >> 6 is the
    // high word of 2^58 * x, and (x < t) detects the wrap-around of x += t.
    uint64_t t = (x<<58)+c;
    c = (x>>6);
    x += t;
    c += (x<t);

    return x + y + z;
  }

  // Draw random 0 or 1 (lowest bit of the next kiss() value).
  inline int flip() {
    return kiss() & 1;
  }

  // Draw random integer between 0 and n-1 where n is at most the number of
  // data points you have. n must be non-zero (it is used as a modulus).
  inline size_t index(size_t n) {
    return kiss() % n;
  }

  // Overwrite x with the given seed; y, z and c are left as they are.
  // The parameter is 64 bits wide to match x and the constructor, so the
  // upper half of a seed is no longer dropped. Unsigned seeds that fit in
  // 32 bits seed exactly as before.
  inline void set_seed(uint64_t seed) {
    x = seed;
  }
};
|
104
|
+
|
105
|
+
#endif
|
106
|
+
// vim: tabstop=2 shiftwidth=2
|
data/lib/annoy.rb
ADDED
@@ -0,0 +1,174 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'annoy/version'
|
4
|
+
require 'annoy/annoy'
|
5
|
+
|
6
|
+
module Annoy
|
7
|
+
# AnnoyIndex is a class that provides functions for k-nearest neighbors search.
|
8
|
+
# The methods in this class are implemented similarly to Annoy's Python API (https://github.com/spotify/annoy#full-python-api).
|
9
|
+
#
|
10
|
+
# @example
|
11
|
+
# require 'annoy'
|
12
|
+
#
|
13
|
+
# index = Annoy::AnnoyIndex.new(n_features: 100, metric: 'euclidean')
|
14
|
+
#
|
15
|
+
# 5000.times do |item_id|
|
16
|
+
# item_vec = Array.new(100) { rand - 0.5 }
|
17
|
+
# index.add_item(item_id, item_vec)
|
18
|
+
# end
|
19
|
+
#
|
20
|
+
# index.build(10)
|
21
|
+
#
|
22
|
+
# index.get_nns_by_item(0, 100)
|
23
|
+
#
|
24
|
+
class AnnoyIndex
  # The number of features (dimensions) of each indexed item.
  # @return [Integer]
  attr_reader :n_features

  # The name of the distance metric used by this index.
  # @return [String]
  attr_reader :metric

  # Create a new search index backed by the native index class that matches the metric.
  #
  # @param n_features [Integer] The number of features (dimensions) of stored vector.
  #   Any Numeric is accepted and converted with #to_i.
  # @param metric [String] The distance metric between vectors ('angular', 'dot', 'hamming', 'euclidean', or 'manhattan').
  # @raise [ArgumentError] If n_features is not Numeric or the metric name is unknown.
  def initialize(n_features:, metric: 'angular')
    unless n_features.is_a?(Numeric)
      raise ArgumentError, 'Expect n_features to be Integer.'
    end

    @n_features = n_features.to_i
    @metric = metric

    # Pick the backend class first, then instantiate it once.
    backend = case @metric
              when 'angular' then AnnoyIndexAngular
              when 'dot' then AnnoyIndexDotProduct
              when 'hamming' then AnnoyIndexHamming
              when 'euclidean' then AnnoyIndexEuclidean
              when 'manhattan' then AnnoyIndexManhattan
              else raise ArgumentError, "No such metric: #{@metric}."
              end
    @index = backend.new(@n_features)
  end

  # Add item to be indexed.
  #
  # @param i [Integer] The ID of item.
  # @param v [Array] The vector of item.
  # @return [Boolean]
  def add_item(i, v)
    @index.add_item(i, v)
  end

  # Build a forest of index trees. After building, no more items can be added.
  #
  # @param n_trees [Integer] The number of trees. More trees gives higher search precision.
  # @return [Boolean]
  def build(n_trees)
    @index.build(n_trees)
  end

  # Save the search index to disk. After saving, no more items can be added.
  #
  # @param filename [String] The filename of search index.
  # @param prefault [Boolean] Flag forwarded unchanged to the native index's save.
  # @return [Boolean]
  def save(filename, prefault: false)
    @index.save(filename, prefault)
  end

  # Load a search index from disk.
  #
  # @param filename [String] The filename of search index.
  # @param prefault [Boolean] The flag indicating whether to pre-read the entire file into memory.
  # @return [Boolean]
  def load(filename, prefault: false)
    @index.load(filename, prefault)
  end

  # Unload the search index.
  #
  # @return [Boolean]
  def unload
    @index.unload
  end

  # Search the n closest items to an item that is already in the index.
  #
  # @param i [Integer] The ID of query item.
  # @param n [Integer] The number of nearest neighbors.
  # @param search_k [Integer] The maximum number of nodes inspected during the search. If -1 is given, it sets to n * n_trees.
  # @param include_distances [Boolean] The flag indicating whether to return all corresponding distances.
  # @return [Array<Integer> or Array<Array<Integer>, Array<Float>>]
  def get_nns_by_item(i, n, search_k: -1, include_distances: false)
    @index.get_nns_by_item(i, n, search_k, include_distances)
  end

  # Search the n closest items to an arbitrary query vector.
  #
  # @param v [Array] The vector of query item.
  # @param n [Integer] The number of nearest neighbors.
  # @param search_k [Integer] The maximum number of nodes inspected during the search. If -1 is given, it sets to n * n_trees.
  # @param include_distances [Boolean] The flag indicating whether to return all corresponding distances.
  # @return [Array<Integer> or Array<Array<Integer>, Array<Float>>]
  def get_nns_by_vector(v, n, search_k: -1, include_distances: false)
    @index.get_nns_by_vector(v, n, search_k, include_distances)
  end

  # Return the item vector.
  #
  # @param i [Integer] The ID of item.
  # @return [Array]
  def get_item(i)
    @index.get_item(i)
  end

  # Calculate the distance between two items.
  #
  # @param i [Integer] The ID of item.
  # @param j [Integer] The ID of item.
  # @return [Float or Integer]
  def get_distance(i, j)
    @index.get_distance(i, j)
  end

  # Return the number of items in the search index.
  # @return [Integer]
  def n_items
    @index.get_n_items
  end

  # Return the number of trees in the search index.
  # @return [Integer]
  def n_trees
    @index.get_n_trees
  end

  # Prepare annoy to build the index in the specified file instead of RAM.
  # (call this method before adding items, no need to save after building).
  #
  # @param filename [String] The filename of search index.
  # @return [Boolean]
  def on_disk_build(filename)
    @index.on_disk_build(filename)
  end

  # Set to verbose mode.
  #
  # @param flag [Boolean]
  def verbose(flag)
    @index.verbose(flag)
  end

  # Set seed for the random number generator.
  #
  # @param s [Integer]
  def seed(s)
    @index.set_seed(s)
  end
end
|
174
|
+
end
|
metadata
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: annoy-rb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- yoshoku
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-08-08 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Annoy.rb is a Ruby binding for the Annoy (Approximate Nearest Neighbors
|
14
|
+
Oh Yeah).
|
15
|
+
email:
|
16
|
+
- yoshoku@outlook.com
|
17
|
+
executables: []
|
18
|
+
extensions:
|
19
|
+
- ext/annoy/extconf.rb
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- ".gitignore"
|
23
|
+
- ".rspec"
|
24
|
+
- ".travis.yml"
|
25
|
+
- CHANGELOG.md
|
26
|
+
- CODE_OF_CONDUCT.md
|
27
|
+
- Gemfile
|
28
|
+
- LICENSE.txt
|
29
|
+
- README.md
|
30
|
+
- Rakefile
|
31
|
+
- annoy-rb.gemspec
|
32
|
+
- ext/annoy/annoy.cpp
|
33
|
+
- ext/annoy/annoy.hpp
|
34
|
+
- ext/annoy/extconf.rb
|
35
|
+
- ext/annoy/src/annoylib.h
|
36
|
+
- ext/annoy/src/kissrandom.h
|
37
|
+
- lib/annoy.rb
|
38
|
+
- lib/annoy/version.rb
|
39
|
+
homepage: https://github.com/yoshoku/annoy.rb
|
40
|
+
licenses:
|
41
|
+
- Apache-2.0
|
42
|
+
metadata:
|
43
|
+
homepage_uri: https://github.com/yoshoku/annoy.rb
|
44
|
+
source_code_uri: https://github.com/yoshoku/annoy.rb
|
45
|
+
changelog_uri: https://github.com/yoshoku/annoy.rb/blob/master/CHANGELOG.md
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '0'
|
60
|
+
requirements: []
|
61
|
+
rubygems_version: 3.1.2
|
62
|
+
signing_key:
|
63
|
+
specification_version: 4
|
64
|
+
summary: Ruby binding for the Annoy (Approximate Nearest Neighbors Oh Yeah).
|
65
|
+
test_files: []
|