faiss 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (226) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +103 -3
  4. data/ext/faiss/ext.cpp +99 -32
  5. data/ext/faiss/extconf.rb +12 -2
  6. data/lib/faiss/ext.bundle +0 -0
  7. data/lib/faiss/index.rb +3 -3
  8. data/lib/faiss/index_binary.rb +3 -3
  9. data/lib/faiss/kmeans.rb +1 -1
  10. data/lib/faiss/pca_matrix.rb +2 -2
  11. data/lib/faiss/product_quantizer.rb +3 -3
  12. data/lib/faiss/version.rb +1 -1
  13. data/vendor/faiss/AutoTune.cpp +719 -0
  14. data/vendor/faiss/AutoTune.h +212 -0
  15. data/vendor/faiss/Clustering.cpp +261 -0
  16. data/vendor/faiss/Clustering.h +101 -0
  17. data/vendor/faiss/IVFlib.cpp +339 -0
  18. data/vendor/faiss/IVFlib.h +132 -0
  19. data/vendor/faiss/Index.cpp +171 -0
  20. data/vendor/faiss/Index.h +261 -0
  21. data/vendor/faiss/Index2Layer.cpp +437 -0
  22. data/vendor/faiss/Index2Layer.h +85 -0
  23. data/vendor/faiss/IndexBinary.cpp +77 -0
  24. data/vendor/faiss/IndexBinary.h +163 -0
  25. data/vendor/faiss/IndexBinaryFlat.cpp +83 -0
  26. data/vendor/faiss/IndexBinaryFlat.h +54 -0
  27. data/vendor/faiss/IndexBinaryFromFloat.cpp +78 -0
  28. data/vendor/faiss/IndexBinaryFromFloat.h +52 -0
  29. data/vendor/faiss/IndexBinaryHNSW.cpp +325 -0
  30. data/vendor/faiss/IndexBinaryHNSW.h +56 -0
  31. data/vendor/faiss/IndexBinaryIVF.cpp +671 -0
  32. data/vendor/faiss/IndexBinaryIVF.h +211 -0
  33. data/vendor/faiss/IndexFlat.cpp +508 -0
  34. data/vendor/faiss/IndexFlat.h +175 -0
  35. data/vendor/faiss/IndexHNSW.cpp +1090 -0
  36. data/vendor/faiss/IndexHNSW.h +170 -0
  37. data/vendor/faiss/IndexIVF.cpp +909 -0
  38. data/vendor/faiss/IndexIVF.h +353 -0
  39. data/vendor/faiss/IndexIVFFlat.cpp +502 -0
  40. data/vendor/faiss/IndexIVFFlat.h +118 -0
  41. data/vendor/faiss/IndexIVFPQ.cpp +1207 -0
  42. data/vendor/faiss/IndexIVFPQ.h +161 -0
  43. data/vendor/faiss/IndexIVFPQR.cpp +219 -0
  44. data/vendor/faiss/IndexIVFPQR.h +65 -0
  45. data/vendor/faiss/IndexIVFSpectralHash.cpp +331 -0
  46. data/vendor/faiss/IndexIVFSpectralHash.h +75 -0
  47. data/vendor/faiss/IndexLSH.cpp +225 -0
  48. data/vendor/faiss/IndexLSH.h +87 -0
  49. data/vendor/faiss/IndexLattice.cpp +143 -0
  50. data/vendor/faiss/IndexLattice.h +68 -0
  51. data/vendor/faiss/IndexPQ.cpp +1188 -0
  52. data/vendor/faiss/IndexPQ.h +199 -0
  53. data/vendor/faiss/IndexPreTransform.cpp +288 -0
  54. data/vendor/faiss/IndexPreTransform.h +91 -0
  55. data/vendor/faiss/IndexReplicas.cpp +123 -0
  56. data/vendor/faiss/IndexReplicas.h +76 -0
  57. data/vendor/faiss/IndexScalarQuantizer.cpp +317 -0
  58. data/vendor/faiss/IndexScalarQuantizer.h +127 -0
  59. data/vendor/faiss/IndexShards.cpp +317 -0
  60. data/vendor/faiss/IndexShards.h +100 -0
  61. data/vendor/faiss/InvertedLists.cpp +623 -0
  62. data/vendor/faiss/InvertedLists.h +334 -0
  63. data/vendor/faiss/LICENSE +21 -0
  64. data/vendor/faiss/MatrixStats.cpp +252 -0
  65. data/vendor/faiss/MatrixStats.h +62 -0
  66. data/vendor/faiss/MetaIndexes.cpp +351 -0
  67. data/vendor/faiss/MetaIndexes.h +126 -0
  68. data/vendor/faiss/OnDiskInvertedLists.cpp +674 -0
  69. data/vendor/faiss/OnDiskInvertedLists.h +127 -0
  70. data/vendor/faiss/VectorTransform.cpp +1157 -0
  71. data/vendor/faiss/VectorTransform.h +322 -0
  72. data/vendor/faiss/c_api/AutoTune_c.cpp +83 -0
  73. data/vendor/faiss/c_api/AutoTune_c.h +64 -0
  74. data/vendor/faiss/c_api/Clustering_c.cpp +139 -0
  75. data/vendor/faiss/c_api/Clustering_c.h +117 -0
  76. data/vendor/faiss/c_api/IndexFlat_c.cpp +140 -0
  77. data/vendor/faiss/c_api/IndexFlat_c.h +115 -0
  78. data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +64 -0
  79. data/vendor/faiss/c_api/IndexIVFFlat_c.h +58 -0
  80. data/vendor/faiss/c_api/IndexIVF_c.cpp +92 -0
  81. data/vendor/faiss/c_api/IndexIVF_c.h +135 -0
  82. data/vendor/faiss/c_api/IndexLSH_c.cpp +37 -0
  83. data/vendor/faiss/c_api/IndexLSH_c.h +40 -0
  84. data/vendor/faiss/c_api/IndexShards_c.cpp +44 -0
  85. data/vendor/faiss/c_api/IndexShards_c.h +42 -0
  86. data/vendor/faiss/c_api/Index_c.cpp +105 -0
  87. data/vendor/faiss/c_api/Index_c.h +183 -0
  88. data/vendor/faiss/c_api/MetaIndexes_c.cpp +49 -0
  89. data/vendor/faiss/c_api/MetaIndexes_c.h +49 -0
  90. data/vendor/faiss/c_api/clone_index_c.cpp +23 -0
  91. data/vendor/faiss/c_api/clone_index_c.h +32 -0
  92. data/vendor/faiss/c_api/error_c.h +42 -0
  93. data/vendor/faiss/c_api/error_impl.cpp +27 -0
  94. data/vendor/faiss/c_api/error_impl.h +16 -0
  95. data/vendor/faiss/c_api/faiss_c.h +58 -0
  96. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +96 -0
  97. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +56 -0
  98. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +52 -0
  99. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +68 -0
  100. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +17 -0
  101. data/vendor/faiss/c_api/gpu/GpuIndex_c.h +30 -0
  102. data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +38 -0
  103. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +86 -0
  104. data/vendor/faiss/c_api/gpu/GpuResources_c.h +66 -0
  105. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +54 -0
  106. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +53 -0
  107. data/vendor/faiss/c_api/gpu/macros_impl.h +42 -0
  108. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +220 -0
  109. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +149 -0
  110. data/vendor/faiss/c_api/index_factory_c.cpp +26 -0
  111. data/vendor/faiss/c_api/index_factory_c.h +30 -0
  112. data/vendor/faiss/c_api/index_io_c.cpp +42 -0
  113. data/vendor/faiss/c_api/index_io_c.h +50 -0
  114. data/vendor/faiss/c_api/macros_impl.h +110 -0
  115. data/vendor/faiss/clone_index.cpp +147 -0
  116. data/vendor/faiss/clone_index.h +38 -0
  117. data/vendor/faiss/demos/demo_imi_flat.cpp +151 -0
  118. data/vendor/faiss/demos/demo_imi_pq.cpp +199 -0
  119. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +146 -0
  120. data/vendor/faiss/demos/demo_sift1M.cpp +252 -0
  121. data/vendor/faiss/gpu/GpuAutoTune.cpp +95 -0
  122. data/vendor/faiss/gpu/GpuAutoTune.h +27 -0
  123. data/vendor/faiss/gpu/GpuCloner.cpp +403 -0
  124. data/vendor/faiss/gpu/GpuCloner.h +82 -0
  125. data/vendor/faiss/gpu/GpuClonerOptions.cpp +28 -0
  126. data/vendor/faiss/gpu/GpuClonerOptions.h +53 -0
  127. data/vendor/faiss/gpu/GpuDistance.h +52 -0
  128. data/vendor/faiss/gpu/GpuFaissAssert.h +29 -0
  129. data/vendor/faiss/gpu/GpuIndex.h +148 -0
  130. data/vendor/faiss/gpu/GpuIndexBinaryFlat.h +89 -0
  131. data/vendor/faiss/gpu/GpuIndexFlat.h +190 -0
  132. data/vendor/faiss/gpu/GpuIndexIVF.h +89 -0
  133. data/vendor/faiss/gpu/GpuIndexIVFFlat.h +85 -0
  134. data/vendor/faiss/gpu/GpuIndexIVFPQ.h +143 -0
  135. data/vendor/faiss/gpu/GpuIndexIVFScalarQuantizer.h +100 -0
  136. data/vendor/faiss/gpu/GpuIndicesOptions.h +30 -0
  137. data/vendor/faiss/gpu/GpuResources.cpp +52 -0
  138. data/vendor/faiss/gpu/GpuResources.h +73 -0
  139. data/vendor/faiss/gpu/StandardGpuResources.cpp +295 -0
  140. data/vendor/faiss/gpu/StandardGpuResources.h +114 -0
  141. data/vendor/faiss/gpu/impl/RemapIndices.cpp +43 -0
  142. data/vendor/faiss/gpu/impl/RemapIndices.h +24 -0
  143. data/vendor/faiss/gpu/perf/IndexWrapper-inl.h +71 -0
  144. data/vendor/faiss/gpu/perf/IndexWrapper.h +39 -0
  145. data/vendor/faiss/gpu/perf/PerfClustering.cpp +115 -0
  146. data/vendor/faiss/gpu/perf/PerfIVFPQAdd.cpp +139 -0
  147. data/vendor/faiss/gpu/perf/WriteIndex.cpp +102 -0
  148. data/vendor/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +130 -0
  149. data/vendor/faiss/gpu/test/TestGpuIndexFlat.cpp +371 -0
  150. data/vendor/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +550 -0
  151. data/vendor/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +450 -0
  152. data/vendor/faiss/gpu/test/TestGpuMemoryException.cpp +84 -0
  153. data/vendor/faiss/gpu/test/TestUtils.cpp +315 -0
  154. data/vendor/faiss/gpu/test/TestUtils.h +93 -0
  155. data/vendor/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +159 -0
  156. data/vendor/faiss/gpu/utils/DeviceMemory.cpp +77 -0
  157. data/vendor/faiss/gpu/utils/DeviceMemory.h +71 -0
  158. data/vendor/faiss/gpu/utils/DeviceUtils.h +185 -0
  159. data/vendor/faiss/gpu/utils/MemorySpace.cpp +89 -0
  160. data/vendor/faiss/gpu/utils/MemorySpace.h +44 -0
  161. data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +239 -0
  162. data/vendor/faiss/gpu/utils/StackDeviceMemory.h +129 -0
  163. data/vendor/faiss/gpu/utils/StaticUtils.h +83 -0
  164. data/vendor/faiss/gpu/utils/Timer.cpp +60 -0
  165. data/vendor/faiss/gpu/utils/Timer.h +52 -0
  166. data/vendor/faiss/impl/AuxIndexStructures.cpp +305 -0
  167. data/vendor/faiss/impl/AuxIndexStructures.h +246 -0
  168. data/vendor/faiss/impl/FaissAssert.h +95 -0
  169. data/vendor/faiss/impl/FaissException.cpp +66 -0
  170. data/vendor/faiss/impl/FaissException.h +71 -0
  171. data/vendor/faiss/impl/HNSW.cpp +818 -0
  172. data/vendor/faiss/impl/HNSW.h +275 -0
  173. data/vendor/faiss/impl/PolysemousTraining.cpp +953 -0
  174. data/vendor/faiss/impl/PolysemousTraining.h +158 -0
  175. data/vendor/faiss/impl/ProductQuantizer.cpp +876 -0
  176. data/vendor/faiss/impl/ProductQuantizer.h +242 -0
  177. data/vendor/faiss/impl/ScalarQuantizer.cpp +1628 -0
  178. data/vendor/faiss/impl/ScalarQuantizer.h +120 -0
  179. data/vendor/faiss/impl/ThreadedIndex-inl.h +192 -0
  180. data/vendor/faiss/impl/ThreadedIndex.h +80 -0
  181. data/vendor/faiss/impl/index_read.cpp +793 -0
  182. data/vendor/faiss/impl/index_write.cpp +558 -0
  183. data/vendor/faiss/impl/io.cpp +142 -0
  184. data/vendor/faiss/impl/io.h +98 -0
  185. data/vendor/faiss/impl/lattice_Zn.cpp +712 -0
  186. data/vendor/faiss/impl/lattice_Zn.h +199 -0
  187. data/vendor/faiss/index_factory.cpp +392 -0
  188. data/vendor/faiss/index_factory.h +25 -0
  189. data/vendor/faiss/index_io.h +75 -0
  190. data/vendor/faiss/misc/test_blas.cpp +84 -0
  191. data/vendor/faiss/tests/test_binary_flat.cpp +64 -0
  192. data/vendor/faiss/tests/test_dealloc_invlists.cpp +183 -0
  193. data/vendor/faiss/tests/test_ivfpq_codec.cpp +67 -0
  194. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +98 -0
  195. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +566 -0
  196. data/vendor/faiss/tests/test_merge.cpp +258 -0
  197. data/vendor/faiss/tests/test_omp_threads.cpp +14 -0
  198. data/vendor/faiss/tests/test_ondisk_ivf.cpp +220 -0
  199. data/vendor/faiss/tests/test_pairs_decoding.cpp +189 -0
  200. data/vendor/faiss/tests/test_params_override.cpp +231 -0
  201. data/vendor/faiss/tests/test_pq_encoding.cpp +98 -0
  202. data/vendor/faiss/tests/test_sliding_ivf.cpp +240 -0
  203. data/vendor/faiss/tests/test_threaded_index.cpp +253 -0
  204. data/vendor/faiss/tests/test_transfer_invlists.cpp +159 -0
  205. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +98 -0
  206. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +81 -0
  207. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +93 -0
  208. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +119 -0
  209. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +99 -0
  210. data/vendor/faiss/utils/Heap.cpp +122 -0
  211. data/vendor/faiss/utils/Heap.h +495 -0
  212. data/vendor/faiss/utils/WorkerThread.cpp +126 -0
  213. data/vendor/faiss/utils/WorkerThread.h +61 -0
  214. data/vendor/faiss/utils/distances.cpp +765 -0
  215. data/vendor/faiss/utils/distances.h +243 -0
  216. data/vendor/faiss/utils/distances_simd.cpp +809 -0
  217. data/vendor/faiss/utils/extra_distances.cpp +336 -0
  218. data/vendor/faiss/utils/extra_distances.h +54 -0
  219. data/vendor/faiss/utils/hamming-inl.h +472 -0
  220. data/vendor/faiss/utils/hamming.cpp +792 -0
  221. data/vendor/faiss/utils/hamming.h +220 -0
  222. data/vendor/faiss/utils/random.cpp +192 -0
  223. data/vendor/faiss/utils/random.h +60 -0
  224. data/vendor/faiss/utils/utils.cpp +783 -0
  225. data/vendor/faiss/utils/utils.h +181 -0
  226. metadata +216 -2
@@ -0,0 +1,38 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // -*- c++ -*-
9
+
10
+ // Cloning functions for indexes
11
+
12
+ #pragma once
13
+
14
+
15
+
16
namespace faiss {

// Forward declarations: this header only needs the names of these types.
struct Index;
struct IndexIVF;
struct VectorTransform;

/**
 * Clone an index.
 *
 * This is a convenience entry point that just calls Cloner::clone_Index.
 *
 * @param index  the index to clone
 * @return       pointer to the cloned index
 *               NOTE(review): ownership presumably passes to the caller --
 *               confirm against the implementation file.
 */
Index* clone_index(const Index* index);

/**
 * Collection of virtual cloning functions.
 *
 * Deriving from this struct allows individual cloning functions to be
 * overridden with alternative implementations.
 */
struct Cloner {
    /// Clone a vector transform.
    virtual VectorTransform* clone_VectorTransform(const VectorTransform* vt);

    /// Clone a generic index.
    virtual Index* clone_Index(const Index* index);

    /// Clone an inverted-file index.
    virtual IndexIVF* clone_IndexIVF(const IndexIVF* index);

    /// Virtual so that derived cloners can be destroyed through a Cloner*.
    virtual ~Cloner() = default;
};

} // namespace faiss
@@ -0,0 +1,151 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+
9
+
10
+ #include <cmath>
11
+ #include <cstdio>
12
+ #include <cstdlib>
13
+
14
+ #include <sys/time.h>
15
+
16
+
17
+ #include <faiss/IndexPQ.h>
18
+ #include <faiss/IndexIVFFlat.h>
19
+ #include <faiss/IndexFlat.h>
20
+ #include <faiss/index_io.h>
21
+
22
/**
 * Current time of day as reported by gettimeofday().
 *
 * @return seconds plus the microsecond part scaled to seconds, as a double.
 */
double elapsed ()
{
    struct timeval now;
    gettimeofday (&now, nullptr);

    const double micros_as_seconds = now.tv_usec * 1e-6;
    return now.tv_sec + micros_as_seconds;
}
28
+
29
+
30
+ int main ()
31
+ {
32
+ double t0 = elapsed();
33
+
34
+ // dimension of the vectors to index
35
+ int d = 128;
36
+
37
+ // size of the database we plan to index
38
+ size_t nb = 1000 * 1000;
39
+
40
+ // make a set of nt training vectors in the unit cube
41
+ // (could be the database)
42
+ size_t nt = 100 * 1000;
43
+
44
+ //---------------------------------------------------------------
45
+ // Define the core quantizer
46
+ // We choose a multiple inverted index for faster training with less data
47
+ // and because it usually offers best accuracy/speed trade-offs
48
+ //
49
+ // We here assume that its lifespan of this coarse quantizer will cover the
50
+ // lifespan of the inverted-file quantizer IndexIVFFlat below
51
+ // With dynamic allocation, one may give the responsability to free the
52
+ // quantizer to the inverted-file index (with attribute do_delete_quantizer)
53
+ //
54
+ // Note: a regular clustering algorithm would be defined as:
55
+ // faiss::IndexFlatL2 coarse_quantizer (d);
56
+ //
57
+ // Use nhash=2 subquantizers used to define the product coarse quantizer
58
+ // Number of bits: we will have 2^nbits_coarse centroids per subquantizer
59
+ // meaning (2^12)^nhash distinct inverted lists
60
+ size_t nhash = 2;
61
+ size_t nbits_subq = int (log2 (nb+1) / 2); // good choice in general
62
+ size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids
63
+
64
+ faiss::MultiIndexQuantizer coarse_quantizer (d, nhash, nbits_subq);
65
+
66
+ printf ("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)",
67
+ nhash, nbits_subq, ncentroids, nb);
68
+
69
+ // the coarse quantizer should not be dealloced before the index
70
+ // 4 = nb of bytes per code (d must be a multiple of this)
71
+ // 8 = nb of bits per sub-code (almost always 8)
72
+ faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT
73
+ faiss::IndexIVFFlat index (&coarse_quantizer, d, ncentroids, metric);
74
+ index.quantizer_trains_alone = true;
75
+
76
+ // define the number of probes. 2048 is for high-dim, overkilled in practice
77
+ // Use 4-1024 depending on the trade-off speed accuracy that you want
78
+ index.nprobe = 2048;
79
+
80
+
81
+ { // training
82
+ printf ("[%.3f s] Generating %ld vectors in %dD for training\n",
83
+ elapsed() - t0, nt, d);
84
+
85
+ std::vector <float> trainvecs (nt * d);
86
+ for (size_t i = 0; i < nt * d; i++) {
87
+ trainvecs[i] = drand48();
88
+ }
89
+
90
+ printf ("[%.3f s] Training the index\n", elapsed() - t0);
91
+ index.verbose = true;
92
+ index.train (nt, trainvecs.data());
93
+ }
94
+
95
+ size_t nq;
96
+ std::vector<float> queries;
97
+
98
+ { // populating the database
99
+ printf ("[%.3f s] Building a dataset of %ld vectors to index\n",
100
+ elapsed() - t0, nb);
101
+
102
+ std::vector <float> database (nb * d);
103
+ for (size_t i = 0; i < nb * d; i++) {
104
+ database[i] = drand48();
105
+ }
106
+
107
+ printf ("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
108
+
109
+ index.add (nb, database.data());
110
+
111
+ // remember a few elements from the database as queries
112
+ int i0 = 1234;
113
+ int i1 = 1244;
114
+
115
+ nq = i1 - i0;
116
+ queries.resize (nq * d);
117
+ for (int i = i0; i < i1; i++) {
118
+ for (int j = 0; j < d; j++) {
119
+ queries [(i - i0) * d + j] = database [i * d + j];
120
+ }
121
+ }
122
+ }
123
+
124
+ { // searching the database
125
+ int k = 5;
126
+ printf ("[%.3f s] Searching the %d nearest neighbors "
127
+ "of %ld vectors in the index\n",
128
+ elapsed() - t0, k, nq);
129
+
130
+ std::vector<faiss::Index::idx_t> nns (k * nq);
131
+ std::vector<float> dis (k * nq);
132
+
133
+ index.search (nq, queries.data(), k, dis.data(), nns.data());
134
+
135
+ printf ("[%.3f s] Query results (vector ids, then distances):\n",
136
+ elapsed() - t0);
137
+
138
+ for (int i = 0; i < nq; i++) {
139
+ printf ("query %2d: ", i);
140
+ for (int j = 0; j < k; j++) {
141
+ printf ("%7ld ", nns[j + i * k]);
142
+ }
143
+ printf ("\n dis: ");
144
+ for (int j = 0; j < k; j++) {
145
+ printf ("%7g ", dis[j + i * k]);
146
+ }
147
+ printf ("\n");
148
+ }
149
+ }
150
+ return 0;
151
+ }
@@ -0,0 +1,199 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+
9
+
10
+ #include <cmath>
11
+ #include <cstdio>
12
+ #include <cstdlib>
13
+
14
+ #include <sys/time.h>
15
+
16
+
17
+ #include <faiss/IndexPQ.h>
18
+ #include <faiss/IndexIVFPQ.h>
19
+ #include <faiss/IndexFlat.h>
20
+ #include <faiss/index_io.h>
21
+
22
/**
 * Read the time of day via gettimeofday() and express it in seconds.
 *
 * @return whole seconds plus microseconds * 1e-6, as a double.
 */
double elapsed ()
{
    struct timeval stamp;
    gettimeofday (&stamp, nullptr);

    const double fractional = stamp.tv_usec * 1e-6;
    return stamp.tv_sec + fractional;
}
28
+
29
+
30
+ int main ()
31
+ {
32
+ double t0 = elapsed();
33
+
34
+ // dimension of the vectors to index
35
+ int d = 64;
36
+
37
+ // size of the database we plan to index
38
+ size_t nb = 1000 * 1000;
39
+ size_t add_bs = 10000; // # size of the blocks to add
40
+
41
+ // make a set of nt training vectors in the unit cube
42
+ // (could be the database)
43
+ size_t nt = 100 * 1000;
44
+
45
+ //---------------------------------------------------------------
46
+ // Define the core quantizer
47
+ // We choose a multiple inverted index for faster training with less data
48
+ // and because it usually offers best accuracy/speed trade-offs
49
+ //
50
+ // We here assume that its lifespan of this coarse quantizer will cover the
51
+ // lifespan of the inverted-file quantizer IndexIVFFlat below
52
+ // With dynamic allocation, one may give the responsability to free the
53
+ // quantizer to the inverted-file index (with attribute do_delete_quantizer)
54
+ //
55
+ // Note: a regular clustering algorithm would be defined as:
56
+ // faiss::IndexFlatL2 coarse_quantizer (d);
57
+ //
58
+ // Use nhash=2 subquantizers used to define the product coarse quantizer
59
+ // Number of bits: we will have 2^nbits_coarse centroids per subquantizer
60
+ // meaning (2^12)^nhash distinct inverted lists
61
+ //
62
+ // The parameter bytes_per_code is determined by the memory
63
+ // constraint, the dataset will use nb * (bytes_per_code + 8)
64
+ // bytes.
65
+ //
66
+ // The parameter nbits_subq is determined by the size of the dataset to index.
67
+ //
68
+ size_t nhash = 2;
69
+ size_t nbits_subq = 9;
70
+ size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids
71
+ int bytes_per_code = 16;
72
+
73
+ faiss::MultiIndexQuantizer coarse_quantizer (d, nhash, nbits_subq);
74
+
75
+ printf ("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)",
76
+ nhash, nbits_subq, ncentroids, nb);
77
+
78
+ // the coarse quantizer should not be dealloced before the index
79
+ // 4 = nb of bytes per code (d must be a multiple of this)
80
+ // 8 = nb of bits per sub-code (almost always 8)
81
+ faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT
82
+ faiss::IndexIVFPQ index (&coarse_quantizer, d, ncentroids, bytes_per_code, 8);
83
+ index.quantizer_trains_alone = true;
84
+
85
+ // define the number of probes. 2048 is for high-dim, overkill in practice
86
+ // Use 4-1024 depending on the trade-off speed accuracy that you want
87
+ index.nprobe = 2048;
88
+
89
+
90
+ { // training.
91
+
92
+ // The distribution of the training vectors should be the same
93
+ // as the database vectors. It could be a sub-sample of the
94
+ // database vectors, if sampling is not biased. Here we just
95
+ // randomly generate the vectors.
96
+
97
+ printf ("[%.3f s] Generating %ld vectors in %dD for training\n",
98
+ elapsed() - t0, nt, d);
99
+
100
+ std::vector <float> trainvecs (nt * d);
101
+ for (size_t i = 0; i < nt; i++) {
102
+ for (size_t j = 0; j < d; j++) {
103
+ trainvecs[i * d + j] = drand48();
104
+ }
105
+ }
106
+
107
+ printf ("[%.3f s] Training the index\n", elapsed() - t0);
108
+ index.verbose = true;
109
+ index.train (nt, trainvecs.data());
110
+ }
111
+
112
+ // the index can be re-loaded later with
113
+ // faiss::Index * idx = faiss::read_index("/tmp/trained_index.faissindex");
114
+ faiss::write_index(&index, "/tmp/trained_index.faissindex");
115
+
116
+ size_t nq;
117
+ std::vector<float> queries;
118
+
119
+ { // populating the database
120
+ printf ("[%.3f s] Building a dataset of %ld vectors to index\n",
121
+ elapsed() - t0, nb);
122
+
123
+ std::vector <float> database (nb * d);
124
+ std::vector <long> ids (nb);
125
+ for (size_t i = 0; i < nb; i++) {
126
+ for (size_t j = 0; j < d; j++) {
127
+ database[i * d + j] = drand48();
128
+ }
129
+ ids[i] = 8760000000L + i;
130
+ }
131
+
132
+ printf ("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
133
+
134
+ for (size_t begin = 0; begin < nb; begin += add_bs) {
135
+ size_t end = std::min (begin + add_bs, nb);
136
+ index.add_with_ids (end - begin,
137
+ database.data() + d * begin,
138
+ ids.data() + begin);
139
+ }
140
+
141
+ // remember a few elements from the database as queries
142
+ int i0 = 1234;
143
+ int i1 = 1244;
144
+
145
+ nq = i1 - i0;
146
+ queries.resize (nq * d);
147
+ for (int i = i0; i < i1; i++) {
148
+ for (int j = 0; j < d; j++) {
149
+ queries [(i - i0) * d + j] = database [i * d + j];
150
+ }
151
+ }
152
+ }
153
+
154
+ // A few notes on the internal format of the index:
155
+ //
156
+ // - the positing lists for PQ codes are index.codes, which is a
157
+ // std::vector < std::vector<uint8_t> >
158
+ // if n is the length of posting list #i, codes[i] has length bytes_per_code * n
159
+ //
160
+ // - the corresponding ids are stored in index.ids
161
+ //
162
+ // - given a vector float *x, finding which k centroids are
163
+ // closest to it (ie to find the nearest neighbors) can be done with
164
+ //
165
+ // long *centroid_ids = new long[k];
166
+ // float *distances = new float[k];
167
+ // index.quantizer->search (1, x, k, dis, centroids_ids);
168
+ //
169
+
170
+ faiss::write_index(&index, "/tmp/populated_index.faissindex");
171
+
172
+ { // searching the database
173
+ int k = 5;
174
+ printf ("[%.3f s] Searching the %d nearest neighbors "
175
+ "of %ld vectors in the index\n",
176
+ elapsed() - t0, k, nq);
177
+
178
+ std::vector<faiss::Index::idx_t> nns (k * nq);
179
+ std::vector<float> dis (k * nq);
180
+
181
+ index.search (nq, queries.data(), k, dis.data(), nns.data());
182
+
183
+ printf ("[%.3f s] Query results (vector ids, then distances):\n",
184
+ elapsed() - t0);
185
+
186
+ for (int i = 0; i < nq; i++) {
187
+ printf ("query %2d: ", i);
188
+ for (int j = 0; j < k; j++) {
189
+ printf ("%7ld ", nns[j + i * k]);
190
+ }
191
+ printf ("\n dis: ");
192
+ for (int j = 0; j < k; j++) {
193
+ printf ("%7g ", dis[j + i * k]);
194
+ }
195
+ printf ("\n");
196
+ }
197
+ }
198
+ return 0;
199
+ }
@@ -0,0 +1,146 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+
9
+
10
+ #include <cmath>
11
+ #include <cstdio>
12
+ #include <cstdlib>
13
+
14
+ #include <sys/time.h>
15
+
16
+
17
+ #include <faiss/IndexIVFPQ.h>
18
+ #include <faiss/IndexFlat.h>
19
+ #include <faiss/index_io.h>
20
+
21
/**
 * Current time of day as reported by gettimeofday().
 *
 * @return seconds plus the microsecond part scaled to seconds, as a double.
 */
double elapsed ()
{
    struct timeval tv;
    // nullptr rather than NULL: type-safe null pointer constant
    gettimeofday (&tv, nullptr);
    return tv.tv_sec + tv.tv_usec * 1e-6;
}
27
+
28
+
29
+ int main ()
30
+ {
31
+
32
+ double t0 = elapsed();
33
+
34
+ // dimension of the vectors to index
35
+ int d = 128;
36
+
37
+ // size of the database we plan to index
38
+ size_t nb = 200 * 1000;
39
+
40
+ // make a set of nt training vectors in the unit cube
41
+ // (could be the database)
42
+ size_t nt = 100 * 1000;
43
+
44
+ // make the index object and train it
45
+ faiss::IndexFlatL2 coarse_quantizer (d);
46
+
47
+ // a reasonable number of centroids to index nb vectors
48
+ int ncentroids = int (4 * sqrt (nb));
49
+
50
+ // the coarse quantizer should not be dealloced before the index
51
+ // 4 = nb of bytes per code (d must be a multiple of this)
52
+ // 8 = nb of bits per sub-code (almost always 8)
53
+ faiss::IndexIVFPQ index (&coarse_quantizer, d,
54
+ ncentroids, 4, 8);
55
+
56
+
57
+ { // training
58
+ printf ("[%.3f s] Generating %ld vectors in %dD for training\n",
59
+ elapsed() - t0, nt, d);
60
+
61
+ std::vector <float> trainvecs (nt * d);
62
+ for (size_t i = 0; i < nt * d; i++) {
63
+ trainvecs[i] = drand48();
64
+ }
65
+
66
+ printf ("[%.3f s] Training the index\n",
67
+ elapsed() - t0);
68
+ index.verbose = true;
69
+
70
+ index.train (nt, trainvecs.data());
71
+ }
72
+
73
+ { // I/O demo
74
+ const char *outfilename = "/tmp/index_trained.faissindex";
75
+ printf ("[%.3f s] storing the pre-trained index to %s\n",
76
+ elapsed() - t0, outfilename);
77
+
78
+ write_index (&index, outfilename);
79
+ }
80
+
81
+ size_t nq;
82
+ std::vector<float> queries;
83
+
84
+ { // populating the database
85
+ printf ("[%.3f s] Building a dataset of %ld vectors to index\n",
86
+ elapsed() - t0, nb);
87
+
88
+ std::vector <float> database (nb * d);
89
+ for (size_t i = 0; i < nb * d; i++) {
90
+ database[i] = drand48();
91
+ }
92
+
93
+ printf ("[%.3f s] Adding the vectors to the index\n",
94
+ elapsed() - t0);
95
+
96
+ index.add (nb, database.data());
97
+
98
+ printf ("[%.3f s] imbalance factor: %g\n",
99
+ elapsed() - t0, index.invlists->imbalance_factor ());
100
+
101
+ // remember a few elements from the database as queries
102
+ int i0 = 1234;
103
+ int i1 = 1243;
104
+
105
+ nq = i1 - i0;
106
+ queries.resize (nq * d);
107
+ for (int i = i0; i < i1; i++) {
108
+ for (int j = 0; j < d; j++) {
109
+ queries [(i - i0) * d + j] = database [i * d + j];
110
+ }
111
+ }
112
+
113
+ }
114
+
115
+ { // searching the database
116
+ int k = 5;
117
+ printf ("[%.3f s] Searching the %d nearest neighbors "
118
+ "of %ld vectors in the index\n",
119
+ elapsed() - t0, k, nq);
120
+
121
+ std::vector<faiss::Index::idx_t> nns (k * nq);
122
+ std::vector<float> dis (k * nq);
123
+
124
+ index.search (nq, queries.data(), k, dis.data(), nns.data());
125
+
126
+ printf ("[%.3f s] Query results (vector ids, then distances):\n",
127
+ elapsed() - t0);
128
+
129
+ for (int i = 0; i < nq; i++) {
130
+ printf ("query %2d: ", i);
131
+ for (int j = 0; j < k; j++) {
132
+ printf ("%7ld ", nns[j + i * k]);
133
+ }
134
+ printf ("\n dis: ");
135
+ for (int j = 0; j < k; j++) {
136
+ printf ("%7g ", dis[j + i * k]);
137
+ }
138
+ printf ("\n");
139
+ }
140
+
141
+ printf ("note that the nearest neighbor is not at "
142
+ "distance 0 due to quantization errors\n");
143
+ }
144
+
145
+ return 0;
146
+ }