faiss 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +25 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +16 -4
  5. data/ext/faiss/ext.cpp +12 -308
  6. data/ext/faiss/extconf.rb +6 -3
  7. data/ext/faiss/index.cpp +189 -0
  8. data/ext/faiss/index_binary.cpp +75 -0
  9. data/ext/faiss/kmeans.cpp +40 -0
  10. data/ext/faiss/numo.hpp +867 -0
  11. data/ext/faiss/pca_matrix.cpp +33 -0
  12. data/ext/faiss/product_quantizer.cpp +53 -0
  13. data/ext/faiss/utils.cpp +13 -0
  14. data/ext/faiss/utils.h +5 -0
  15. data/lib/faiss.rb +0 -5
  16. data/lib/faiss/version.rb +1 -1
  17. data/vendor/faiss/faiss/AutoTune.cpp +36 -33
  18. data/vendor/faiss/faiss/AutoTune.h +6 -3
  19. data/vendor/faiss/faiss/Clustering.cpp +16 -12
  20. data/vendor/faiss/faiss/Index.cpp +3 -4
  21. data/vendor/faiss/faiss/Index.h +3 -3
  22. data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
  23. data/vendor/faiss/faiss/IndexBinary.h +1 -1
  24. data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
  25. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
  26. data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
  27. data/vendor/faiss/faiss/IndexFlat.h +0 -51
  28. data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
  29. data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
  30. data/vendor/faiss/faiss/IndexIVF.h +22 -15
  31. data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
  32. data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
  33. data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
  34. data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
  35. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
  36. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
  37. data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
  38. data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
  39. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
  40. data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
  41. data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
  42. data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
  43. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
  44. data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
  45. data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
  46. data/vendor/faiss/faiss/IndexRefine.h +73 -0
  47. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
  48. data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
  49. data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
  50. data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
  51. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
  52. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
  53. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
  54. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
  55. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
  56. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
  57. data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
  58. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
  59. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
  60. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
  61. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
  62. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
  63. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
  64. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
  65. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
  66. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
  67. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
  68. data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
  69. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
  70. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
  71. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
  72. data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
  73. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
  74. data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
  75. data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
  76. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
  77. data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
  78. data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
  79. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
  80. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
  81. data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
  82. data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
  83. data/vendor/faiss/faiss/impl/io.cpp +33 -2
  84. data/vendor/faiss/faiss/impl/io.h +7 -2
  85. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
  86. data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
  87. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
  88. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
  89. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
  90. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
  91. data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
  92. data/vendor/faiss/faiss/index_factory.cpp +112 -7
  93. data/vendor/faiss/faiss/index_io.h +1 -48
  94. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
  95. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
  96. data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
  97. data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
  98. data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
  99. data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
  100. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
  101. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
  102. data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
  103. data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
  104. data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
  105. data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
  106. data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
  107. data/vendor/faiss/faiss/utils/Heap.h +61 -50
  108. data/vendor/faiss/faiss/utils/distances.cpp +164 -319
  109. data/vendor/faiss/faiss/utils/distances.h +28 -20
  110. data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
  111. data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
  112. data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
  113. data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
  114. data/vendor/faiss/faiss/utils/hamming.h +2 -7
  115. data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
  116. data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
  117. data/vendor/faiss/faiss/utils/partitioning.h +69 -0
  118. data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
  119. data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
  120. data/vendor/faiss/faiss/utils/simdlib.h +31 -0
  121. data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
  122. data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
  123. metadata +54 -149
  124. data/lib/faiss/index.rb +0 -20
  125. data/lib/faiss/index_binary.rb +0 -20
  126. data/lib/faiss/kmeans.rb +0 -15
  127. data/lib/faiss/pca_matrix.rb +0 -15
  128. data/lib/faiss/product_quantizer.rb +0 -22
  129. data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
  130. data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
  131. data/vendor/faiss/c_api/AutoTune_c.h +0 -66
  132. data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
  133. data/vendor/faiss/c_api/Clustering_c.h +0 -123
  134. data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
  135. data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
  136. data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
  137. data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
  138. data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
  139. data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
  140. data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
  141. data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
  142. data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
  143. data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
  144. data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
  145. data/vendor/faiss/c_api/IndexShards_c.h +0 -39
  146. data/vendor/faiss/c_api/Index_c.cpp +0 -105
  147. data/vendor/faiss/c_api/Index_c.h +0 -183
  148. data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
  149. data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
  150. data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
  151. data/vendor/faiss/c_api/clone_index_c.h +0 -32
  152. data/vendor/faiss/c_api/error_c.h +0 -42
  153. data/vendor/faiss/c_api/error_impl.cpp +0 -27
  154. data/vendor/faiss/c_api/error_impl.h +0 -16
  155. data/vendor/faiss/c_api/faiss_c.h +0 -58
  156. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
  157. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
  158. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
  159. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
  160. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
  161. data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
  162. data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
  163. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
  164. data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
  165. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
  166. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
  167. data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
  168. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
  169. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
  170. data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
  171. data/vendor/faiss/c_api/index_factory_c.h +0 -30
  172. data/vendor/faiss/c_api/index_io_c.cpp +0 -42
  173. data/vendor/faiss/c_api/index_io_c.h +0 -50
  174. data/vendor/faiss/c_api/macros_impl.h +0 -110
  175. data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
  176. data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
  177. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
  178. data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
  179. data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
  180. data/vendor/faiss/misc/test_blas.cpp +0 -87
  181. data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
  182. data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
  183. data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
  184. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
  185. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
  186. data/vendor/faiss/tests/test_merge.cpp +0 -260
  187. data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
  188. data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
  189. data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
  190. data/vendor/faiss/tests/test_params_override.cpp +0 -236
  191. data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
  192. data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
  193. data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
  194. data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
  195. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
  196. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
  197. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
  198. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
  199. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104
@@ -1,151 +0,0 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
3
- *
4
- * This source code is licensed under the MIT license found in the
5
- * LICENSE file in the root directory of this source tree.
6
- */
7
-
8
-
9
-
10
- #include <cmath>
11
- #include <cstdio>
12
- #include <cstdlib>
13
- #include <random>
14
-
15
- #include <sys/time.h>
16
-
17
-
18
- #include <faiss/IndexIVFPQ.h>
19
- #include <faiss/IndexFlat.h>
20
- #include <faiss/index_io.h>
21
-
22
- double elapsed ()
23
- {
24
- struct timeval tv;
25
- gettimeofday (&tv, NULL);
26
- return tv.tv_sec + tv.tv_usec * 1e-6;
27
- }
28
-
29
-
30
- int main ()
31
- {
32
-
33
- double t0 = elapsed();
34
-
35
- // dimension of the vectors to index
36
- int d = 128;
37
-
38
- // size of the database we plan to index
39
- size_t nb = 200 * 1000;
40
-
41
- // make a set of nt training vectors in the unit cube
42
- // (could be the database)
43
- size_t nt = 100 * 1000;
44
-
45
- // make the index object and train it
46
- faiss::IndexFlatL2 coarse_quantizer (d);
47
-
48
- // a reasonable number of centroids to index nb vectors
49
- int ncentroids = int (4 * sqrt (nb));
50
-
51
- // the coarse quantizer should not be dealloced before the index
52
- // 4 = nb of bytes per code (d must be a multiple of this)
53
- // 8 = nb of bits per sub-code (almost always 8)
54
- faiss::IndexIVFPQ index (&coarse_quantizer, d,
55
- ncentroids, 4, 8);
56
-
57
-
58
- std::mt19937 rng;
59
-
60
- { // training
61
- printf ("[%.3f s] Generating %ld vectors in %dD for training\n",
62
- elapsed() - t0, nt, d);
63
-
64
- std::vector <float> trainvecs (nt * d);
65
- std::uniform_real_distribution<> distrib;
66
- for (size_t i = 0; i < nt * d; i++) {
67
- trainvecs[i] = distrib(rng);
68
- }
69
-
70
- printf ("[%.3f s] Training the index\n",
71
- elapsed() - t0);
72
- index.verbose = true;
73
-
74
- index.train (nt, trainvecs.data());
75
- }
76
-
77
- { // I/O demo
78
- const char *outfilename = "/tmp/index_trained.faissindex";
79
- printf ("[%.3f s] storing the pre-trained index to %s\n",
80
- elapsed() - t0, outfilename);
81
-
82
- write_index (&index, outfilename);
83
- }
84
-
85
- size_t nq;
86
- std::vector<float> queries;
87
-
88
- { // populating the database
89
- printf ("[%.3f s] Building a dataset of %ld vectors to index\n",
90
- elapsed() - t0, nb);
91
-
92
- std::vector <float> database (nb * d);
93
- std::uniform_real_distribution<> distrib;
94
- for (size_t i = 0; i < nb * d; i++) {
95
- database[i] = distrib(rng);
96
- }
97
-
98
- printf ("[%.3f s] Adding the vectors to the index\n",
99
- elapsed() - t0);
100
-
101
- index.add (nb, database.data());
102
-
103
- printf ("[%.3f s] imbalance factor: %g\n",
104
- elapsed() - t0, index.invlists->imbalance_factor ());
105
-
106
- // remember a few elements from the database as queries
107
- int i0 = 1234;
108
- int i1 = 1243;
109
-
110
- nq = i1 - i0;
111
- queries.resize (nq * d);
112
- for (int i = i0; i < i1; i++) {
113
- for (int j = 0; j < d; j++) {
114
- queries [(i - i0) * d + j] = database [i * d + j];
115
- }
116
- }
117
-
118
- }
119
-
120
- { // searching the database
121
- int k = 5;
122
- printf ("[%.3f s] Searching the %d nearest neighbors "
123
- "of %ld vectors in the index\n",
124
- elapsed() - t0, k, nq);
125
-
126
- std::vector<faiss::Index::idx_t> nns (k * nq);
127
- std::vector<float> dis (k * nq);
128
-
129
- index.search (nq, queries.data(), k, dis.data(), nns.data());
130
-
131
- printf ("[%.3f s] Query results (vector ids, then distances):\n",
132
- elapsed() - t0);
133
-
134
- for (int i = 0; i < nq; i++) {
135
- printf ("query %2d: ", i);
136
- for (int j = 0; j < k; j++) {
137
- printf ("%7ld ", nns[j + i * k]);
138
- }
139
- printf ("\n dis: ");
140
- for (int j = 0; j < k; j++) {
141
- printf ("%7g ", dis[j + i * k]);
142
- }
143
- printf ("\n");
144
- }
145
-
146
- printf ("note that the nearest neighbor is not at "
147
- "distance 0 due to quantization errors\n");
148
- }
149
-
150
- return 0;
151
- }
@@ -1,252 +0,0 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
3
- *
4
- * This source code is licensed under the MIT license found in the
5
- * LICENSE file in the root directory of this source tree.
6
- */
7
-
8
-
9
-
10
- #include <cmath>
11
- #include <cstdio>
12
- #include <cstdlib>
13
- #include <cassert>
14
- #include <cstring>
15
-
16
- #include <sys/types.h>
17
- #include <sys/stat.h>
18
- #include <unistd.h>
19
-
20
- #include <sys/time.h>
21
-
22
- #include <faiss/AutoTune.h>
23
- #include <faiss/index_factory.h>
24
-
25
- /**
26
- * To run this demo, please download the ANN_SIFT1M dataset from
27
- *
28
- * http://corpus-texmex.irisa.fr/
29
- *
30
- * and unzip it to the sudirectory sift1M.
31
- **/
32
-
33
- /*****************************************************
34
- * I/O functions for fvecs and ivecs
35
- *****************************************************/
36
-
37
-
38
- float * fvecs_read (const char *fname,
39
- size_t *d_out, size_t *n_out)
40
- {
41
- FILE *f = fopen(fname, "r");
42
- if(!f) {
43
- fprintf(stderr, "could not open %s\n", fname);
44
- perror("");
45
- abort();
46
- }
47
- int d;
48
- fread(&d, 1, sizeof(int), f);
49
- assert((d > 0 && d < 1000000) || !"unreasonable dimension");
50
- fseek(f, 0, SEEK_SET);
51
- struct stat st;
52
- fstat(fileno(f), &st);
53
- size_t sz = st.st_size;
54
- assert(sz % ((d + 1) * 4) == 0 || !"weird file size");
55
- size_t n = sz / ((d + 1) * 4);
56
-
57
- *d_out = d; *n_out = n;
58
- float *x = new float[n * (d + 1)];
59
- size_t nr = fread(x, sizeof(float), n * (d + 1), f);
60
- assert(nr == n * (d + 1) || !"could not read whole file");
61
-
62
- // shift array to remove row headers
63
- for(size_t i = 0; i < n; i++)
64
- memmove(x + i * d, x + 1 + i * (d + 1), d * sizeof(*x));
65
-
66
- fclose(f);
67
- return x;
68
- }
69
-
70
- // not very clean, but works as long as sizeof(int) == sizeof(float)
71
- int *ivecs_read(const char *fname, size_t *d_out, size_t *n_out)
72
- {
73
- return (int*)fvecs_read(fname, d_out, n_out);
74
- }
75
-
76
- double elapsed ()
77
- {
78
- struct timeval tv;
79
- gettimeofday (&tv, nullptr);
80
- return tv.tv_sec + tv.tv_usec * 1e-6;
81
- }
82
-
83
-
84
-
85
- int main()
86
- {
87
- double t0 = elapsed();
88
-
89
- // this is typically the fastest one.
90
- const char *index_key = "IVF4096,Flat";
91
-
92
- // these ones have better memory usage
93
- // const char *index_key = "Flat";
94
- // const char *index_key = "PQ32";
95
- // const char *index_key = "PCA80,Flat";
96
- // const char *index_key = "IVF4096,PQ8+16";
97
- // const char *index_key = "IVF4096,PQ32";
98
- // const char *index_key = "IMI2x8,PQ32";
99
- // const char *index_key = "IMI2x8,PQ8+16";
100
- // const char *index_key = "OPQ16_64,IMI2x8,PQ8+16";
101
-
102
- faiss::Index * index;
103
-
104
- size_t d;
105
-
106
- {
107
- printf ("[%.3f s] Loading train set\n", elapsed() - t0);
108
-
109
- size_t nt;
110
- float *xt = fvecs_read("sift1M/sift_learn.fvecs", &d, &nt);
111
-
112
- printf ("[%.3f s] Preparing index \"%s\" d=%ld\n",
113
- elapsed() - t0, index_key, d);
114
- index = faiss::index_factory(d, index_key);
115
-
116
- printf ("[%.3f s] Training on %ld vectors\n", elapsed() - t0, nt);
117
-
118
- index->train(nt, xt);
119
- delete [] xt;
120
- }
121
-
122
-
123
- {
124
- printf ("[%.3f s] Loading database\n", elapsed() - t0);
125
-
126
- size_t nb, d2;
127
- float *xb = fvecs_read("sift1M/sift_base.fvecs", &d2, &nb);
128
- assert(d == d2 || !"dataset does not have same dimension as train set");
129
-
130
- printf ("[%.3f s] Indexing database, size %ld*%ld\n",
131
- elapsed() - t0, nb, d);
132
-
133
- index->add(nb, xb);
134
-
135
- delete [] xb;
136
- }
137
-
138
- size_t nq;
139
- float *xq;
140
-
141
- {
142
- printf ("[%.3f s] Loading queries\n", elapsed() - t0);
143
-
144
- size_t d2;
145
- xq = fvecs_read("sift1M/sift_query.fvecs", &d2, &nq);
146
- assert(d == d2 || !"query does not have same dimension as train set");
147
-
148
- }
149
-
150
- size_t k; // nb of results per query in the GT
151
- faiss::Index::idx_t *gt; // nq * k matrix of ground-truth nearest-neighbors
152
-
153
- {
154
- printf ("[%.3f s] Loading ground truth for %ld queries\n",
155
- elapsed() - t0, nq);
156
-
157
- // load ground-truth and convert int to long
158
- size_t nq2;
159
- int *gt_int = ivecs_read("sift1M/sift_groundtruth.ivecs", &k, &nq2);
160
- assert(nq2 == nq || !"incorrect nb of ground truth entries");
161
-
162
- gt = new faiss::Index::idx_t[k * nq];
163
- for(int i = 0; i < k * nq; i++) {
164
- gt[i] = gt_int[i];
165
- }
166
- delete [] gt_int;
167
- }
168
-
169
- // Result of the auto-tuning
170
- std::string selected_params;
171
-
172
- { // run auto-tuning
173
-
174
- printf ("[%.3f s] Preparing auto-tune criterion 1-recall at 1 "
175
- "criterion, with k=%ld nq=%ld\n", elapsed() - t0, k, nq);
176
-
177
- faiss::OneRecallAtRCriterion crit(nq, 1);
178
- crit.set_groundtruth (k, nullptr, gt);
179
- crit.nnn = k; // by default, the criterion will request only 1 NN
180
-
181
- printf ("[%.3f s] Preparing auto-tune parameters\n", elapsed() - t0);
182
-
183
- faiss::ParameterSpace params;
184
- params.initialize(index);
185
-
186
- printf ("[%.3f s] Auto-tuning over %ld parameters (%ld combinations)\n",
187
- elapsed() - t0, params.parameter_ranges.size(),
188
- params.n_combinations());
189
-
190
- faiss::OperatingPoints ops;
191
- params.explore (index, nq, xq, crit, &ops);
192
-
193
- printf ("[%.3f s] Found the following operating points: \n",
194
- elapsed() - t0);
195
-
196
- ops.display ();
197
-
198
- // keep the first parameter that obtains > 0.5 1-recall@1
199
- for (int i = 0; i < ops.optimal_pts.size(); i++) {
200
- if (ops.optimal_pts[i].perf > 0.5) {
201
- selected_params = ops.optimal_pts[i].key;
202
- break;
203
- }
204
- }
205
- assert (selected_params.size() >= 0 ||
206
- !"could not find good enough op point");
207
- }
208
-
209
-
210
- { // Use the found configuration to perform a search
211
-
212
- faiss::ParameterSpace params;
213
-
214
- printf ("[%.3f s] Setting parameter configuration \"%s\" on index\n",
215
- elapsed() - t0, selected_params.c_str());
216
-
217
- params.set_index_parameters (index, selected_params.c_str());
218
-
219
- printf ("[%.3f s] Perform a search on %ld queries\n",
220
- elapsed() - t0, nq);
221
-
222
- // output buffers
223
- faiss::Index::idx_t *I = new faiss::Index::idx_t[nq * k];
224
- float *D = new float[nq * k];
225
-
226
- index->search(nq, xq, k, D, I);
227
-
228
- printf ("[%.3f s] Compute recalls\n", elapsed() - t0);
229
-
230
- // evaluate result by hand.
231
- int n_1 = 0, n_10 = 0, n_100 = 0;
232
- for(int i = 0; i < nq; i++) {
233
- int gt_nn = gt[i * k];
234
- for(int j = 0; j < k; j++) {
235
- if (I[i * k + j] == gt_nn) {
236
- if(j < 1) n_1++;
237
- if(j < 10) n_10++;
238
- if(j < 100) n_100++;
239
- }
240
- }
241
- }
242
- printf("R@1 = %.4f\n", n_1 / float(nq));
243
- printf("R@10 = %.4f\n", n_10 / float(nq));
244
- printf("R@100 = %.4f\n", n_100 / float(nq));
245
-
246
- }
247
-
248
- delete [] xq;
249
- delete [] gt;
250
- delete index;
251
- return 0;
252
- }
@@ -1,185 +0,0 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
3
- *
4
- * This source code is licensed under the MIT license found in the
5
- * LICENSE file in the root directory of this source tree.
6
- */
7
-
8
- #include <cstdio>
9
- #include <cstdlib>
10
-
11
- #include <faiss/Clustering.h>
12
- #include <faiss/utils/random.h>
13
- #include <faiss/utils/distances.h>
14
- #include <faiss/IndexFlat.h>
15
- #include <faiss/IndexHNSW.h>
16
-
17
-
18
- namespace {
19
-
20
-
21
- enum WeightedKMeansType {
22
- WKMT_FlatL2,
23
- WKMT_FlatIP,
24
- WKMT_FlatIP_spherical,
25
- WKMT_HNSW,
26
- };
27
-
28
-
29
- float weighted_kmeans_clustering (size_t d, size_t n, size_t k,
30
- const float *input,
31
- const float *weights,
32
- float *centroids,
33
- WeightedKMeansType index_num)
34
- {
35
- using namespace faiss;
36
- Clustering clus (d, k);
37
- clus.verbose = true;
38
-
39
- std::unique_ptr<Index> index;
40
-
41
- switch (index_num) {
42
- case WKMT_FlatL2:
43
- index.reset(new IndexFlatL2 (d));
44
- break;
45
- case WKMT_FlatIP:
46
- index.reset(new IndexFlatIP (d));
47
- break;
48
- case WKMT_FlatIP_spherical:
49
- index.reset(new IndexFlatIP (d));
50
- clus.spherical = true;
51
- break;
52
- case WKMT_HNSW:
53
- IndexHNSWFlat *ihnsw = new IndexHNSWFlat (d, 32);
54
- ihnsw->hnsw.efSearch = 128;
55
- index.reset(ihnsw);
56
- break;
57
- }
58
-
59
- clus.train(n, input, *index.get(), weights);
60
- // on output the index contains the centroids.
61
- memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k);
62
- return clus.iteration_stats.back().obj;
63
- }
64
-
65
-
66
- int d = 32;
67
- float sigma = 0.1;
68
-
69
- #define BIGTEST
70
-
71
- #ifdef BIGTEST
72
- // the production setup = setting of https://fb.quip.com/CWgnAAYbwtgs
73
- int nc = 200000;
74
- int n_big = 4;
75
- int n_small = 2;
76
- #else
77
- int nc = 5;
78
- int n_big = 100;
79
- int n_small = 10;
80
- #endif
81
-
82
- int n; // number of training points
83
-
84
- void generate_trainset (std::vector<float> & ccent,
85
- std::vector<float> & x,
86
- std::vector<float> & weights)
87
- {
88
- // same sampling as test_build_blocks.py test_weighted
89
-
90
- ccent.resize (d * 2 * nc);
91
- faiss::float_randn (ccent.data(), d * 2 * nc, 123);
92
- faiss::fvec_renorm_L2 (d, 2 * nc, ccent.data());
93
- n = nc * n_big + nc * n_small;
94
- x.resize(d * n);
95
- weights.resize(n);
96
- faiss::float_randn (x.data(), x.size(), 1234);
97
-
98
- float *xi = x.data();
99
- float *w = weights.data();
100
- for (int ci = 0; ci < nc * 2; ci++) { // loop over centroids
101
- int np = ci < nc ? n_big : n_small; // nb of points around this centroid
102
- for (int i = 0; i < np; i++) {
103
- for (int j = 0; j < d; j++) {
104
- xi[j] = xi[j] * sigma + ccent[ci * d + j];
105
- }
106
- *w++ = ci < nc ? 0.1 : 10;
107
- xi += d;
108
- }
109
- }
110
- }
111
-
112
- }
113
-
114
-
115
- int main(int argc, char **argv) {
116
- std::vector<float> ccent;
117
- std::vector<float> x;
118
- std::vector<float> weights;
119
-
120
- printf("generate training set\n");
121
- generate_trainset(ccent, x, weights);
122
-
123
- std::vector<float> centroids;
124
- centroids.resize(nc * d);
125
-
126
- int the_index_num = -1;
127
- int the_with_weights = -1;
128
-
129
- if (argc == 3) {
130
- the_index_num = atoi(argv[1]);
131
- the_with_weights = atoi(argv[2]);
132
- }
133
-
134
-
135
- for (int index_num = WKMT_FlatL2;
136
- index_num <= WKMT_HNSW;
137
- index_num++) {
138
-
139
- if (the_index_num >= 0 && index_num != the_index_num) {
140
- continue;
141
- }
142
-
143
- for (int with_weights = 0; with_weights <= 1; with_weights++) {
144
- if (the_with_weights >= 0 && with_weights != the_with_weights) {
145
- continue;
146
- }
147
-
148
- printf("=================== index_num=%d Run %s weights\n",
149
- index_num, with_weights ? "with" : "without");
150
-
151
- weighted_kmeans_clustering (
152
- d, n, nc, x.data(),
153
- with_weights ? weights.data() : nullptr,
154
- centroids.data(), (WeightedKMeansType)index_num
155
- );
156
-
157
- { // compute distance of points to centroids
158
- faiss::IndexFlatL2 cent_index(d);
159
- cent_index.add(nc, centroids.data());
160
- std::vector<float> dis (n);
161
- std::vector<faiss::Index::idx_t> idx (n);
162
-
163
- cent_index.search (nc * 2, ccent.data(), 1,
164
- dis.data(), idx.data());
165
-
166
- float dis1 = 0, dis2 = 0;
167
- for (int i = 0; i < nc ; i++) {
168
- dis1 += dis[i];
169
- }
170
- printf("average distance of points from big clusters: %g\n",
171
- dis1 / nc);
172
-
173
- for (int i = 0; i < nc ; i++) {
174
- dis2 += dis[i + nc];
175
- }
176
-
177
- printf("average distance of points from small clusters: %g\n",
178
- dis2 / nc);
179
-
180
- }
181
-
182
- }
183
- }
184
- return 0;
185
- }