faiss 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +1 -1
  5. data/ext/faiss/extconf.rb +1 -1
  6. data/lib/faiss/version.rb +1 -1
  7. data/vendor/faiss/faiss/AutoTune.cpp +36 -33
  8. data/vendor/faiss/faiss/AutoTune.h +6 -3
  9. data/vendor/faiss/faiss/Clustering.cpp +16 -12
  10. data/vendor/faiss/faiss/Index.cpp +3 -4
  11. data/vendor/faiss/faiss/Index.h +3 -3
  12. data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
  13. data/vendor/faiss/faiss/IndexBinary.h +1 -1
  14. data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
  15. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
  16. data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
  17. data/vendor/faiss/faiss/IndexFlat.h +0 -51
  18. data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
  19. data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
  20. data/vendor/faiss/faiss/IndexIVF.h +22 -15
  21. data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
  22. data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
  23. data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
  24. data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
  25. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
  26. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
  27. data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
  28. data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
  29. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
  30. data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
  31. data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
  32. data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
  33. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
  34. data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
  35. data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
  36. data/vendor/faiss/faiss/IndexRefine.h +73 -0
  37. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
  38. data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
  39. data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
  40. data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
  41. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
  42. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
  43. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
  44. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
  45. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
  46. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
  47. data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
  48. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
  49. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
  50. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
  51. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
  52. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
  53. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
  54. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
  55. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
  56. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
  57. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
  58. data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
  59. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
  60. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
  61. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
  62. data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
  63. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
  64. data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
  65. data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
  66. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
  67. data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
  68. data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
  69. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
  70. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
  71. data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
  72. data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
  73. data/vendor/faiss/faiss/impl/io.cpp +33 -2
  74. data/vendor/faiss/faiss/impl/io.h +7 -2
  75. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
  76. data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
  77. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
  78. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
  79. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
  80. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
  81. data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
  82. data/vendor/faiss/faiss/index_factory.cpp +112 -7
  83. data/vendor/faiss/faiss/index_io.h +1 -48
  84. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
  85. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
  86. data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
  87. data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
  88. data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
  89. data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
  90. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
  91. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
  92. data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
  93. data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
  94. data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
  95. data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
  96. data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
  97. data/vendor/faiss/faiss/utils/Heap.h +61 -50
  98. data/vendor/faiss/faiss/utils/distances.cpp +164 -319
  99. data/vendor/faiss/faiss/utils/distances.h +28 -20
  100. data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
  101. data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
  102. data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
  103. data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
  104. data/vendor/faiss/faiss/utils/hamming.h +2 -7
  105. data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
  106. data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
  107. data/vendor/faiss/faiss/utils/partitioning.h +69 -0
  108. data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
  109. data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
  110. data/vendor/faiss/faiss/utils/simdlib.h +31 -0
  111. data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
  112. data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
  113. metadata +43 -141
  114. data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
  115. data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
  116. data/vendor/faiss/c_api/AutoTune_c.h +0 -66
  117. data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
  118. data/vendor/faiss/c_api/Clustering_c.h +0 -123
  119. data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
  120. data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
  121. data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
  122. data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
  123. data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
  124. data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
  125. data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
  126. data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
  127. data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
  128. data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
  129. data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
  130. data/vendor/faiss/c_api/IndexShards_c.h +0 -39
  131. data/vendor/faiss/c_api/Index_c.cpp +0 -105
  132. data/vendor/faiss/c_api/Index_c.h +0 -183
  133. data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
  134. data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
  135. data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
  136. data/vendor/faiss/c_api/clone_index_c.h +0 -32
  137. data/vendor/faiss/c_api/error_c.h +0 -42
  138. data/vendor/faiss/c_api/error_impl.cpp +0 -27
  139. data/vendor/faiss/c_api/error_impl.h +0 -16
  140. data/vendor/faiss/c_api/faiss_c.h +0 -58
  141. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
  142. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
  143. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
  144. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
  145. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
  146. data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
  147. data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
  148. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
  149. data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
  150. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
  151. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
  152. data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
  153. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
  154. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
  155. data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
  156. data/vendor/faiss/c_api/index_factory_c.h +0 -30
  157. data/vendor/faiss/c_api/index_io_c.cpp +0 -42
  158. data/vendor/faiss/c_api/index_io_c.h +0 -50
  159. data/vendor/faiss/c_api/macros_impl.h +0 -110
  160. data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
  161. data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
  162. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
  163. data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
  164. data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
  165. data/vendor/faiss/misc/test_blas.cpp +0 -87
  166. data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
  167. data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
  168. data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
  169. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
  170. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
  171. data/vendor/faiss/tests/test_merge.cpp +0 -260
  172. data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
  173. data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
  174. data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
  175. data/vendor/faiss/tests/test_params_override.cpp +0 -236
  176. data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
  177. data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
  178. data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
  179. data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
  180. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
  181. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
  182. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
  183. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
  184. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104
@@ -1,151 +0,0 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
3
- *
4
- * This source code is licensed under the MIT license found in the
5
- * LICENSE file in the root directory of this source tree.
6
- */
7
-
8
-
9
-
10
- #include <cmath>
11
- #include <cstdio>
12
- #include <cstdlib>
13
- #include <random>
14
-
15
- #include <sys/time.h>
16
-
17
-
18
- #include <faiss/IndexIVFPQ.h>
19
- #include <faiss/IndexFlat.h>
20
- #include <faiss/index_io.h>
21
-
22
- double elapsed ()
23
- {
24
- struct timeval tv;
25
- gettimeofday (&tv, NULL);
26
- return tv.tv_sec + tv.tv_usec * 1e-6;
27
- }
28
-
29
-
30
- int main ()
31
- {
32
-
33
- double t0 = elapsed();
34
-
35
- // dimension of the vectors to index
36
- int d = 128;
37
-
38
- // size of the database we plan to index
39
- size_t nb = 200 * 1000;
40
-
41
- // make a set of nt training vectors in the unit cube
42
- // (could be the database)
43
- size_t nt = 100 * 1000;
44
-
45
- // make the index object and train it
46
- faiss::IndexFlatL2 coarse_quantizer (d);
47
-
48
- // a reasonable number of centroids to index nb vectors
49
- int ncentroids = int (4 * sqrt (nb));
50
-
51
- // the coarse quantizer should not be dealloced before the index
52
- // 4 = nb of bytes per code (d must be a multiple of this)
53
- // 8 = nb of bits per sub-code (almost always 8)
54
- faiss::IndexIVFPQ index (&coarse_quantizer, d,
55
- ncentroids, 4, 8);
56
-
57
-
58
- std::mt19937 rng;
59
-
60
- { // training
61
- printf ("[%.3f s] Generating %ld vectors in %dD for training\n",
62
- elapsed() - t0, nt, d);
63
-
64
- std::vector <float> trainvecs (nt * d);
65
- std::uniform_real_distribution<> distrib;
66
- for (size_t i = 0; i < nt * d; i++) {
67
- trainvecs[i] = distrib(rng);
68
- }
69
-
70
- printf ("[%.3f s] Training the index\n",
71
- elapsed() - t0);
72
- index.verbose = true;
73
-
74
- index.train (nt, trainvecs.data());
75
- }
76
-
77
- { // I/O demo
78
- const char *outfilename = "/tmp/index_trained.faissindex";
79
- printf ("[%.3f s] storing the pre-trained index to %s\n",
80
- elapsed() - t0, outfilename);
81
-
82
- write_index (&index, outfilename);
83
- }
84
-
85
- size_t nq;
86
- std::vector<float> queries;
87
-
88
- { // populating the database
89
- printf ("[%.3f s] Building a dataset of %ld vectors to index\n",
90
- elapsed() - t0, nb);
91
-
92
- std::vector <float> database (nb * d);
93
- std::uniform_real_distribution<> distrib;
94
- for (size_t i = 0; i < nb * d; i++) {
95
- database[i] = distrib(rng);
96
- }
97
-
98
- printf ("[%.3f s] Adding the vectors to the index\n",
99
- elapsed() - t0);
100
-
101
- index.add (nb, database.data());
102
-
103
- printf ("[%.3f s] imbalance factor: %g\n",
104
- elapsed() - t0, index.invlists->imbalance_factor ());
105
-
106
- // remember a few elements from the database as queries
107
- int i0 = 1234;
108
- int i1 = 1243;
109
-
110
- nq = i1 - i0;
111
- queries.resize (nq * d);
112
- for (int i = i0; i < i1; i++) {
113
- for (int j = 0; j < d; j++) {
114
- queries [(i - i0) * d + j] = database [i * d + j];
115
- }
116
- }
117
-
118
- }
119
-
120
- { // searching the database
121
- int k = 5;
122
- printf ("[%.3f s] Searching the %d nearest neighbors "
123
- "of %ld vectors in the index\n",
124
- elapsed() - t0, k, nq);
125
-
126
- std::vector<faiss::Index::idx_t> nns (k * nq);
127
- std::vector<float> dis (k * nq);
128
-
129
- index.search (nq, queries.data(), k, dis.data(), nns.data());
130
-
131
- printf ("[%.3f s] Query results (vector ids, then distances):\n",
132
- elapsed() - t0);
133
-
134
- for (int i = 0; i < nq; i++) {
135
- printf ("query %2d: ", i);
136
- for (int j = 0; j < k; j++) {
137
- printf ("%7ld ", nns[j + i * k]);
138
- }
139
- printf ("\n dis: ");
140
- for (int j = 0; j < k; j++) {
141
- printf ("%7g ", dis[j + i * k]);
142
- }
143
- printf ("\n");
144
- }
145
-
146
- printf ("note that the nearest neighbor is not at "
147
- "distance 0 due to quantization errors\n");
148
- }
149
-
150
- return 0;
151
- }
@@ -1,252 +0,0 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
3
- *
4
- * This source code is licensed under the MIT license found in the
5
- * LICENSE file in the root directory of this source tree.
6
- */
7
-
8
-
9
-
10
- #include <cmath>
11
- #include <cstdio>
12
- #include <cstdlib>
13
- #include <cassert>
14
- #include <cstring>
15
-
16
- #include <sys/types.h>
17
- #include <sys/stat.h>
18
- #include <unistd.h>
19
-
20
- #include <sys/time.h>
21
-
22
- #include <faiss/AutoTune.h>
23
- #include <faiss/index_factory.h>
24
-
25
- /**
26
- * To run this demo, please download the ANN_SIFT1M dataset from
27
- *
28
- * http://corpus-texmex.irisa.fr/
29
- *
30
- * and unzip it to the subdirectory sift1M.
31
- **/
32
-
33
- /*****************************************************
34
- * I/O functions for fvecs and ivecs
35
- *****************************************************/
36
-
37
-
38
- float * fvecs_read (const char *fname,
39
- size_t *d_out, size_t *n_out)
40
- {
41
- FILE *f = fopen(fname, "r");
42
- if(!f) {
43
- fprintf(stderr, "could not open %s\n", fname);
44
- perror("");
45
- abort();
46
- }
47
- int d;
48
- fread(&d, 1, sizeof(int), f);
49
- assert((d > 0 && d < 1000000) || !"unreasonable dimension");
50
- fseek(f, 0, SEEK_SET);
51
- struct stat st;
52
- fstat(fileno(f), &st);
53
- size_t sz = st.st_size;
54
- assert(sz % ((d + 1) * 4) == 0 || !"weird file size");
55
- size_t n = sz / ((d + 1) * 4);
56
-
57
- *d_out = d; *n_out = n;
58
- float *x = new float[n * (d + 1)];
59
- size_t nr = fread(x, sizeof(float), n * (d + 1), f);
60
- assert(nr == n * (d + 1) || !"could not read whole file");
61
-
62
- // shift array to remove row headers
63
- for(size_t i = 0; i < n; i++)
64
- memmove(x + i * d, x + 1 + i * (d + 1), d * sizeof(*x));
65
-
66
- fclose(f);
67
- return x;
68
- }
69
-
70
- // not very clean, but works as long as sizeof(int) == sizeof(float)
71
- int *ivecs_read(const char *fname, size_t *d_out, size_t *n_out)
72
- {
73
- return (int*)fvecs_read(fname, d_out, n_out);
74
- }
75
-
76
- double elapsed ()
77
- {
78
- struct timeval tv;
79
- gettimeofday (&tv, nullptr);
80
- return tv.tv_sec + tv.tv_usec * 1e-6;
81
- }
82
-
83
-
84
-
85
- int main()
86
- {
87
- double t0 = elapsed();
88
-
89
- // this is typically the fastest one.
90
- const char *index_key = "IVF4096,Flat";
91
-
92
- // these ones have better memory usage
93
- // const char *index_key = "Flat";
94
- // const char *index_key = "PQ32";
95
- // const char *index_key = "PCA80,Flat";
96
- // const char *index_key = "IVF4096,PQ8+16";
97
- // const char *index_key = "IVF4096,PQ32";
98
- // const char *index_key = "IMI2x8,PQ32";
99
- // const char *index_key = "IMI2x8,PQ8+16";
100
- // const char *index_key = "OPQ16_64,IMI2x8,PQ8+16";
101
-
102
- faiss::Index * index;
103
-
104
- size_t d;
105
-
106
- {
107
- printf ("[%.3f s] Loading train set\n", elapsed() - t0);
108
-
109
- size_t nt;
110
- float *xt = fvecs_read("sift1M/sift_learn.fvecs", &d, &nt);
111
-
112
- printf ("[%.3f s] Preparing index \"%s\" d=%ld\n",
113
- elapsed() - t0, index_key, d);
114
- index = faiss::index_factory(d, index_key);
115
-
116
- printf ("[%.3f s] Training on %ld vectors\n", elapsed() - t0, nt);
117
-
118
- index->train(nt, xt);
119
- delete [] xt;
120
- }
121
-
122
-
123
- {
124
- printf ("[%.3f s] Loading database\n", elapsed() - t0);
125
-
126
- size_t nb, d2;
127
- float *xb = fvecs_read("sift1M/sift_base.fvecs", &d2, &nb);
128
- assert(d == d2 || !"dataset does not have same dimension as train set");
129
-
130
- printf ("[%.3f s] Indexing database, size %ld*%ld\n",
131
- elapsed() - t0, nb, d);
132
-
133
- index->add(nb, xb);
134
-
135
- delete [] xb;
136
- }
137
-
138
- size_t nq;
139
- float *xq;
140
-
141
- {
142
- printf ("[%.3f s] Loading queries\n", elapsed() - t0);
143
-
144
- size_t d2;
145
- xq = fvecs_read("sift1M/sift_query.fvecs", &d2, &nq);
146
- assert(d == d2 || !"query does not have same dimension as train set");
147
-
148
- }
149
-
150
- size_t k; // nb of results per query in the GT
151
- faiss::Index::idx_t *gt; // nq * k matrix of ground-truth nearest-neighbors
152
-
153
- {
154
- printf ("[%.3f s] Loading ground truth for %ld queries\n",
155
- elapsed() - t0, nq);
156
-
157
- // load ground-truth and convert int to long
158
- size_t nq2;
159
- int *gt_int = ivecs_read("sift1M/sift_groundtruth.ivecs", &k, &nq2);
160
- assert(nq2 == nq || !"incorrect nb of ground truth entries");
161
-
162
- gt = new faiss::Index::idx_t[k * nq];
163
- for(int i = 0; i < k * nq; i++) {
164
- gt[i] = gt_int[i];
165
- }
166
- delete [] gt_int;
167
- }
168
-
169
- // Result of the auto-tuning
170
- std::string selected_params;
171
-
172
- { // run auto-tuning
173
-
174
- printf ("[%.3f s] Preparing auto-tune criterion 1-recall at 1 "
175
- "criterion, with k=%ld nq=%ld\n", elapsed() - t0, k, nq);
176
-
177
- faiss::OneRecallAtRCriterion crit(nq, 1);
178
- crit.set_groundtruth (k, nullptr, gt);
179
- crit.nnn = k; // by default, the criterion will request only 1 NN
180
-
181
- printf ("[%.3f s] Preparing auto-tune parameters\n", elapsed() - t0);
182
-
183
- faiss::ParameterSpace params;
184
- params.initialize(index);
185
-
186
- printf ("[%.3f s] Auto-tuning over %ld parameters (%ld combinations)\n",
187
- elapsed() - t0, params.parameter_ranges.size(),
188
- params.n_combinations());
189
-
190
- faiss::OperatingPoints ops;
191
- params.explore (index, nq, xq, crit, &ops);
192
-
193
- printf ("[%.3f s] Found the following operating points: \n",
194
- elapsed() - t0);
195
-
196
- ops.display ();
197
-
198
- // keep the first parameter that obtains > 0.5 1-recall@1
199
- for (int i = 0; i < ops.optimal_pts.size(); i++) {
200
- if (ops.optimal_pts[i].perf > 0.5) {
201
- selected_params = ops.optimal_pts[i].key;
202
- break;
203
- }
204
- }
205
- assert (selected_params.size() >= 0 ||
206
- !"could not find good enough op point");
207
- }
208
-
209
-
210
- { // Use the found configuration to perform a search
211
-
212
- faiss::ParameterSpace params;
213
-
214
- printf ("[%.3f s] Setting parameter configuration \"%s\" on index\n",
215
- elapsed() - t0, selected_params.c_str());
216
-
217
- params.set_index_parameters (index, selected_params.c_str());
218
-
219
- printf ("[%.3f s] Perform a search on %ld queries\n",
220
- elapsed() - t0, nq);
221
-
222
- // output buffers
223
- faiss::Index::idx_t *I = new faiss::Index::idx_t[nq * k];
224
- float *D = new float[nq * k];
225
-
226
- index->search(nq, xq, k, D, I);
227
-
228
- printf ("[%.3f s] Compute recalls\n", elapsed() - t0);
229
-
230
- // evaluate result by hand.
231
- int n_1 = 0, n_10 = 0, n_100 = 0;
232
- for(int i = 0; i < nq; i++) {
233
- int gt_nn = gt[i * k];
234
- for(int j = 0; j < k; j++) {
235
- if (I[i * k + j] == gt_nn) {
236
- if(j < 1) n_1++;
237
- if(j < 10) n_10++;
238
- if(j < 100) n_100++;
239
- }
240
- }
241
- }
242
- printf("R@1 = %.4f\n", n_1 / float(nq));
243
- printf("R@10 = %.4f\n", n_10 / float(nq));
244
- printf("R@100 = %.4f\n", n_100 / float(nq));
245
-
246
- }
247
-
248
- delete [] xq;
249
- delete [] gt;
250
- delete index;
251
- return 0;
252
- }
@@ -1,185 +0,0 @@
1
- /**
2
- * Copyright (c) Facebook, Inc. and its affiliates.
3
- *
4
- * This source code is licensed under the MIT license found in the
5
- * LICENSE file in the root directory of this source tree.
6
- */
7
-
8
- #include <cstdio>
9
- #include <cstdlib>
10
-
11
- #include <faiss/Clustering.h>
12
- #include <faiss/utils/random.h>
13
- #include <faiss/utils/distances.h>
14
- #include <faiss/IndexFlat.h>
15
- #include <faiss/IndexHNSW.h>
16
-
17
-
18
- namespace {
19
-
20
-
21
- enum WeightedKMeansType {
22
- WKMT_FlatL2,
23
- WKMT_FlatIP,
24
- WKMT_FlatIP_spherical,
25
- WKMT_HNSW,
26
- };
27
-
28
-
29
- float weighted_kmeans_clustering (size_t d, size_t n, size_t k,
30
- const float *input,
31
- const float *weights,
32
- float *centroids,
33
- WeightedKMeansType index_num)
34
- {
35
- using namespace faiss;
36
- Clustering clus (d, k);
37
- clus.verbose = true;
38
-
39
- std::unique_ptr<Index> index;
40
-
41
- switch (index_num) {
42
- case WKMT_FlatL2:
43
- index.reset(new IndexFlatL2 (d));
44
- break;
45
- case WKMT_FlatIP:
46
- index.reset(new IndexFlatIP (d));
47
- break;
48
- case WKMT_FlatIP_spherical:
49
- index.reset(new IndexFlatIP (d));
50
- clus.spherical = true;
51
- break;
52
- case WKMT_HNSW:
53
- IndexHNSWFlat *ihnsw = new IndexHNSWFlat (d, 32);
54
- ihnsw->hnsw.efSearch = 128;
55
- index.reset(ihnsw);
56
- break;
57
- }
58
-
59
- clus.train(n, input, *index.get(), weights);
60
- // on output the index contains the centroids.
61
- memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k);
62
- return clus.iteration_stats.back().obj;
63
- }
64
-
65
-
66
- int d = 32;
67
- float sigma = 0.1;
68
-
69
- #define BIGTEST
70
-
71
- #ifdef BIGTEST
72
- // the production setup = setting of https://fb.quip.com/CWgnAAYbwtgs
73
- int nc = 200000;
74
- int n_big = 4;
75
- int n_small = 2;
76
- #else
77
- int nc = 5;
78
- int n_big = 100;
79
- int n_small = 10;
80
- #endif
81
-
82
- int n; // number of training points
83
-
84
- void generate_trainset (std::vector<float> & ccent,
85
- std::vector<float> & x,
86
- std::vector<float> & weights)
87
- {
88
- // same sampling as test_build_blocks.py test_weighted
89
-
90
- ccent.resize (d * 2 * nc);
91
- faiss::float_randn (ccent.data(), d * 2 * nc, 123);
92
- faiss::fvec_renorm_L2 (d, 2 * nc, ccent.data());
93
- n = nc * n_big + nc * n_small;
94
- x.resize(d * n);
95
- weights.resize(n);
96
- faiss::float_randn (x.data(), x.size(), 1234);
97
-
98
- float *xi = x.data();
99
- float *w = weights.data();
100
- for (int ci = 0; ci < nc * 2; ci++) { // loop over centroids
101
- int np = ci < nc ? n_big : n_small; // nb of points around this centroid
102
- for (int i = 0; i < np; i++) {
103
- for (int j = 0; j < d; j++) {
104
- xi[j] = xi[j] * sigma + ccent[ci * d + j];
105
- }
106
- *w++ = ci < nc ? 0.1 : 10;
107
- xi += d;
108
- }
109
- }
110
- }
111
-
112
- }
113
-
114
-
115
- int main(int argc, char **argv) {
116
- std::vector<float> ccent;
117
- std::vector<float> x;
118
- std::vector<float> weights;
119
-
120
- printf("generate training set\n");
121
- generate_trainset(ccent, x, weights);
122
-
123
- std::vector<float> centroids;
124
- centroids.resize(nc * d);
125
-
126
- int the_index_num = -1;
127
- int the_with_weights = -1;
128
-
129
- if (argc == 3) {
130
- the_index_num = atoi(argv[1]);
131
- the_with_weights = atoi(argv[2]);
132
- }
133
-
134
-
135
- for (int index_num = WKMT_FlatL2;
136
- index_num <= WKMT_HNSW;
137
- index_num++) {
138
-
139
- if (the_index_num >= 0 && index_num != the_index_num) {
140
- continue;
141
- }
142
-
143
- for (int with_weights = 0; with_weights <= 1; with_weights++) {
144
- if (the_with_weights >= 0 && with_weights != the_with_weights) {
145
- continue;
146
- }
147
-
148
- printf("=================== index_num=%d Run %s weights\n",
149
- index_num, with_weights ? "with" : "without");
150
-
151
- weighted_kmeans_clustering (
152
- d, n, nc, x.data(),
153
- with_weights ? weights.data() : nullptr,
154
- centroids.data(), (WeightedKMeansType)index_num
155
- );
156
-
157
- { // compute distance of points to centroids
158
- faiss::IndexFlatL2 cent_index(d);
159
- cent_index.add(nc, centroids.data());
160
- std::vector<float> dis (n);
161
- std::vector<faiss::Index::idx_t> idx (n);
162
-
163
- cent_index.search (nc * 2, ccent.data(), 1,
164
- dis.data(), idx.data());
165
-
166
- float dis1 = 0, dis2 = 0;
167
- for (int i = 0; i < nc ; i++) {
168
- dis1 += dis[i];
169
- }
170
- printf("average distance of points from big clusters: %g\n",
171
- dis1 / nc);
172
-
173
- for (int i = 0; i < nc ; i++) {
174
- dis2 += dis[i + nc];
175
- }
176
-
177
- printf("average distance of points from small clusters: %g\n",
178
- dis2 / nc);
179
-
180
- }
181
-
182
- }
183
- }
184
- return 0;
185
- }