faiss 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/ext/faiss/extconf.rb +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +36 -33
- data/vendor/faiss/faiss/AutoTune.h +6 -3
- data/vendor/faiss/faiss/Clustering.cpp +16 -12
- data/vendor/faiss/faiss/Index.cpp +3 -4
- data/vendor/faiss/faiss/Index.h +3 -3
- data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
- data/vendor/faiss/faiss/IndexBinary.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
- data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
- data/vendor/faiss/faiss/IndexFlat.h +0 -51
- data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
- data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
- data/vendor/faiss/faiss/IndexIVF.h +22 -15
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
- data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
- data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
- data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
- data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
- data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
- data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
- data/vendor/faiss/faiss/IndexRefine.h +73 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
- data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
- data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
- data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
- data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
- data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
- data/vendor/faiss/faiss/impl/io.cpp +33 -2
- data/vendor/faiss/faiss/impl/io.h +7 -2
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
- data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
- data/vendor/faiss/faiss/index_factory.cpp +112 -7
- data/vendor/faiss/faiss/index_io.h +1 -48
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
- data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
- data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
- data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
- data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
- data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
- data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
- data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
- data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
- data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
- data/vendor/faiss/faiss/utils/Heap.h +61 -50
- data/vendor/faiss/faiss/utils/distances.cpp +164 -319
- data/vendor/faiss/faiss/utils/distances.h +28 -20
- data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
- data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
- data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
- data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
- data/vendor/faiss/faiss/utils/hamming.h +2 -7
- data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
- data/vendor/faiss/faiss/utils/partitioning.h +69 -0
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
- data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
- data/vendor/faiss/faiss/utils/simdlib.h +31 -0
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
- metadata +43 -141
- data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
- data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
- data/vendor/faiss/c_api/AutoTune_c.h +0 -66
- data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
- data/vendor/faiss/c_api/Clustering_c.h +0 -123
- data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
- data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
- data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
- data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
- data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
- data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
- data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
- data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
- data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
- data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
- data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
- data/vendor/faiss/c_api/IndexShards_c.h +0 -39
- data/vendor/faiss/c_api/Index_c.cpp +0 -105
- data/vendor/faiss/c_api/Index_c.h +0 -183
- data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
- data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
- data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
- data/vendor/faiss/c_api/clone_index_c.h +0 -32
- data/vendor/faiss/c_api/error_c.h +0 -42
- data/vendor/faiss/c_api/error_impl.cpp +0 -27
- data/vendor/faiss/c_api/error_impl.h +0 -16
- data/vendor/faiss/c_api/faiss_c.h +0 -58
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
- data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
- data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
- data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
- data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
- data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
- data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
- data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
- data/vendor/faiss/c_api/index_factory_c.h +0 -30
- data/vendor/faiss/c_api/index_io_c.cpp +0 -42
- data/vendor/faiss/c_api/index_io_c.h +0 -50
- data/vendor/faiss/c_api/macros_impl.h +0 -110
- data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
- data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
- data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
- data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
- data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
- data/vendor/faiss/misc/test_blas.cpp +0 -87
- data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
- data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
- data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
- data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
- data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
- data/vendor/faiss/tests/test_merge.cpp +0 -260
- data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
- data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
- data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
- data/vendor/faiss/tests/test_params_override.cpp +0 -236
- data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
- data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
- data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
- data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
- data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
- data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
- data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
- data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
- data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104
@@ -1,151 +0,0 @@
|
|
1
|
-
/**
|
2
|
-
* Copyright (c) Facebook, Inc. and its affiliates.
|
3
|
-
*
|
4
|
-
* This source code is licensed under the MIT license found in the
|
5
|
-
* LICENSE file in the root directory of this source tree.
|
6
|
-
*/
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
#include <cmath>
|
11
|
-
#include <cstdio>
|
12
|
-
#include <cstdlib>
|
13
|
-
#include <random>
|
14
|
-
|
15
|
-
#include <sys/time.h>
|
16
|
-
|
17
|
-
|
18
|
-
#include <faiss/IndexIVFPQ.h>
|
19
|
-
#include <faiss/IndexFlat.h>
|
20
|
-
#include <faiss/index_io.h>
|
21
|
-
|
22
|
-
/// Current wall-clock time in seconds, as reported by gettimeofday()
/// (whole seconds plus the microsecond part scaled to seconds).
double elapsed ()
{
    struct timeval now;
    gettimeofday (&now, nullptr);
    const double fraction = now.tv_usec * 1e-6;
    return now.tv_sec + fraction;
}
|
28
|
-
|
29
|
-
|
30
|
-
int main ()
|
31
|
-
{
|
32
|
-
|
33
|
-
double t0 = elapsed();
|
34
|
-
|
35
|
-
// dimension of the vectors to index
|
36
|
-
int d = 128;
|
37
|
-
|
38
|
-
// size of the database we plan to index
|
39
|
-
size_t nb = 200 * 1000;
|
40
|
-
|
41
|
-
// make a set of nt training vectors in the unit cube
|
42
|
-
// (could be the database)
|
43
|
-
size_t nt = 100 * 1000;
|
44
|
-
|
45
|
-
// make the index object and train it
|
46
|
-
faiss::IndexFlatL2 coarse_quantizer (d);
|
47
|
-
|
48
|
-
// a reasonable number of centroids to index nb vectors
|
49
|
-
int ncentroids = int (4 * sqrt (nb));
|
50
|
-
|
51
|
-
// the coarse quantizer should not be dealloced before the index
|
52
|
-
// 4 = nb of bytes per code (d must be a multiple of this)
|
53
|
-
// 8 = nb of bits per sub-code (almost always 8)
|
54
|
-
faiss::IndexIVFPQ index (&coarse_quantizer, d,
|
55
|
-
ncentroids, 4, 8);
|
56
|
-
|
57
|
-
|
58
|
-
std::mt19937 rng;
|
59
|
-
|
60
|
-
{ // training
|
61
|
-
printf ("[%.3f s] Generating %ld vectors in %dD for training\n",
|
62
|
-
elapsed() - t0, nt, d);
|
63
|
-
|
64
|
-
std::vector <float> trainvecs (nt * d);
|
65
|
-
std::uniform_real_distribution<> distrib;
|
66
|
-
for (size_t i = 0; i < nt * d; i++) {
|
67
|
-
trainvecs[i] = distrib(rng);
|
68
|
-
}
|
69
|
-
|
70
|
-
printf ("[%.3f s] Training the index\n",
|
71
|
-
elapsed() - t0);
|
72
|
-
index.verbose = true;
|
73
|
-
|
74
|
-
index.train (nt, trainvecs.data());
|
75
|
-
}
|
76
|
-
|
77
|
-
{ // I/O demo
|
78
|
-
const char *outfilename = "/tmp/index_trained.faissindex";
|
79
|
-
printf ("[%.3f s] storing the pre-trained index to %s\n",
|
80
|
-
elapsed() - t0, outfilename);
|
81
|
-
|
82
|
-
write_index (&index, outfilename);
|
83
|
-
}
|
84
|
-
|
85
|
-
size_t nq;
|
86
|
-
std::vector<float> queries;
|
87
|
-
|
88
|
-
{ // populating the database
|
89
|
-
printf ("[%.3f s] Building a dataset of %ld vectors to index\n",
|
90
|
-
elapsed() - t0, nb);
|
91
|
-
|
92
|
-
std::vector <float> database (nb * d);
|
93
|
-
std::uniform_real_distribution<> distrib;
|
94
|
-
for (size_t i = 0; i < nb * d; i++) {
|
95
|
-
database[i] = distrib(rng);
|
96
|
-
}
|
97
|
-
|
98
|
-
printf ("[%.3f s] Adding the vectors to the index\n",
|
99
|
-
elapsed() - t0);
|
100
|
-
|
101
|
-
index.add (nb, database.data());
|
102
|
-
|
103
|
-
printf ("[%.3f s] imbalance factor: %g\n",
|
104
|
-
elapsed() - t0, index.invlists->imbalance_factor ());
|
105
|
-
|
106
|
-
// remember a few elements from the database as queries
|
107
|
-
int i0 = 1234;
|
108
|
-
int i1 = 1243;
|
109
|
-
|
110
|
-
nq = i1 - i0;
|
111
|
-
queries.resize (nq * d);
|
112
|
-
for (int i = i0; i < i1; i++) {
|
113
|
-
for (int j = 0; j < d; j++) {
|
114
|
-
queries [(i - i0) * d + j] = database [i * d + j];
|
115
|
-
}
|
116
|
-
}
|
117
|
-
|
118
|
-
}
|
119
|
-
|
120
|
-
{ // searching the database
|
121
|
-
int k = 5;
|
122
|
-
printf ("[%.3f s] Searching the %d nearest neighbors "
|
123
|
-
"of %ld vectors in the index\n",
|
124
|
-
elapsed() - t0, k, nq);
|
125
|
-
|
126
|
-
std::vector<faiss::Index::idx_t> nns (k * nq);
|
127
|
-
std::vector<float> dis (k * nq);
|
128
|
-
|
129
|
-
index.search (nq, queries.data(), k, dis.data(), nns.data());
|
130
|
-
|
131
|
-
printf ("[%.3f s] Query results (vector ids, then distances):\n",
|
132
|
-
elapsed() - t0);
|
133
|
-
|
134
|
-
for (int i = 0; i < nq; i++) {
|
135
|
-
printf ("query %2d: ", i);
|
136
|
-
for (int j = 0; j < k; j++) {
|
137
|
-
printf ("%7ld ", nns[j + i * k]);
|
138
|
-
}
|
139
|
-
printf ("\n dis: ");
|
140
|
-
for (int j = 0; j < k; j++) {
|
141
|
-
printf ("%7g ", dis[j + i * k]);
|
142
|
-
}
|
143
|
-
printf ("\n");
|
144
|
-
}
|
145
|
-
|
146
|
-
printf ("note that the nearest neighbor is not at "
|
147
|
-
"distance 0 due to quantization errors\n");
|
148
|
-
}
|
149
|
-
|
150
|
-
return 0;
|
151
|
-
}
|
@@ -1,252 +0,0 @@
|
|
1
|
-
/**
|
2
|
-
* Copyright (c) Facebook, Inc. and its affiliates.
|
3
|
-
*
|
4
|
-
* This source code is licensed under the MIT license found in the
|
5
|
-
* LICENSE file in the root directory of this source tree.
|
6
|
-
*/
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
#include <cmath>
|
11
|
-
#include <cstdio>
|
12
|
-
#include <cstdlib>
|
13
|
-
#include <cassert>
|
14
|
-
#include <cstring>
|
15
|
-
|
16
|
-
#include <sys/types.h>
|
17
|
-
#include <sys/stat.h>
|
18
|
-
#include <unistd.h>
|
19
|
-
|
20
|
-
#include <sys/time.h>
|
21
|
-
|
22
|
-
#include <faiss/AutoTune.h>
|
23
|
-
#include <faiss/index_factory.h>
|
24
|
-
|
25
|
-
/**
|
26
|
-
* To run this demo, please download the ANN_SIFT1M dataset from
|
27
|
-
*
|
28
|
-
* http://corpus-texmex.irisa.fr/
|
29
|
-
*
|
30
|
-
* and unzip it to the subdirectory sift1M.
|
31
|
-
**/
|
32
|
-
|
33
|
-
/*****************************************************
|
34
|
-
* I/O functions for fvecs and ivecs
|
35
|
-
*****************************************************/
|
36
|
-
|
37
|
-
|
38
|
-
/**
 * Read a whole .fvecs file into memory.
 *
 * The file is treated as a sequence of rows, each made of one int (the
 * dimension d) followed by d floats. The dimension is taken from the
 * first row and the number of rows is derived from the file size.
 *
 * The process is aborted (with a message on stderr) when the file cannot
 * be opened, is empty, has an unreasonable dimension, has a size that is
 * not a whole number of rows, or cannot be read completely. These checks
 * are explicit so that they stay active when NDEBUG is defined.
 *
 * @param fname  path of the file to read
 * @param d_out  receives the dimension of the vectors
 * @param n_out  receives the number of vectors
 * @return array allocated with new[] whose first n * d floats are the
 *         vectors stored contiguously; the caller releases it with delete[]
 */
float * fvecs_read (const char *fname,
                    size_t *d_out, size_t *n_out)
{
    // binary mode: the file holds raw ints and floats, not text
    FILE *f = fopen(fname, "rb");
    if(!f) {
        fprintf(stderr, "could not open %s\n", fname);
        perror("");
        abort();
    }
    int d;
    if (fread(&d, sizeof(int), 1, f) != 1) {
        fprintf(stderr, "could not read dimension from %s\n", fname);
        abort();
    }
    if (!(d > 0 && d < 1000000)) {
        fprintf(stderr, "unreasonable dimension %d in %s\n", d, fname);
        abort();
    }
    fseek(f, 0, SEEK_SET);
    struct stat st;
    if (fstat(fileno(f), &st) != 0) {
        fprintf(stderr, "could not stat %s\n", fname);
        perror("");
        abort();
    }
    size_t sz = st.st_size;
    // one row = dimension header + d components, 4 bytes each
    const size_t row_bytes = (size_t(d) + 1) * 4;
    if (sz % row_bytes != 0) {
        fprintf(stderr, "weird file size for %s\n", fname);
        abort();
    }
    size_t n = sz / row_bytes;

    *d_out = d; *n_out = n;
    const size_t n_items = n * (size_t(d) + 1);
    float *x = new float[n_items];
    size_t nr = fread(x, sizeof(float), n_items, f);
    if (nr != n_items) {
        fprintf(stderr, "could not read whole file %s\n", fname);
        abort();
    }

    // shift array to remove row headers
    for(size_t i = 0; i < n; i++)
        memmove(x + i * d, x + 1 + i * (d + 1), d * sizeof(*x));

    fclose(f);
    return x;
}
|
69
|
-
|
70
|
-
// not very clean, but works as long as sizeof(int) == sizeof(float)
|
71
|
-
int *ivecs_read(const char *fname, size_t *d_out, size_t *n_out)
|
72
|
-
{
|
73
|
-
return (int*)fvecs_read(fname, d_out, n_out);
|
74
|
-
}
|
75
|
-
|
76
|
-
/// Current wall-clock time in seconds, as reported by gettimeofday()
/// (whole seconds plus the microsecond part scaled to seconds).
double elapsed ()
{
    struct timeval now;
    gettimeofday (&now, nullptr);
    const double fraction = now.tv_usec * 1e-6;
    return now.tv_sec + fraction;
}
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
int main()
|
86
|
-
{
|
87
|
-
double t0 = elapsed();
|
88
|
-
|
89
|
-
// this is typically the fastest one.
|
90
|
-
const char *index_key = "IVF4096,Flat";
|
91
|
-
|
92
|
-
// these ones have better memory usage
|
93
|
-
// const char *index_key = "Flat";
|
94
|
-
// const char *index_key = "PQ32";
|
95
|
-
// const char *index_key = "PCA80,Flat";
|
96
|
-
// const char *index_key = "IVF4096,PQ8+16";
|
97
|
-
// const char *index_key = "IVF4096,PQ32";
|
98
|
-
// const char *index_key = "IMI2x8,PQ32";
|
99
|
-
// const char *index_key = "IMI2x8,PQ8+16";
|
100
|
-
// const char *index_key = "OPQ16_64,IMI2x8,PQ8+16";
|
101
|
-
|
102
|
-
faiss::Index * index;
|
103
|
-
|
104
|
-
size_t d;
|
105
|
-
|
106
|
-
{
|
107
|
-
printf ("[%.3f s] Loading train set\n", elapsed() - t0);
|
108
|
-
|
109
|
-
size_t nt;
|
110
|
-
float *xt = fvecs_read("sift1M/sift_learn.fvecs", &d, &nt);
|
111
|
-
|
112
|
-
printf ("[%.3f s] Preparing index \"%s\" d=%ld\n",
|
113
|
-
elapsed() - t0, index_key, d);
|
114
|
-
index = faiss::index_factory(d, index_key);
|
115
|
-
|
116
|
-
printf ("[%.3f s] Training on %ld vectors\n", elapsed() - t0, nt);
|
117
|
-
|
118
|
-
index->train(nt, xt);
|
119
|
-
delete [] xt;
|
120
|
-
}
|
121
|
-
|
122
|
-
|
123
|
-
{
|
124
|
-
printf ("[%.3f s] Loading database\n", elapsed() - t0);
|
125
|
-
|
126
|
-
size_t nb, d2;
|
127
|
-
float *xb = fvecs_read("sift1M/sift_base.fvecs", &d2, &nb);
|
128
|
-
assert(d == d2 || !"dataset does not have same dimension as train set");
|
129
|
-
|
130
|
-
printf ("[%.3f s] Indexing database, size %ld*%ld\n",
|
131
|
-
elapsed() - t0, nb, d);
|
132
|
-
|
133
|
-
index->add(nb, xb);
|
134
|
-
|
135
|
-
delete [] xb;
|
136
|
-
}
|
137
|
-
|
138
|
-
size_t nq;
|
139
|
-
float *xq;
|
140
|
-
|
141
|
-
{
|
142
|
-
printf ("[%.3f s] Loading queries\n", elapsed() - t0);
|
143
|
-
|
144
|
-
size_t d2;
|
145
|
-
xq = fvecs_read("sift1M/sift_query.fvecs", &d2, &nq);
|
146
|
-
assert(d == d2 || !"query does not have same dimension as train set");
|
147
|
-
|
148
|
-
}
|
149
|
-
|
150
|
-
size_t k; // nb of results per query in the GT
|
151
|
-
faiss::Index::idx_t *gt; // nq * k matrix of ground-truth nearest-neighbors
|
152
|
-
|
153
|
-
{
|
154
|
-
printf ("[%.3f s] Loading ground truth for %ld queries\n",
|
155
|
-
elapsed() - t0, nq);
|
156
|
-
|
157
|
-
// load ground-truth and convert int to long
|
158
|
-
size_t nq2;
|
159
|
-
int *gt_int = ivecs_read("sift1M/sift_groundtruth.ivecs", &k, &nq2);
|
160
|
-
assert(nq2 == nq || !"incorrect nb of ground truth entries");
|
161
|
-
|
162
|
-
gt = new faiss::Index::idx_t[k * nq];
|
163
|
-
for(int i = 0; i < k * nq; i++) {
|
164
|
-
gt[i] = gt_int[i];
|
165
|
-
}
|
166
|
-
delete [] gt_int;
|
167
|
-
}
|
168
|
-
|
169
|
-
// Result of the auto-tuning
|
170
|
-
std::string selected_params;
|
171
|
-
|
172
|
-
{ // run auto-tuning
|
173
|
-
|
174
|
-
printf ("[%.3f s] Preparing auto-tune criterion 1-recall at 1 "
|
175
|
-
"criterion, with k=%ld nq=%ld\n", elapsed() - t0, k, nq);
|
176
|
-
|
177
|
-
faiss::OneRecallAtRCriterion crit(nq, 1);
|
178
|
-
crit.set_groundtruth (k, nullptr, gt);
|
179
|
-
crit.nnn = k; // by default, the criterion will request only 1 NN
|
180
|
-
|
181
|
-
printf ("[%.3f s] Preparing auto-tune parameters\n", elapsed() - t0);
|
182
|
-
|
183
|
-
faiss::ParameterSpace params;
|
184
|
-
params.initialize(index);
|
185
|
-
|
186
|
-
printf ("[%.3f s] Auto-tuning over %ld parameters (%ld combinations)\n",
|
187
|
-
elapsed() - t0, params.parameter_ranges.size(),
|
188
|
-
params.n_combinations());
|
189
|
-
|
190
|
-
faiss::OperatingPoints ops;
|
191
|
-
params.explore (index, nq, xq, crit, &ops);
|
192
|
-
|
193
|
-
printf ("[%.3f s] Found the following operating points: \n",
|
194
|
-
elapsed() - t0);
|
195
|
-
|
196
|
-
ops.display ();
|
197
|
-
|
198
|
-
// keep the first parameter that obtains > 0.5 1-recall@1
|
199
|
-
for (int i = 0; i < ops.optimal_pts.size(); i++) {
|
200
|
-
if (ops.optimal_pts[i].perf > 0.5) {
|
201
|
-
selected_params = ops.optimal_pts[i].key;
|
202
|
-
break;
|
203
|
-
}
|
204
|
-
}
|
205
|
-
assert (selected_params.size() >= 0 ||
|
206
|
-
!"could not find good enough op point");
|
207
|
-
}
|
208
|
-
|
209
|
-
|
210
|
-
{ // Use the found configuration to perform a search
|
211
|
-
|
212
|
-
faiss::ParameterSpace params;
|
213
|
-
|
214
|
-
printf ("[%.3f s] Setting parameter configuration \"%s\" on index\n",
|
215
|
-
elapsed() - t0, selected_params.c_str());
|
216
|
-
|
217
|
-
params.set_index_parameters (index, selected_params.c_str());
|
218
|
-
|
219
|
-
printf ("[%.3f s] Perform a search on %ld queries\n",
|
220
|
-
elapsed() - t0, nq);
|
221
|
-
|
222
|
-
// output buffers
|
223
|
-
faiss::Index::idx_t *I = new faiss::Index::idx_t[nq * k];
|
224
|
-
float *D = new float[nq * k];
|
225
|
-
|
226
|
-
index->search(nq, xq, k, D, I);
|
227
|
-
|
228
|
-
printf ("[%.3f s] Compute recalls\n", elapsed() - t0);
|
229
|
-
|
230
|
-
// evaluate result by hand.
|
231
|
-
int n_1 = 0, n_10 = 0, n_100 = 0;
|
232
|
-
for(int i = 0; i < nq; i++) {
|
233
|
-
int gt_nn = gt[i * k];
|
234
|
-
for(int j = 0; j < k; j++) {
|
235
|
-
if (I[i * k + j] == gt_nn) {
|
236
|
-
if(j < 1) n_1++;
|
237
|
-
if(j < 10) n_10++;
|
238
|
-
if(j < 100) n_100++;
|
239
|
-
}
|
240
|
-
}
|
241
|
-
}
|
242
|
-
printf("R@1 = %.4f\n", n_1 / float(nq));
|
243
|
-
printf("R@10 = %.4f\n", n_10 / float(nq));
|
244
|
-
printf("R@100 = %.4f\n", n_100 / float(nq));
|
245
|
-
|
246
|
-
}
|
247
|
-
|
248
|
-
delete [] xq;
|
249
|
-
delete [] gt;
|
250
|
-
delete index;
|
251
|
-
return 0;
|
252
|
-
}
|
@@ -1,185 +0,0 @@
|
|
1
|
-
/**
|
2
|
-
* Copyright (c) Facebook, Inc. and its affiliates.
|
3
|
-
*
|
4
|
-
* This source code is licensed under the MIT license found in the
|
5
|
-
* LICENSE file in the root directory of this source tree.
|
6
|
-
*/
|
7
|
-
|
8
|
-
#include <cstdio>
|
9
|
-
#include <cstdlib>
|
10
|
-
|
11
|
-
#include <faiss/Clustering.h>
|
12
|
-
#include <faiss/utils/random.h>
|
13
|
-
#include <faiss/utils/distances.h>
|
14
|
-
#include <faiss/IndexFlat.h>
|
15
|
-
#include <faiss/IndexHNSW.h>
|
16
|
-
|
17
|
-
|
18
|
-
namespace {
|
19
|
-
|
20
|
-
|
21
|
-
/// Selects which index weighted_kmeans_clustering() uses for assignment.
/// The values are consecutive from 0 so that callers can iterate from
/// WKMT_FlatL2 to WKMT_HNSW or cast an int to this type.
enum WeightedKMeansType {
    WKMT_FlatL2 = 0,           // IndexFlatL2
    WKMT_FlatIP = 1,           // IndexFlatIP
    WKMT_FlatIP_spherical = 2, // IndexFlatIP with spherical clustering
    WKMT_HNSW = 3,             // IndexHNSWFlat
};
|
27
|
-
|
28
|
-
|
29
|
-
float weighted_kmeans_clustering (size_t d, size_t n, size_t k,
|
30
|
-
const float *input,
|
31
|
-
const float *weights,
|
32
|
-
float *centroids,
|
33
|
-
WeightedKMeansType index_num)
|
34
|
-
{
|
35
|
-
using namespace faiss;
|
36
|
-
Clustering clus (d, k);
|
37
|
-
clus.verbose = true;
|
38
|
-
|
39
|
-
std::unique_ptr<Index> index;
|
40
|
-
|
41
|
-
switch (index_num) {
|
42
|
-
case WKMT_FlatL2:
|
43
|
-
index.reset(new IndexFlatL2 (d));
|
44
|
-
break;
|
45
|
-
case WKMT_FlatIP:
|
46
|
-
index.reset(new IndexFlatIP (d));
|
47
|
-
break;
|
48
|
-
case WKMT_FlatIP_spherical:
|
49
|
-
index.reset(new IndexFlatIP (d));
|
50
|
-
clus.spherical = true;
|
51
|
-
break;
|
52
|
-
case WKMT_HNSW:
|
53
|
-
IndexHNSWFlat *ihnsw = new IndexHNSWFlat (d, 32);
|
54
|
-
ihnsw->hnsw.efSearch = 128;
|
55
|
-
index.reset(ihnsw);
|
56
|
-
break;
|
57
|
-
}
|
58
|
-
|
59
|
-
clus.train(n, input, *index.get(), weights);
|
60
|
-
// on output the index contains the centroids.
|
61
|
-
memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k);
|
62
|
-
return clus.iteration_stats.back().obj;
|
63
|
-
}
|
64
|
-
|
65
|
-
|
66
|
-
// dimension of the generated vectors
int d = 32;
// scale applied to the random offset of each point around its center
float sigma = 0.1;

#define BIGTEST

// nc      = number of centers in each of the two groups (2 * nc in total)
// n_big   = points generated around each center of the first group
// n_small = points generated around each center of the second group
#ifndef BIGTEST
int nc = 5;
int n_big = 100;
int n_small = 10;
#else
// the production setup = setting of https://fb.quip.com/CWgnAAYbwtgs
int nc = 200000;
int n_big = 4;
int n_small = 2;
#endif

// number of training points, set by generate_trainset()
int n = 0;
|
83
|
-
|
84
|
-
void generate_trainset (std::vector<float> & ccent,
|
85
|
-
std::vector<float> & x,
|
86
|
-
std::vector<float> & weights)
|
87
|
-
{
|
88
|
-
// same sampling as test_build_blocks.py test_weighted
|
89
|
-
|
90
|
-
ccent.resize (d * 2 * nc);
|
91
|
-
faiss::float_randn (ccent.data(), d * 2 * nc, 123);
|
92
|
-
faiss::fvec_renorm_L2 (d, 2 * nc, ccent.data());
|
93
|
-
n = nc * n_big + nc * n_small;
|
94
|
-
x.resize(d * n);
|
95
|
-
weights.resize(n);
|
96
|
-
faiss::float_randn (x.data(), x.size(), 1234);
|
97
|
-
|
98
|
-
float *xi = x.data();
|
99
|
-
float *w = weights.data();
|
100
|
-
for (int ci = 0; ci < nc * 2; ci++) { // loop over centroids
|
101
|
-
int np = ci < nc ? n_big : n_small; // nb of points around this centroid
|
102
|
-
for (int i = 0; i < np; i++) {
|
103
|
-
for (int j = 0; j < d; j++) {
|
104
|
-
xi[j] = xi[j] * sigma + ccent[ci * d + j];
|
105
|
-
}
|
106
|
-
*w++ = ci < nc ? 0.1 : 10;
|
107
|
-
xi += d;
|
108
|
-
}
|
109
|
-
}
|
110
|
-
}
|
111
|
-
|
112
|
-
}
|
113
|
-
|
114
|
-
|
115
|
-
int main(int argc, char **argv) {
|
116
|
-
std::vector<float> ccent;
|
117
|
-
std::vector<float> x;
|
118
|
-
std::vector<float> weights;
|
119
|
-
|
120
|
-
printf("generate training set\n");
|
121
|
-
generate_trainset(ccent, x, weights);
|
122
|
-
|
123
|
-
std::vector<float> centroids;
|
124
|
-
centroids.resize(nc * d);
|
125
|
-
|
126
|
-
int the_index_num = -1;
|
127
|
-
int the_with_weights = -1;
|
128
|
-
|
129
|
-
if (argc == 3) {
|
130
|
-
the_index_num = atoi(argv[1]);
|
131
|
-
the_with_weights = atoi(argv[2]);
|
132
|
-
}
|
133
|
-
|
134
|
-
|
135
|
-
for (int index_num = WKMT_FlatL2;
|
136
|
-
index_num <= WKMT_HNSW;
|
137
|
-
index_num++) {
|
138
|
-
|
139
|
-
if (the_index_num >= 0 && index_num != the_index_num) {
|
140
|
-
continue;
|
141
|
-
}
|
142
|
-
|
143
|
-
for (int with_weights = 0; with_weights <= 1; with_weights++) {
|
144
|
-
if (the_with_weights >= 0 && with_weights != the_with_weights) {
|
145
|
-
continue;
|
146
|
-
}
|
147
|
-
|
148
|
-
printf("=================== index_num=%d Run %s weights\n",
|
149
|
-
index_num, with_weights ? "with" : "without");
|
150
|
-
|
151
|
-
weighted_kmeans_clustering (
|
152
|
-
d, n, nc, x.data(),
|
153
|
-
with_weights ? weights.data() : nullptr,
|
154
|
-
centroids.data(), (WeightedKMeansType)index_num
|
155
|
-
);
|
156
|
-
|
157
|
-
{ // compute distance of points to centroids
|
158
|
-
faiss::IndexFlatL2 cent_index(d);
|
159
|
-
cent_index.add(nc, centroids.data());
|
160
|
-
std::vector<float> dis (n);
|
161
|
-
std::vector<faiss::Index::idx_t> idx (n);
|
162
|
-
|
163
|
-
cent_index.search (nc * 2, ccent.data(), 1,
|
164
|
-
dis.data(), idx.data());
|
165
|
-
|
166
|
-
float dis1 = 0, dis2 = 0;
|
167
|
-
for (int i = 0; i < nc ; i++) {
|
168
|
-
dis1 += dis[i];
|
169
|
-
}
|
170
|
-
printf("average distance of points from big clusters: %g\n",
|
171
|
-
dis1 / nc);
|
172
|
-
|
173
|
-
for (int i = 0; i < nc ; i++) {
|
174
|
-
dis2 += dis[i + nc];
|
175
|
-
}
|
176
|
-
|
177
|
-
printf("average distance of points from small clusters: %g\n",
|
178
|
-
dis2 / nc);
|
179
|
-
|
180
|
-
}
|
181
|
-
|
182
|
-
}
|
183
|
-
}
|
184
|
-
return 0;
|
185
|
-
}
|