faiss 0.3.1 → 0.3.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.h +1 -1
- data/vendor/faiss/faiss/Clustering.cpp +35 -4
- data/vendor/faiss/faiss/Clustering.h +10 -1
- data/vendor/faiss/faiss/IVFlib.cpp +4 -1
- data/vendor/faiss/faiss/Index.h +21 -6
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -1
- data/vendor/faiss/faiss/IndexFastScan.cpp +22 -4
- data/vendor/faiss/faiss/IndexFlat.cpp +11 -7
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +159 -5
- data/vendor/faiss/faiss/IndexFlatCodes.h +20 -3
- data/vendor/faiss/faiss/IndexHNSW.cpp +143 -90
- data/vendor/faiss/faiss/IndexHNSW.h +52 -3
- data/vendor/faiss/faiss/IndexIVF.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVF.h +9 -1
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +15 -0
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +3 -0
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +130 -57
- data/vendor/faiss/faiss/IndexIVFFastScan.h +14 -7
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +1 -3
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +21 -2
- data/vendor/faiss/faiss/IndexLattice.cpp +1 -19
- data/vendor/faiss/faiss/IndexLattice.h +3 -22
- data/vendor/faiss/faiss/IndexNNDescent.cpp +0 -29
- data/vendor/faiss/faiss/IndexNNDescent.h +1 -1
- data/vendor/faiss/faiss/IndexNSG.h +1 -1
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +56 -0
- data/vendor/faiss/faiss/IndexNeuralNetCodec.h +49 -0
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRefine.cpp +5 -5
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +3 -1
- data/vendor/faiss/faiss/MetricType.h +7 -2
- data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +95 -17
- data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +152 -0
- data/vendor/faiss/faiss/cppcontrib/factory_tools.h +24 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +83 -30
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +36 -4
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +6 -0
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuIndex.h +2 -8
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +282 -0
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +6 -0
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +2 -0
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +25 -0
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +26 -21
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +6 -0
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +8 -5
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +65 -0
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +4 -1
- data/vendor/faiss/faiss/gpu/utils/Timer.h +1 -1
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +25 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +9 -1
- data/vendor/faiss/faiss/impl/DistanceComputer.h +46 -0
- data/vendor/faiss/faiss/impl/FaissAssert.h +4 -2
- data/vendor/faiss/faiss/impl/HNSW.cpp +358 -190
- data/vendor/faiss/faiss/impl/HNSW.h +43 -22
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +8 -8
- data/vendor/faiss/faiss/impl/LookupTableScaler.h +34 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +13 -8
- data/vendor/faiss/faiss/impl/NSG.cpp +0 -29
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +1 -0
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +5 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +151 -32
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +719 -102
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +3 -0
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +5 -0
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx512.h +248 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +29 -15
- data/vendor/faiss/faiss/impl/index_read_utils.h +37 -0
- data/vendor/faiss/faiss/impl/index_write.cpp +28 -10
- data/vendor/faiss/faiss/impl/io.cpp +13 -5
- data/vendor/faiss/faiss/impl/io.h +4 -4
- data/vendor/faiss/faiss/impl/io_macros.h +6 -0
- data/vendor/faiss/faiss/impl/platform_macros.h +22 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +11 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +1 -1
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +448 -1
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +5 -5
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +143 -59
- data/vendor/faiss/faiss/index_factory.cpp +31 -13
- data/vendor/faiss/faiss/index_io.h +12 -5
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +28 -8
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +3 -0
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +9 -1
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +55 -17
- data/vendor/faiss/faiss/invlists/InvertedLists.h +18 -9
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +21 -6
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +2 -1
- data/vendor/faiss/faiss/python/python_callbacks.cpp +3 -3
- data/vendor/faiss/faiss/utils/Heap.h +105 -0
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +342 -0
- data/vendor/faiss/faiss/utils/NeuralNet.h +147 -0
- data/vendor/faiss/faiss/utils/bf16.h +36 -0
- data/vendor/faiss/faiss/utils/distances.cpp +58 -88
- data/vendor/faiss/faiss/utils/distances.h +5 -5
- data/vendor/faiss/faiss/utils/distances_simd.cpp +997 -9
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +70 -0
- data/vendor/faiss/faiss/utils/extra_distances.cpp +85 -137
- data/vendor/faiss/faiss/utils/extra_distances.h +3 -2
- data/vendor/faiss/faiss/utils/hamming.cpp +1 -1
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +4 -1
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +2 -1
- data/vendor/faiss/faiss/utils/random.cpp +43 -0
- data/vendor/faiss/faiss/utils/random.h +25 -0
- data/vendor/faiss/faiss/utils/simdlib.h +10 -1
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +296 -0
- data/vendor/faiss/faiss/utils/simdlib_neon.h +5 -2
- data/vendor/faiss/faiss/utils/simdlib_ppc64.h +1084 -0
- data/vendor/faiss/faiss/utils/transpose/transpose-avx512-inl.h +176 -0
- data/vendor/faiss/faiss/utils/utils.cpp +10 -3
- data/vendor/faiss/faiss/utils/utils.h +3 -0
- metadata +16 -4
- data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +0 -102
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h

@@ -12,10 +12,19 @@
 #include <cstdint>
 
 #include <faiss/cppcontrib/detail/CoarseBitType.h>
+#include <faiss/impl/platform_macros.h>
 
 namespace faiss {
 namespace cppcontrib {
 
+bool isBigEndian() {
+#ifdef FAISS_BIG_ENDIAN
+    return true;
+#else
+    return false;
+#endif
+}
+
 ////////////////////////////////////////////////////////////////////////////////////
 /// Index2LevelDecoder
 ////////////////////////////////////////////////////////////////////////////////////
@@ -72,9 +81,14 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode = coarse[coarseCentroidIdx];
-        const intptr_t fineCode = fine[fineCentroidIdx];
+        intptr_t coarseCode, fineCode;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode = Swap2Bytes(coarse[coarseCentroidIdx]);
+            fineCode = Swap2Bytes(fine[fineCentroidIdx]);
+        } else {
+            coarseCode = coarse[coarseCentroidIdx];
+            fineCode = fine[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr = pqCoarseCentroids +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode) *
@@ -112,9 +126,14 @@ struct Index2LevelDecoder {
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
 
-        const intptr_t coarseCode = coarse[coarseCentroidIdx];
-        const intptr_t fineCode = fine[fineCentroidIdx];
-
+        intptr_t coarseCode, fineCode;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode = Swap2Bytes(coarse[coarseCentroidIdx]);
+            fineCode = Swap2Bytes(fine[fineCentroidIdx]);
+        } else {
+            coarseCode = coarse[coarseCentroidIdx];
+            fineCode = fine[fineCentroidIdx];
+        }
         const float* const __restrict coarsePtr = pqCoarseCentroids +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode) *
                 COARSE_SIZE +
@@ -162,11 +181,18 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-        const intptr_t fineCode0 = fine0[fineCentroidIdx];
-        const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-        const intptr_t fineCode1 = fine1[fineCentroidIdx];
+        intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+            fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+            coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+            fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+        } else {
+            coarseCode0 = coarse0[coarseCentroidIdx];
+            fineCode0 = fine0[fineCentroidIdx];
+            coarseCode1 = coarse1[coarseCentroidIdx];
+            fineCode1 = fine1[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr0 = pqCoarseCentroids0 +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
@@ -222,11 +248,18 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-        const intptr_t fineCode0 = fine0[fineCentroidIdx];
-        const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-        const intptr_t fineCode1 = fine1[fineCentroidIdx];
+        intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+            fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+            coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+            fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+        } else {
+            coarseCode0 = coarse0[coarseCentroidIdx];
+            fineCode0 = fine0[fineCentroidIdx];
+            coarseCode1 = coarse1[coarseCentroidIdx];
+            fineCode1 = fine1[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr0 = pqCoarseCentroids +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
@@ -292,13 +325,23 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-        const intptr_t fineCode0 = fine0[fineCentroidIdx];
-        const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-        const intptr_t fineCode1 = fine1[fineCentroidIdx];
-        const intptr_t coarseCode2 = coarse2[coarseCentroidIdx];
-        const intptr_t fineCode2 = fine2[fineCentroidIdx];
+        intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1;
+        intptr_t coarseCode2, fineCode2;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+            fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+            coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+            fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+            coarseCode2 = Swap2Bytes(coarse2[coarseCentroidIdx]);
+            fineCode2 = Swap2Bytes(fine2[fineCentroidIdx]);
+        } else {
+            coarseCode0 = coarse0[coarseCentroidIdx];
+            fineCode0 = fine0[fineCentroidIdx];
+            coarseCode1 = coarse1[coarseCentroidIdx];
+            fineCode1 = fine1[fineCentroidIdx];
+            coarseCode2 = coarse2[coarseCentroidIdx];
+            fineCode2 = fine2[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr0 = pqCoarseCentroids0 +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
@@ -369,13 +412,23 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-        const intptr_t fineCode0 = fine0[fineCentroidIdx];
-        const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-        const intptr_t fineCode1 = fine1[fineCentroidIdx];
-        const intptr_t coarseCode2 = coarse2[coarseCentroidIdx];
-        const intptr_t fineCode2 = fine2[fineCentroidIdx];
+        intptr_t coarseCode0, fineCode0, coarseCode1, fineCode1;
+        intptr_t coarseCode2, fineCode2;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+            fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+            coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+            fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+            coarseCode2 = Swap2Bytes(coarse2[coarseCentroidIdx]);
+            fineCode2 = Swap2Bytes(fine2[fineCentroidIdx]);
+        } else {
+            coarseCode0 = coarse0[coarseCentroidIdx];
+            fineCode0 = fine0[fineCentroidIdx];
+            coarseCode1 = coarse1[coarseCentroidIdx];
+            fineCode1 = fine1[fineCentroidIdx];
+            coarseCode2 = coarse2[coarseCentroidIdx];
+            fineCode2 = fine2[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr0 = pqCoarseCentroids +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
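The decoder change above boils down to one rule: 2-byte PQ codes are stored little-endian, so a big-endian host must byte-swap them before indexing the codebook tables. Below is a minimal, self-contained sketch of that rule; `FAISS_BIG_ENDIAN` and `Swap2Bytes` belong to `faiss/impl/platform_macros.h` pulled in by the diff, while the `Demo`-suffixed names here are illustrative stand-ins rather than faiss APIs.

```cpp
#include <cstdint>
#include <cstdio>

// Stand-ins for the FAISS_BIG_ENDIAN / Swap2Bytes machinery from
// faiss/impl/platform_macros.h (names here are for illustration only).
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define DEMO_BIG_ENDIAN
#endif

inline bool isBigEndianDemo() {
#ifdef DEMO_BIG_ENDIAN
    return true;
#else
    return false;
#endif
}

inline uint16_t swap2BytesDemo(uint16_t v) {
    return static_cast<uint16_t>((v >> 8) | (v << 8));
}

int main() {
    // A 16-bit PQ code as serialized on a little-endian machine.
    uint16_t stored = 0x0102;
    // On a big-endian reader the bytes must be swapped before use,
    // mirroring what the decoder now does for 2-byte coarse/fine codes.
    uint16_t code = isBigEndianDemo() ? swap2BytesDemo(stored) : stored;
    std::printf("decoded code: 0x%04x\n", code);
    return 0;
}
```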
data/vendor/faiss/faiss/gpu/GpuCloner.cpp

@@ -14,6 +14,9 @@
 
 #include <faiss/IndexBinaryFlat.h>
 #include <faiss/IndexFlat.h>
+#if defined USE_NVIDIA_RAFT
+#include <faiss/IndexHNSW.h>
+#endif
 #include <faiss/IndexIVF.h>
 #include <faiss/IndexIVFFlat.h>
 #include <faiss/IndexIVFPQ.h>
@@ -24,6 +27,9 @@
 #include <faiss/MetaIndexes.h>
 #include <faiss/gpu/GpuIndex.h>
 #include <faiss/gpu/GpuIndexBinaryFlat.h>
+#if defined USE_NVIDIA_RAFT
+#include <faiss/gpu/GpuIndexCagra.h>
+#endif
 #include <faiss/gpu/GpuIndexFlat.h>
 #include <faiss/gpu/GpuIndexIVFFlat.h>
 #include <faiss/gpu/GpuIndexIVFPQ.h>
@@ -85,7 +91,15 @@ Index* ToCPUCloner::clone_Index(const Index* index) {
         // objective is to make a single component out of them
         // (inverse op of ToGpuClonerMultiple)
 
-    } else if (auto ish = dynamic_cast<const IndexShards*>(index)) {
+    }
+#if defined USE_NVIDIA_RAFT
+    else if (auto icg = dynamic_cast<const GpuIndexCagra*>(index)) {
+        IndexHNSWCagra* res = new IndexHNSWCagra();
+        icg->copyTo(res);
+        return res;
+    }
+#endif
+    else if (auto ish = dynamic_cast<const IndexShards*>(index)) {
         int nshard = ish->count();
         FAISS_ASSERT(nshard > 0);
         Index* res = clone_Index(ish->at(0));
@@ -153,6 +167,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         config.indicesOptions = indicesOptions;
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
         config.use_raft = use_raft;
+        config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer;
 
         GpuIndexIVFFlat* res = new GpuIndexIVFFlat(
                 provider, ifl->d, ifl->nlist, ifl->metric_type, config);
@@ -205,6 +220,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         config.usePrecomputedTables = usePrecomputed;
         config.use_raft = use_raft;
         config.interleavedLayout = use_raft;
+        config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer;
 
         GpuIndexIVFPQ* res = new GpuIndexIVFPQ(provider, ipq, config);
 
@@ -213,9 +229,25 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         }
 
         return res;
-    } else {
-        // use CPU cloner for IDMap and PreTransform
-        return Cloner::clone_Index(index);
+    }
+#if defined USE_NVIDIA_RAFT
+    else if (auto icg = dynamic_cast<const faiss::IndexHNSWCagra*>(index)) {
+        GpuIndexCagraConfig config;
+        config.device = device;
+        GpuIndexCagra* res =
+                new GpuIndexCagra(provider, icg->d, icg->metric_type, config);
+        res->copyFrom(icg);
+        return res;
+    }
+#endif
+    else {
+        // use CPU cloner for IDMap and PreTransform
+        auto index_idmap = dynamic_cast<const IndexIDMap*>(index);
+        auto index_pt = dynamic_cast<const IndexPreTransform*>(index);
+        if (index_idmap || index_pt) {
+            return Cloner::clone_Index(index);
+        }
+        FAISS_THROW_MSG("This index type is not implemented on GPU.");
     }
 }
 
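Combined with the `GpuIndexCagra` header further down, these cloner branches mean `faiss::gpu::index_gpu_to_cpu` can now turn a GPU CAGRA index into a CPU `IndexHNSWCagra` (and `index_cpu_to_gpu` the reverse). A small sketch of the GPU-to-CPU direction follows; it assumes a RAFT-enabled build and an already built `gpu_index`, and `index_gpu_to_cpu` itself comes from the existing `faiss/gpu/GpuCloner.h` API rather than this diff.

```cpp
// Sketch only: assumes faiss was built with USE_NVIDIA_RAFT and that
// `gpu_index` has already been trained/built elsewhere.
#include <faiss/IndexHNSW.h>
#include <faiss/gpu/GpuCloner.h>
#include <faiss/gpu/GpuIndexCagra.h>

#include <memory>

std::unique_ptr<faiss::IndexHNSWCagra> cagra_to_cpu(
        const faiss::gpu::GpuIndexCagra& gpu_index) {
    // The new ToCPUCloner branch recognizes GpuIndexCagra and emits an
    // IndexHNSWCagra, so the cast below is expected to succeed.
    std::unique_ptr<faiss::Index> cpu(
            faiss::gpu::index_gpu_to_cpu(&gpu_index));
    if (dynamic_cast<faiss::IndexHNSWCagra*>(cpu.get()) == nullptr) {
        return nullptr; // not a CAGRA-backed result; should not happen here
    }
    return std::unique_ptr<faiss::IndexHNSWCagra>(
            static_cast<faiss::IndexHNSWCagra*>(cpu.release()));
}
```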
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h

@@ -43,6 +43,12 @@ struct GpuClonerOptions {
 #else
     bool use_raft = false;
 #endif
+
+    /// This flag controls the CPU fallback logic for coarse quantizer
+    /// component of the index. When set to false (default), the cloner will
+    /// throw an exception for indices not implemented on GPU. When set to
+    /// true, it will fallback to a CPU implementation.
+    bool allowCpuCoarseQuantizer = false;
 };
 
 struct GpuMultipleClonerOptions : public GpuClonerOptions {
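A brief sketch of how the new `allowCpuCoarseQuantizer` flag would be passed when cloning a CPU IVF index to the GPU. The cloning entry point `index_cpu_to_gpu`, `StandardGpuResources`, and the CPU index types are standard faiss API assumed from the rest of the library, not part of this diff; sizes and the device number are illustrative. With a flat quantizer the flag has no effect; per the doc comment above, it matters when the coarse quantizer type has no GPU implementation.

```cpp
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/gpu/GpuCloner.h>
#include <faiss/gpu/GpuClonerOptions.h>
#include <faiss/gpu/StandardGpuResources.h>

#include <memory>
#include <random>
#include <vector>

int main() {
    int d = 64, nlist = 256;
    faiss::idx_t nt = 10000;

    // Train a plain CPU IVF-Flat index on random data.
    std::vector<float> xt(nt * d);
    std::mt19937 rng(42);
    std::uniform_real_distribution<float> dist(0.f, 1.f);
    for (auto& v : xt) v = dist(rng);

    faiss::IndexFlatL2 quantizer(d);
    faiss::IndexIVFFlat cpu_index(&quantizer, d, nlist);
    cpu_index.train(nt, xt.data());

    // Clone to GPU; ask the cloner to fall back to a CPU coarse quantizer
    // instead of throwing when its type is not implemented on GPU.
    faiss::gpu::StandardGpuResources res;
    faiss::gpu::GpuClonerOptions opts;
    opts.allowCpuCoarseQuantizer = true;

    std::unique_ptr<faiss::Index> gpu_index(faiss::gpu::index_cpu_to_gpu(
            &res, /*device=*/0, &cpu_index, &opts));
    return 0;
}
```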
data/vendor/faiss/faiss/gpu/GpuIndex.h

@@ -84,19 +84,14 @@ class GpuIndex : public faiss::Index {
 
     /// `x` and `labels` can be resident on the CPU or any GPU; copies are
     /// performed as needed
-    void assign(
-            idx_t n,
-            const float* x,
-            idx_t* labels,
-            // faiss::Index has idx_t for k
-            idx_t k = 1) const override;
+    void assign(idx_t n, const float* x, idx_t* labels, idx_t k = 1)
+            const override;
 
     /// `x`, `distances` and `labels` can be resident on the CPU or any
     /// GPU; copies are performed as needed
     void search(
             idx_t n,
             const float* x,
-            // faiss::Index has idx_t for k
             idx_t k,
             float* distances,
             idx_t* labels,
@@ -107,7 +102,6 @@ class GpuIndex : public faiss::Index {
     void search_and_reconstruct(
             idx_t n,
             const float* x,
-            // faiss::Index has idx_t for k
             idx_t k,
             float* distances,
             idx_t* labels,
data/vendor/faiss/faiss/gpu/GpuIndexCagra.h (new file)

@@ -0,0 +1,282 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <faiss/IndexIVF.h>
+#include <faiss/gpu/GpuIndex.h>
+#include <faiss/gpu/GpuIndexIVFPQ.h>
+
+namespace faiss {
+struct IndexHNSWCagra;
+}
+
+namespace faiss {
+namespace gpu {
+
+class RaftCagra;
+
+enum class graph_build_algo {
+    /// Use IVF-PQ to build all-neighbors knn graph
+    IVF_PQ,
+    /// Experimental, use NN-Descent to build all-neighbors knn graph
+    NN_DESCENT
+};
+
+/// A type for specifying how PQ codebooks are created.
+enum class codebook_gen { // NOLINT
+    PER_SUBSPACE = 0, // NOLINT
+    PER_CLUSTER = 1,  // NOLINT
+};
+
+struct IVFPQBuildCagraConfig {
+    ///
+    /// The number of inverted lists (clusters)
+    ///
+    /// Hint: the number of vectors per cluster (`n_rows/n_lists`) should be
+    /// approximately 1,000 to 10,000.
+
+    uint32_t n_lists = 1024;
+    /// The number of iterations searching for kmeans centers (index building).
+    uint32_t kmeans_n_iters = 20;
+    /// The fraction of data to use during iterative kmeans building.
+    double kmeans_trainset_fraction = 0.5;
+    ///
+    /// The bit length of the vector element after compression by PQ.
+    ///
+    /// Possible values: [4, 5, 6, 7, 8].
+    ///
+    /// Hint: the smaller the 'pq_bits', the smaller the index size and the
+    /// better the search performance, but the lower the recall.
+
+    uint32_t pq_bits = 8;
+    ///
+    /// The dimensionality of the vector after compression by PQ. When zero, an
+    /// optimal value is selected using a heuristic.
+    ///
+    /// NB: `pq_dim /// pq_bits` must be a multiple of 8.
+    ///
+    /// Hint: a smaller 'pq_dim' results in a smaller index size and better
+    /// search performance, but lower recall. If 'pq_bits' is 8, 'pq_dim' can be
+    /// set to any number, but multiple of 8 are desirable for good performance.
+    /// If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8. For good
+    /// performance, it is desirable that 'pq_dim' is a multiple of 32. Ideally,
+    /// 'pq_dim' should be also a divisor of the dataset dim.
+
+    uint32_t pq_dim = 0;
+    /// How PQ codebooks are created.
+    codebook_gen codebook_kind = codebook_gen::PER_SUBSPACE;
+    ///
+    /// Apply a random rotation matrix on the input data and queries even if
+    /// `dim % pq_dim == 0`.
+    ///
+    /// Note: if `dim` is not multiple of `pq_dim`, a random rotation is always
+    /// applied to the input data and queries to transform the working space
+    /// from `dim` to `rot_dim`, which may be slightly larger than the original
+    /// space and and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`).
+    /// However, this transform is not necessary when `dim` is multiple of
+    /// `pq_dim`
+    /// (`dim == rot_dim`, hence no need in adding "extra" data columns /
+    /// features).
+    ///
+    /// By default, if `dim == rot_dim`, the rotation transform is initialized
+    /// with the identity matrix. When `force_random_rotation == true`, a random
+    /// orthogonal transform matrix is generated regardless of the values of
+    /// `dim` and `pq_dim`.
+
+    bool force_random_rotation = false;
+    ///
+    /// By default, the algorithm allocates more space than necessary for
+    /// individual clusters
+    /// (`list_data`). This allows to amortize the cost of memory allocation and
+    /// reduce the number of data copies during repeated calls to `extend`
+    /// (extending the database).
+    ///
+    /// The alternative is the conservative allocation behavior; when enabled,
+    /// the algorithm always allocates the minimum amount of memory required to
+    /// store the given number of records. Set this flag to `true` if you prefer
+    /// to use as little GPU memory for the database as possible.
+
+    bool conservative_memory_allocation = false;
+};
+
+struct IVFPQSearchCagraConfig {
+    /// The number of clusters to search.
+    uint32_t n_probes = 20;
+    ///
+    /// Data type of look up table to be created dynamically at search time.
+    ///
+    /// Possible values: [CUDA_R_32F, CUDA_R_16F, CUDA_R_8U]
+    ///
+    /// The use of low-precision types reduces the amount of shared memory
+    /// required at search time, so fast shared memory kernels can be used even
+    /// for datasets with large dimansionality. Note that the recall is slightly
+    /// degraded when low-precision type is selected.
+
+    cudaDataType_t lut_dtype = CUDA_R_32F;
+    ///
+    /// Storage data type for distance/similarity computed at search time.
+    ///
+    /// Possible values: [CUDA_R_16F, CUDA_R_32F]
+    ///
+    /// If the performance limiter at search time is device memory access,
+    /// selecting FP16 will improve performance slightly.
+
+    cudaDataType_t internal_distance_dtype = CUDA_R_32F;
+    ///
+    /// Preferred fraction of SM's unified memory / L1 cache to be used as
+    /// shared memory.
+    ///
+    /// Possible values: [0.0 - 1.0] as a fraction of the
+    /// `sharedMemPerMultiprocessor`.
+    ///
+    /// One wants to increase the carveout to make sure a good GPU occupancy for
+    /// the main search kernel, but not to keep it too high to leave some memory
+    /// to be used as L1 cache. Note, this value is interpreted only as a hint.
+    /// Moreover, a GPU usually allows only a fixed set of cache configurations,
+    /// so the provided value is rounded up to the nearest configuration. Refer
+    /// to the NVIDIA tuning guide for the target GPU architecture.
+    ///
+    /// Note, this is a low-level tuning parameter that can have drastic
+    /// negative effects on the search performance if tweaked incorrectly.
+
+    double preferred_shmem_carveout = 1.0;
+};
+
+struct GpuIndexCagraConfig : public GpuIndexConfig {
+    /// Degree of input graph for pruning.
+    size_t intermediate_graph_degree = 128;
+    /// Degree of output graph.
+    size_t graph_degree = 64;
+    /// ANN algorithm to build knn graph.
+    graph_build_algo build_algo = graph_build_algo::IVF_PQ;
+    /// Number of Iterations to run if building with NN_DESCENT
+    size_t nn_descent_niter = 20;
+
+    IVFPQBuildCagraConfig* ivf_pq_params = nullptr;
+    IVFPQSearchCagraConfig* ivf_pq_search_params = nullptr;
+};
+
+enum class search_algo {
+    /// For large batch sizes.
+    SINGLE_CTA,
+    /// For small batch sizes.
+    MULTI_CTA,
+    MULTI_KERNEL,
+    AUTO
+};
+
+enum class hash_mode { HASH, SMALL, AUTO };
+
+struct SearchParametersCagra : SearchParameters {
+    /// Maximum number of queries to search at the same time (batch size). Auto
+    /// select when 0.
+    size_t max_queries = 0;
+
+    /// Number of intermediate search results retained during the search.
+    ///
+    /// This is the main knob to adjust trade off between accuracy and search
+    /// speed. Higher values improve the search accuracy.
+
+    size_t itopk_size = 64;
+
+    /// Upper limit of search iterations. Auto select when 0.
+    size_t max_iterations = 0;
+
+    // In the following we list additional search parameters for fine tuning.
+    // Reasonable default values are automatically chosen.
+
+    /// Which search implementation to use.
+    search_algo algo = search_algo::AUTO;
+
+    /// Number of threads used to calculate a single distance. 4, 8, 16, or 32.
+
+    size_t team_size = 0;
+
+    /// Number of graph nodes to select as the starting point for the search in
+    /// each iteration. aka search width?
+    size_t search_width = 1;
+    /// Lower limit of search iterations.
+    size_t min_iterations = 0;
+
+    /// Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0.
+    size_t thread_block_size = 0;
+    /// Hashmap type. Auto selection when AUTO.
+    hash_mode hashmap_mode = hash_mode::AUTO;
+    /// Lower limit of hashmap bit length. More than 8.
+    size_t hashmap_min_bitlen = 0;
+    /// Upper limit of hashmap fill rate. More than 0.1, less than 0.9.
+    float hashmap_max_fill_rate = 0.5;
+
+    /// Number of iterations of initial random seed node selection. 1 or more.
+
+    uint32_t num_random_samplings = 1;
+    /// Bit mask used for initial random seed node selection.
+    uint64_t seed = 0x128394;
+};
+
+struct GpuIndexCagra : public GpuIndex {
+   public:
+    GpuIndexCagra(
+            GpuResourcesProvider* provider,
+            int dims,
+            faiss::MetricType metric = faiss::METRIC_L2,
+            GpuIndexCagraConfig config = GpuIndexCagraConfig());
+
+    /// Trains CAGRA based on the given vector data
+    void train(idx_t n, const float* x) override;
+
+    /// Initialize ourselves from the given CPU index; will overwrite
+    /// all data in ourselves
+    void copyFrom(const faiss::IndexHNSWCagra* index);
+
+    /// Copy ourselves to the given CPU index; will overwrite all data
+    /// in the index instance
+    void copyTo(faiss::IndexHNSWCagra* index) const;
+
+    void reset() override;
+
+    std::vector<idx_t> get_knngraph() const;
+
+   protected:
+    bool addImplRequiresIDs_() const override;
+
+    void addImpl_(idx_t n, const float* x, const idx_t* ids) override;
+
+    /// Called from GpuIndex for search
+    void searchImpl_(
+            idx_t n,
+            const float* x,
+            int k,
+            float* distances,
+            idx_t* labels,
+            const SearchParameters* search_params) const override;
+
+    /// Our configuration options
+    const GpuIndexCagraConfig cagraConfig_;
+
+    /// Instance that we own; contains the inverted lists
+    std::shared_ptr<RaftCagra> index_;
+};
+
+} // namespace gpu
+} // namespace faiss
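A usage sketch inferred from this header (not taken from the gem itself): build a CAGRA index on the GPU, search it, then copy it into the CPU `IndexHNSWCagra` that the cloner changes above rely on. It assumes a RAFT-enabled build; `StandardGpuResources`, `METRIC_L2`, and `Index::search` are standard faiss pieces, and all sizes and parameters are arbitrary.

```cpp
#include <faiss/IndexHNSW.h>
#include <faiss/gpu/GpuIndexCagra.h>
#include <faiss/gpu/StandardGpuResources.h>

#include <random>
#include <vector>

int main() {
    int d = 64;
    faiss::idx_t n = 10000, nq = 10, k = 5;

    // Random database and query vectors, for illustration only.
    std::vector<float> xb(n * d), xq(nq * d);
    std::mt19937 rng(123);
    std::uniform_real_distribution<float> dist(0.f, 1.f);
    for (auto& v : xb) v = dist(rng);
    for (auto& v : xq) v = dist(rng);

    faiss::gpu::StandardGpuResources res;
    faiss::gpu::GpuIndexCagraConfig config;
    config.graph_degree = 64; // output graph degree, as documented above

    faiss::gpu::GpuIndexCagra index(&res, d, faiss::METRIC_L2, config);
    index.train(n, xb.data()); // builds the CAGRA graph from the dataset

    std::vector<float> distances(nq * k);
    std::vector<faiss::idx_t> labels(nq * k);
    index.search(nq, xq.data(), k, distances.data(), labels.data());

    // Copy the GPU graph into a CPU IndexHNSWCagra, e.g. for serialization
    // or CPU-side search.
    faiss::IndexHNSWCagra cpu_index;
    index.copyTo(&cpu_index);
    return 0;
}
```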
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h

@@ -26,6 +26,12 @@ struct GpuIndexIVFConfig : public GpuIndexConfig {
 
     /// Configuration for the coarse quantizer object
     GpuIndexFlatConfig flatConfig;
+
+    /// This flag controls the CPU fallback logic for coarse quantizer
+    /// component of the index. When set to false (default), the cloner will
+    /// throw an exception for indices not implemented on GPU. When set to
+    /// true, it will fallback to a CPU implementation.
+    bool allowCpuCoarseQuantizer = false;
 };
 
 /// Base class of all GPU IVF index types. This (for now) deliberately does not
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h

@@ -87,6 +87,8 @@ class GpuIndexIVFFlat : public GpuIndexIVF {
     /// Trains the coarse quantizer based on the given vector data
     void train(idx_t n, const float* x) override;
 
+    void reconstruct_n(idx_t i0, idx_t n, float* out) const override;
+
    protected:
     /// Initialize appropriate index
     void setIndex_(
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp

@@ -257,6 +257,14 @@ void StandardGpuResourcesImpl::setDefaultStream(
         if (prevStream != stream) {
             streamWait({stream}, {prevStream});
         }
+#if defined USE_NVIDIA_RAFT
+        // delete the raft handle for this device, which will be initialized
+        // with the updated stream during any subsequent calls to getRaftHandle
+        auto it2 = raftHandles_.find(device);
+        if (it2 != raftHandles_.end()) {
+            raftHandles_.erase(it2);
+        }
+#endif
     }
 
     userDefaultStreams_[device] = stream;
@@ -275,6 +283,14 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
 
             streamWait({newStream}, {prevStream});
         }
+#if defined USE_NVIDIA_RAFT
+        // delete the raft handle for this device, which will be initialized
+        // with the updated stream during any subsequent calls to getRaftHandle
+        auto it2 = raftHandles_.find(device);
+        if (it2 != raftHandles_.end()) {
+            raftHandles_.erase(it2);
+        }
+#endif
     }
 
     userDefaultStreams_.erase(device);
@@ -347,11 +363,20 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
             prop.major,
             prop.minor);
 
+#if USE_AMD_ROCM
+    // Our code is pre-built with and expects warpSize == 32 or 64, validate
+    // that
+    FAISS_ASSERT_FMT(
+            prop.warpSize == 32 || prop.warpSize == 64,
+            "Device id %d does not have expected warpSize of 32 or 64",
+            device);
+#else
     // Our code is pre-built with and expects warpSize == 32, validate that
     FAISS_ASSERT_FMT(
            prop.warpSize == 32,
            "Device id %d does not have expected warpSize of 32",
            device);
+#endif
 
     // Create streams
     cudaStream_t defaultStream = nullptr;