faiss 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.h +1 -1
- data/vendor/faiss/faiss/Clustering.cpp +35 -4
- data/vendor/faiss/faiss/Clustering.h +10 -1
- data/vendor/faiss/faiss/IVFlib.cpp +4 -1
- data/vendor/faiss/faiss/Index.h +21 -6
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -1
- data/vendor/faiss/faiss/IndexFastScan.cpp +22 -4
- data/vendor/faiss/faiss/IndexFlat.cpp +11 -7
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +159 -5
- data/vendor/faiss/faiss/IndexFlatCodes.h +20 -3
- data/vendor/faiss/faiss/IndexHNSW.cpp +143 -90
- data/vendor/faiss/faiss/IndexHNSW.h +52 -3
- data/vendor/faiss/faiss/IndexIVF.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVF.h +9 -1
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +15 -0
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +3 -0
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +130 -57
- data/vendor/faiss/faiss/IndexIVFFastScan.h +14 -7
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +1 -3
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +21 -2
- data/vendor/faiss/faiss/IndexLattice.cpp +1 -19
- data/vendor/faiss/faiss/IndexLattice.h +3 -22
- data/vendor/faiss/faiss/IndexNNDescent.cpp +0 -29
- data/vendor/faiss/faiss/IndexNNDescent.h +1 -1
- data/vendor/faiss/faiss/IndexNSG.h +1 -1
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +56 -0
- data/vendor/faiss/faiss/IndexNeuralNetCodec.h +49 -0
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRefine.cpp +5 -5
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +3 -1
- data/vendor/faiss/faiss/MetricType.h +7 -2
- data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +95 -17
- data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +152 -0
- data/vendor/faiss/faiss/cppcontrib/factory_tools.h +24 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +83 -30
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +36 -4
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +6 -0
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuIndex.h +2 -8
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +282 -0
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +6 -0
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +2 -0
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +25 -0
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +26 -21
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +6 -0
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +8 -5
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +65 -0
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +4 -1
- data/vendor/faiss/faiss/gpu/utils/Timer.h +1 -1
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +25 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +9 -1
- data/vendor/faiss/faiss/impl/DistanceComputer.h +46 -0
- data/vendor/faiss/faiss/impl/FaissAssert.h +4 -2
- data/vendor/faiss/faiss/impl/HNSW.cpp +358 -190
- data/vendor/faiss/faiss/impl/HNSW.h +43 -22
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +8 -8
- data/vendor/faiss/faiss/impl/LookupTableScaler.h +34 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +13 -8
- data/vendor/faiss/faiss/impl/NSG.cpp +0 -29
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +1 -0
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +5 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +151 -32
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +719 -102
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +3 -0
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +5 -0
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx512.h +248 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +29 -15
- data/vendor/faiss/faiss/impl/index_read_utils.h +37 -0
- data/vendor/faiss/faiss/impl/index_write.cpp +28 -10
- data/vendor/faiss/faiss/impl/io.cpp +13 -5
- data/vendor/faiss/faiss/impl/io.h +4 -4
- data/vendor/faiss/faiss/impl/io_macros.h +6 -0
- data/vendor/faiss/faiss/impl/platform_macros.h +22 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +11 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +1 -1
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +448 -1
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +5 -5
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +143 -59
- data/vendor/faiss/faiss/index_factory.cpp +31 -13
- data/vendor/faiss/faiss/index_io.h +12 -5
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +28 -8
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +3 -0
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +9 -1
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +55 -17
- data/vendor/faiss/faiss/invlists/InvertedLists.h +18 -9
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +21 -6
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +2 -1
- data/vendor/faiss/faiss/python/python_callbacks.cpp +3 -3
- data/vendor/faiss/faiss/utils/Heap.h +105 -0
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +342 -0
- data/vendor/faiss/faiss/utils/NeuralNet.h +147 -0
- data/vendor/faiss/faiss/utils/bf16.h +36 -0
- data/vendor/faiss/faiss/utils/distances.cpp +58 -88
- data/vendor/faiss/faiss/utils/distances.h +5 -5
- data/vendor/faiss/faiss/utils/distances_simd.cpp +997 -9
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +70 -0
- data/vendor/faiss/faiss/utils/extra_distances.cpp +85 -137
- data/vendor/faiss/faiss/utils/extra_distances.h +3 -2
- data/vendor/faiss/faiss/utils/hamming.cpp +1 -1
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +4 -1
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +2 -1
- data/vendor/faiss/faiss/utils/random.cpp +43 -0
- data/vendor/faiss/faiss/utils/random.h +25 -0
- data/vendor/faiss/faiss/utils/simdlib.h +10 -1
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +296 -0
- data/vendor/faiss/faiss/utils/simdlib_neon.h +5 -2
- data/vendor/faiss/faiss/utils/simdlib_ppc64.h +1084 -0
- data/vendor/faiss/faiss/utils/transpose/transpose-avx512-inl.h +176 -0
- data/vendor/faiss/faiss/utils/utils.cpp +10 -3
- data/vendor/faiss/faiss/utils/utils.h +3 -0
- metadata +16 -4
- data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +0 -102
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
8
|
#include <faiss/gpu/impl/InterleavedCodes.h>
|
|
9
|
+
#include <faiss/gpu/utils/DeviceUtils.h>
|
|
9
10
|
#include <faiss/gpu/utils/StaticUtils.h>
|
|
10
11
|
#include <faiss/impl/FaissAssert.h>
|
|
11
12
|
|
|
@@ -166,15 +167,16 @@ void unpackInterleavedWord(
|
|
|
166
167
|
int numVecs,
|
|
167
168
|
int dims,
|
|
168
169
|
int bitsPerCode) {
|
|
169
|
-
int
|
|
170
|
+
int warpSize = getWarpSizeCurrentDevice();
|
|
171
|
+
int wordsPerDimBlock = (size_t)warpSize * bitsPerCode / (8 * sizeof(T));
|
|
170
172
|
int wordsPerBlock = wordsPerDimBlock * dims;
|
|
171
|
-
int numBlocks = utils::divUp(numVecs,
|
|
173
|
+
int numBlocks = utils::divUp(numVecs, warpSize);
|
|
172
174
|
|
|
173
175
|
#pragma omp parallel for
|
|
174
176
|
for (int i = 0; i < numVecs; ++i) {
|
|
175
|
-
int block = i /
|
|
177
|
+
int block = i / warpSize;
|
|
176
178
|
FAISS_ASSERT(block < numBlocks);
|
|
177
|
-
int lane = i %
|
|
179
|
+
int lane = i % warpSize;
|
|
178
180
|
|
|
179
181
|
for (int j = 0; j < dims; ++j) {
|
|
180
182
|
int srcOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
|
|
@@ -188,9 +190,10 @@ std::vector<uint8_t> unpackInterleaved(
|
|
|
188
190
|
int numVecs,
|
|
189
191
|
int dims,
|
|
190
192
|
int bitsPerCode) {
|
|
191
|
-
int
|
|
193
|
+
int warpSize = getWarpSizeCurrentDevice();
|
|
194
|
+
int bytesPerDimBlock = warpSize * bitsPerCode / 8;
|
|
192
195
|
int bytesPerBlock = bytesPerDimBlock * dims;
|
|
193
|
-
int numBlocks = utils::divUp(numVecs,
|
|
196
|
+
int numBlocks = utils::divUp(numVecs, warpSize);
|
|
194
197
|
size_t totalSize = (size_t)bytesPerBlock * numBlocks;
|
|
195
198
|
FAISS_ASSERT(data.size() == totalSize);
|
|
196
199
|
|
|
@@ -217,8 +220,8 @@ std::vector<uint8_t> unpackInterleaved(
|
|
|
217
220
|
} else if (bitsPerCode == 4) {
|
|
218
221
|
#pragma omp parallel for
|
|
219
222
|
for (int i = 0; i < numVecs; ++i) {
|
|
220
|
-
int block = i /
|
|
221
|
-
int lane = i %
|
|
223
|
+
int block = i / warpSize;
|
|
224
|
+
int lane = i % warpSize;
|
|
222
225
|
|
|
223
226
|
int word = lane / 2;
|
|
224
227
|
int subWord = lane % 2;
|
|
@@ -235,8 +238,8 @@ std::vector<uint8_t> unpackInterleaved(
|
|
|
235
238
|
} else if (bitsPerCode == 5) {
|
|
236
239
|
#pragma omp parallel for
|
|
237
240
|
for (int i = 0; i < numVecs; ++i) {
|
|
238
|
-
int block = i /
|
|
239
|
-
int blockVector = i %
|
|
241
|
+
int block = i / warpSize;
|
|
242
|
+
int blockVector = i % warpSize;
|
|
240
243
|
|
|
241
244
|
for (int j = 0; j < dims; ++j) {
|
|
242
245
|
uint8_t* dimBlock =
|
|
@@ -257,8 +260,8 @@ std::vector<uint8_t> unpackInterleaved(
|
|
|
257
260
|
} else if (bitsPerCode == 6) {
|
|
258
261
|
#pragma omp parallel for
|
|
259
262
|
for (int i = 0; i < numVecs; ++i) {
|
|
260
|
-
int block = i /
|
|
261
|
-
int blockVector = i %
|
|
263
|
+
int block = i / warpSize;
|
|
264
|
+
int blockVector = i % warpSize;
|
|
262
265
|
|
|
263
266
|
for (int j = 0; j < dims; ++j) {
|
|
264
267
|
uint8_t* dimBlock =
|
|
@@ -442,17 +445,18 @@ void packInterleavedWord(
|
|
|
442
445
|
int numVecs,
|
|
443
446
|
int dims,
|
|
444
447
|
int bitsPerCode) {
|
|
445
|
-
int
|
|
448
|
+
int warpSize = getWarpSizeCurrentDevice();
|
|
449
|
+
int wordsPerDimBlock = (size_t)warpSize * bitsPerCode / (8 * sizeof(T));
|
|
446
450
|
int wordsPerBlock = wordsPerDimBlock * dims;
|
|
447
|
-
int numBlocks = utils::divUp(numVecs,
|
|
451
|
+
int numBlocks = utils::divUp(numVecs, warpSize);
|
|
448
452
|
|
|
449
453
|
// We're guaranteed that all other slots not filled by the vectors present
|
|
450
454
|
// are initialized to zero (from the vector constructor in packInterleaved)
|
|
451
455
|
#pragma omp parallel for
|
|
452
456
|
for (int i = 0; i < numVecs; ++i) {
|
|
453
|
-
int block = i /
|
|
457
|
+
int block = i / warpSize;
|
|
454
458
|
FAISS_ASSERT(block < numBlocks);
|
|
455
|
-
int lane = i %
|
|
459
|
+
int lane = i % warpSize;
|
|
456
460
|
|
|
457
461
|
for (int j = 0; j < dims; ++j) {
|
|
458
462
|
int dstOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
|
|
@@ -466,9 +470,10 @@ std::vector<uint8_t> packInterleaved(
|
|
|
466
470
|
int numVecs,
|
|
467
471
|
int dims,
|
|
468
472
|
int bitsPerCode) {
|
|
469
|
-
int
|
|
473
|
+
int warpSize = getWarpSizeCurrentDevice();
|
|
474
|
+
int bytesPerDimBlock = warpSize * bitsPerCode / 8;
|
|
470
475
|
int bytesPerBlock = bytesPerDimBlock * dims;
|
|
471
|
-
int numBlocks = utils::divUp(numVecs,
|
|
476
|
+
int numBlocks = utils::divUp(numVecs, warpSize);
|
|
472
477
|
size_t totalSize = (size_t)bytesPerBlock * numBlocks;
|
|
473
478
|
|
|
474
479
|
// bit codes padded to whole bytes
|
|
@@ -499,7 +504,7 @@ std::vector<uint8_t> packInterleaved(
|
|
|
499
504
|
for (int i = 0; i < numBlocks; ++i) {
|
|
500
505
|
for (int j = 0; j < dims; ++j) {
|
|
501
506
|
for (int k = 0; k < bytesPerDimBlock; ++k) {
|
|
502
|
-
int loVec = i *
|
|
507
|
+
int loVec = i * warpSize + k * 2;
|
|
503
508
|
int hiVec = loVec + 1;
|
|
504
509
|
|
|
505
510
|
uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
|
|
@@ -516,7 +521,7 @@ std::vector<uint8_t> packInterleaved(
|
|
|
516
521
|
for (int j = 0; j < dims; ++j) {
|
|
517
522
|
for (int k = 0; k < bytesPerDimBlock; ++k) {
|
|
518
523
|
// What input vectors we are pulling from
|
|
519
|
-
int loVec = i *
|
|
524
|
+
int loVec = i * warpSize + (k * 8) / 5;
|
|
520
525
|
int hiVec = loVec + 1;
|
|
521
526
|
int hiVec2 = hiVec + 1;
|
|
522
527
|
|
|
@@ -536,7 +541,7 @@ std::vector<uint8_t> packInterleaved(
|
|
|
536
541
|
for (int j = 0; j < dims; ++j) {
|
|
537
542
|
for (int k = 0; k < bytesPerDimBlock; ++k) {
|
|
538
543
|
// What input vectors we are pulling from
|
|
539
|
-
int loVec = i *
|
|
544
|
+
int loVec = i * warpSize + (k * 8) / 6;
|
|
540
545
|
int hiVec = loVec + 1;
|
|
541
546
|
|
|
542
547
|
uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
|
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
#include <vector>
|
|
18
18
|
|
|
19
19
|
#include <cuda_profiler_api.h>
|
|
20
|
+
#include <faiss/impl/AuxIndexStructures.h>
|
|
20
21
|
|
|
21
22
|
DEFINE_int32(num, 10000, "# of vecs");
|
|
22
23
|
DEFINE_int32(k, 100, "# of clusters");
|
|
@@ -34,6 +35,7 @@ DEFINE_int64(
|
|
|
34
35
|
"minimum size to use CPU -> GPU paged copies");
|
|
35
36
|
DEFINE_int64(pinned_mem, -1, "pinned memory allocation to use");
|
|
36
37
|
DEFINE_int32(max_points, -1, "max points per centroid");
|
|
38
|
+
DEFINE_double(timeout, 0, "timeout in seconds");
|
|
37
39
|
|
|
38
40
|
using namespace faiss::gpu;
|
|
39
41
|
|
|
@@ -99,10 +101,14 @@ int main(int argc, char** argv) {
|
|
|
99
101
|
cp.max_points_per_centroid = FLAGS_max_points;
|
|
100
102
|
}
|
|
101
103
|
|
|
104
|
+
auto tc = new faiss::TimeoutCallback();
|
|
105
|
+
faiss::InterruptCallback::instance.reset(tc);
|
|
106
|
+
|
|
102
107
|
faiss::Clustering kmeans(FLAGS_dim, FLAGS_k, cp);
|
|
103
108
|
|
|
104
109
|
// Time k-means
|
|
105
110
|
{
|
|
111
|
+
tc->set_timeout(FLAGS_timeout);
|
|
106
112
|
CpuTimer timer;
|
|
107
113
|
|
|
108
114
|
kmeans.train(FLAGS_num, vecs.data(), *(gpuIndex.getIndex()));
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
#include <faiss/gpu/impl/InterleavedCodes.h>
|
|
9
9
|
#include <faiss/gpu/test/TestUtils.h>
|
|
10
|
+
#include <faiss/gpu/utils/DeviceUtils.h>
|
|
10
11
|
#include <faiss/gpu/utils/StaticUtils.h>
|
|
11
12
|
#include <gtest/gtest.h>
|
|
12
13
|
#include <cmath>
|
|
@@ -119,8 +120,9 @@ TEST(TestCodePacking, InterleavedCodes_UnpackPack) {
|
|
|
119
120
|
std::cout << bitsPerCode << " " << dims << " " << numVecs
|
|
120
121
|
<< "\n";
|
|
121
122
|
|
|
122
|
-
int
|
|
123
|
-
int
|
|
123
|
+
int warpSize = getWarpSizeCurrentDevice();
|
|
124
|
+
int blocks = utils::divUp(numVecs, warpSize);
|
|
125
|
+
int bytesPerDimBlock = warpSize * bitsPerCode / 8;
|
|
124
126
|
int bytesPerBlock = bytesPerDimBlock * dims;
|
|
125
127
|
int size = blocks * bytesPerBlock;
|
|
126
128
|
|
|
@@ -132,9 +134,9 @@ TEST(TestCodePacking, InterleavedCodes_UnpackPack) {
|
|
|
132
134
|
|
|
133
135
|
for (int i = 0; i < blocks; ++i) {
|
|
134
136
|
for (int j = 0; j < dims; ++j) {
|
|
135
|
-
for (int k = 0; k <
|
|
137
|
+
for (int k = 0; k < warpSize; ++k) {
|
|
136
138
|
for (int l = 0; l < bytesPerCode; ++l) {
|
|
137
|
-
int vec = i *
|
|
139
|
+
int vec = i * warpSize + k;
|
|
138
140
|
if (vec < numVecs) {
|
|
139
141
|
data[i * bytesPerBlock +
|
|
140
142
|
j * bytesPerDimBlock +
|
|
@@ -148,7 +150,8 @@ TEST(TestCodePacking, InterleavedCodes_UnpackPack) {
|
|
|
148
150
|
for (int i = 0; i < blocks; ++i) {
|
|
149
151
|
for (int j = 0; j < dims; ++j) {
|
|
150
152
|
for (int k = 0; k < bytesPerDimBlock; ++k) {
|
|
151
|
-
int loVec =
|
|
153
|
+
int loVec =
|
|
154
|
+
i * warpSize + (k * 8) / bitsPerCode;
|
|
152
155
|
int hiVec = loVec + 1;
|
|
153
156
|
int hiVec2 = hiVec + 1;
|
|
154
157
|
|
|
@@ -842,6 +842,71 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) {
|
|
|
842
842
|
#endif
|
|
843
843
|
}
|
|
844
844
|
|
|
845
|
+
TEST(TestGpuIndexIVFFlat, Reconstruct_n) {
|
|
846
|
+
Options opt;
|
|
847
|
+
|
|
848
|
+
std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
|
|
849
|
+
std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
|
|
850
|
+
|
|
851
|
+
faiss::IndexFlatL2 cpuQuantizer(opt.dim);
|
|
852
|
+
faiss::IndexIVFFlat cpuIndex(
|
|
853
|
+
&cpuQuantizer, opt.dim, opt.numCentroids, faiss::METRIC_L2);
|
|
854
|
+
cpuIndex.nprobe = opt.nprobe;
|
|
855
|
+
cpuIndex.train(opt.numTrain, trainVecs.data());
|
|
856
|
+
cpuIndex.add(opt.numAdd, addVecs.data());
|
|
857
|
+
|
|
858
|
+
faiss::gpu::StandardGpuResources res;
|
|
859
|
+
res.noTempMemory();
|
|
860
|
+
|
|
861
|
+
faiss::gpu::GpuIndexIVFFlatConfig config;
|
|
862
|
+
config.device = opt.device;
|
|
863
|
+
config.indicesOptions = faiss::gpu::INDICES_64_BIT;
|
|
864
|
+
config.use_raft = false;
|
|
865
|
+
|
|
866
|
+
faiss::gpu::GpuIndexIVFFlat gpuIndex(
|
|
867
|
+
&res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
|
|
868
|
+
gpuIndex.nprobe = opt.nprobe;
|
|
869
|
+
|
|
870
|
+
gpuIndex.train(opt.numTrain, trainVecs.data());
|
|
871
|
+
gpuIndex.add(opt.numAdd, addVecs.data());
|
|
872
|
+
|
|
873
|
+
std::vector<float> gpuVals(opt.numAdd * opt.dim);
|
|
874
|
+
|
|
875
|
+
gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data());
|
|
876
|
+
|
|
877
|
+
std::vector<float> cpuVals(opt.numAdd * opt.dim);
|
|
878
|
+
|
|
879
|
+
cpuIndex.reconstruct_n(0, cpuIndex.ntotal, cpuVals.data());
|
|
880
|
+
|
|
881
|
+
EXPECT_EQ(gpuVals, cpuVals);
|
|
882
|
+
|
|
883
|
+
config.indicesOptions = faiss::gpu::INDICES_32_BIT;
|
|
884
|
+
|
|
885
|
+
faiss::gpu::GpuIndexIVFFlat gpuIndex1(
|
|
886
|
+
&res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
|
|
887
|
+
gpuIndex1.nprobe = opt.nprobe;
|
|
888
|
+
|
|
889
|
+
gpuIndex1.train(opt.numTrain, trainVecs.data());
|
|
890
|
+
gpuIndex1.add(opt.numAdd, addVecs.data());
|
|
891
|
+
|
|
892
|
+
gpuIndex1.reconstruct_n(0, gpuIndex1.ntotal, gpuVals.data());
|
|
893
|
+
|
|
894
|
+
EXPECT_EQ(gpuVals, cpuVals);
|
|
895
|
+
|
|
896
|
+
config.indicesOptions = faiss::gpu::INDICES_CPU;
|
|
897
|
+
|
|
898
|
+
faiss::gpu::GpuIndexIVFFlat gpuIndex2(
|
|
899
|
+
&res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
|
|
900
|
+
gpuIndex2.nprobe = opt.nprobe;
|
|
901
|
+
|
|
902
|
+
gpuIndex2.train(opt.numTrain, trainVecs.data());
|
|
903
|
+
gpuIndex2.add(opt.numAdd, addVecs.data());
|
|
904
|
+
|
|
905
|
+
gpuIndex2.reconstruct_n(0, gpuIndex2.ntotal, gpuVals.data());
|
|
906
|
+
|
|
907
|
+
EXPECT_EQ(gpuVals, cpuVals);
|
|
908
|
+
}
|
|
909
|
+
|
|
845
910
|
int main(int argc, char** argv) {
|
|
846
911
|
testing::InitGoogleTest(&argc, argv);
|
|
847
912
|
|
|
@@ -76,6 +76,12 @@ bool getTensorCoreSupport(int device);
|
|
|
76
76
|
/// Equivalent to getTensorCoreSupport(getCurrentDevice())
|
|
77
77
|
bool getTensorCoreSupportCurrentDevice();
|
|
78
78
|
|
|
79
|
+
/// Returns the warp size of the given GPU device
|
|
80
|
+
int getWarpSize(int device);
|
|
81
|
+
|
|
82
|
+
/// Equivalent to getWarpSize(getCurrentDevice())
|
|
83
|
+
int getWarpSizeCurrentDevice();
|
|
84
|
+
|
|
79
85
|
/// Returns the amount of currently available memory on the given device
|
|
80
86
|
size_t getFreeMemory(int device);
|
|
81
87
|
|
|
@@ -14,7 +14,10 @@ namespace faiss {
|
|
|
14
14
|
namespace gpu {
|
|
15
15
|
|
|
16
16
|
KernelTimer::KernelTimer(cudaStream_t stream)
|
|
17
|
-
: startEvent_(
|
|
17
|
+
: startEvent_(nullptr),
|
|
18
|
+
stopEvent_(nullptr),
|
|
19
|
+
stream_(stream),
|
|
20
|
+
valid_(true) {
|
|
18
21
|
CUDA_VERIFY(cudaEventCreate(&startEvent_));
|
|
19
22
|
CUDA_VERIFY(cudaEventCreate(&stopEvent_));
|
|
20
23
|
|
|
@@ -18,7 +18,7 @@ class KernelTimer {
|
|
|
18
18
|
public:
|
|
19
19
|
/// Constructor starts the timer and adds an event into the current
|
|
20
20
|
/// device stream
|
|
21
|
-
KernelTimer(cudaStream_t stream =
|
|
21
|
+
KernelTimer(cudaStream_t stream = nullptr);
|
|
22
22
|
|
|
23
23
|
/// Destructor releases event resources
|
|
24
24
|
~KernelTimer();
|
|
@@ -236,4 +236,29 @@ size_t InterruptCallback::get_period_hint(size_t flops) {
|
|
|
236
236
|
return std::max((size_t)10 * 10 * 1000 * 1000 / (flops + 1), (size_t)1);
|
|
237
237
|
}
|
|
238
238
|
|
|
239
|
+
void TimeoutCallback::set_timeout(double timeout_in_seconds) {
|
|
240
|
+
timeout = timeout_in_seconds;
|
|
241
|
+
start = std::chrono::steady_clock::now();
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
bool TimeoutCallback::want_interrupt() {
|
|
245
|
+
if (timeout == 0) {
|
|
246
|
+
return false;
|
|
247
|
+
}
|
|
248
|
+
auto end = std::chrono::steady_clock::now();
|
|
249
|
+
std::chrono::duration<float, std::milli> duration = end - start;
|
|
250
|
+
float elapsed_in_seconds = duration.count() / 1000.0;
|
|
251
|
+
if (elapsed_in_seconds > timeout) {
|
|
252
|
+
timeout = 0;
|
|
253
|
+
return true;
|
|
254
|
+
}
|
|
255
|
+
return false;
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
void TimeoutCallback::reset(double timeout_in_seconds) {
|
|
259
|
+
auto tc(new faiss::TimeoutCallback());
|
|
260
|
+
faiss::InterruptCallback::instance.reset(tc);
|
|
261
|
+
tc->set_timeout(timeout_in_seconds);
|
|
262
|
+
}
|
|
263
|
+
|
|
239
264
|
} // namespace faiss
|
|
@@ -122,7 +122,7 @@ struct RangeSearchPartialResult : BufferList {
|
|
|
122
122
|
void copy_result(bool incremental = false);
|
|
123
123
|
|
|
124
124
|
/// merge a set of PartialResult's into one RangeSearchResult
|
|
125
|
-
/// on
|
|
125
|
+
/// on output the partialresults are empty!
|
|
126
126
|
static void merge(
|
|
127
127
|
std::vector<RangeSearchPartialResult*>& partial_results,
|
|
128
128
|
bool do_delete = true);
|
|
@@ -161,6 +161,14 @@ struct FAISS_API InterruptCallback {
|
|
|
161
161
|
static size_t get_period_hint(size_t flops);
|
|
162
162
|
};
|
|
163
163
|
|
|
164
|
+
struct TimeoutCallback : InterruptCallback {
|
|
165
|
+
std::chrono::time_point<std::chrono::steady_clock> start;
|
|
166
|
+
double timeout;
|
|
167
|
+
bool want_interrupt() override;
|
|
168
|
+
void set_timeout(double timeout_in_seconds);
|
|
169
|
+
static void reset(double timeout_in_seconds);
|
|
170
|
+
};
|
|
171
|
+
|
|
164
172
|
/// set implementation optimized for fast access.
|
|
165
173
|
struct VisitedTable {
|
|
166
174
|
std::vector<uint8_t> visited;
|
|
@@ -59,6 +59,52 @@ struct DistanceComputer {
|
|
|
59
59
|
virtual ~DistanceComputer() {}
|
|
60
60
|
};
|
|
61
61
|
|
|
62
|
+
/* Wrap the distance computer into one that negates the
|
|
63
|
+
distances. This makes supporting INNER_PRODUCE search easier */
|
|
64
|
+
|
|
65
|
+
struct NegativeDistanceComputer : DistanceComputer {
|
|
66
|
+
/// owned by this
|
|
67
|
+
DistanceComputer* basedis;
|
|
68
|
+
|
|
69
|
+
explicit NegativeDistanceComputer(DistanceComputer* basedis)
|
|
70
|
+
: basedis(basedis) {}
|
|
71
|
+
|
|
72
|
+
void set_query(const float* x) override {
|
|
73
|
+
basedis->set_query(x);
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
/// compute distance of vector i to current query
|
|
77
|
+
float operator()(idx_t i) override {
|
|
78
|
+
return -(*basedis)(i);
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
void distances_batch_4(
|
|
82
|
+
const idx_t idx0,
|
|
83
|
+
const idx_t idx1,
|
|
84
|
+
const idx_t idx2,
|
|
85
|
+
const idx_t idx3,
|
|
86
|
+
float& dis0,
|
|
87
|
+
float& dis1,
|
|
88
|
+
float& dis2,
|
|
89
|
+
float& dis3) override {
|
|
90
|
+
basedis->distances_batch_4(
|
|
91
|
+
idx0, idx1, idx2, idx3, dis0, dis1, dis2, dis3);
|
|
92
|
+
dis0 = -dis0;
|
|
93
|
+
dis1 = -dis1;
|
|
94
|
+
dis2 = -dis2;
|
|
95
|
+
dis3 = -dis3;
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
/// compute distance between two stored vectors
|
|
99
|
+
float symmetric_dis(idx_t i, idx_t j) override {
|
|
100
|
+
return -basedis->symmetric_dis(i, j);
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
virtual ~NegativeDistanceComputer() {
|
|
104
|
+
delete basedis;
|
|
105
|
+
}
|
|
106
|
+
};
|
|
107
|
+
|
|
62
108
|
/*************************************************************
|
|
63
109
|
* Specialized version of the DistanceComputer when we know that codes are
|
|
64
110
|
* laid out in a flat index.
|
|
@@ -94,13 +94,15 @@
|
|
|
94
94
|
} \
|
|
95
95
|
} while (false)
|
|
96
96
|
|
|
97
|
-
#define
|
|
97
|
+
#define FAISS_THROW_IF_MSG(X, MSG) \
|
|
98
98
|
do { \
|
|
99
|
-
if (
|
|
99
|
+
if (X) { \
|
|
100
100
|
FAISS_THROW_FMT("Error: '%s' failed: " MSG, #X); \
|
|
101
101
|
} \
|
|
102
102
|
} while (false)
|
|
103
103
|
|
|
104
|
+
#define FAISS_THROW_IF_NOT_MSG(X, MSG) FAISS_THROW_IF_MSG(!(X), MSG)
|
|
105
|
+
|
|
104
106
|
#define FAISS_THROW_IF_NOT_FMT(X, FMT, ...) \
|
|
105
107
|
do { \
|
|
106
108
|
if (!(X)) { \
|