faiss 0.3.1 → 0.3.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.h +1 -1
- data/vendor/faiss/faiss/Clustering.cpp +35 -4
- data/vendor/faiss/faiss/Clustering.h +10 -1
- data/vendor/faiss/faiss/IVFlib.cpp +4 -1
- data/vendor/faiss/faiss/Index.h +21 -6
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -1
- data/vendor/faiss/faiss/IndexFastScan.cpp +22 -4
- data/vendor/faiss/faiss/IndexFlat.cpp +11 -7
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +159 -5
- data/vendor/faiss/faiss/IndexFlatCodes.h +20 -3
- data/vendor/faiss/faiss/IndexHNSW.cpp +143 -90
- data/vendor/faiss/faiss/IndexHNSW.h +52 -3
- data/vendor/faiss/faiss/IndexIVF.cpp +3 -3
- data/vendor/faiss/faiss/IndexIVF.h +9 -1
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +15 -0
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +3 -0
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +130 -57
- data/vendor/faiss/faiss/IndexIVFFastScan.h +14 -7
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +1 -3
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +21 -2
- data/vendor/faiss/faiss/IndexLattice.cpp +1 -19
- data/vendor/faiss/faiss/IndexLattice.h +3 -22
- data/vendor/faiss/faiss/IndexNNDescent.cpp +0 -29
- data/vendor/faiss/faiss/IndexNNDescent.h +1 -1
- data/vendor/faiss/faiss/IndexNSG.h +1 -1
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +56 -0
- data/vendor/faiss/faiss/IndexNeuralNetCodec.h +49 -0
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRefine.cpp +5 -5
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +3 -1
- data/vendor/faiss/faiss/MetricType.h +7 -2
- data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +95 -17
- data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +152 -0
- data/vendor/faiss/faiss/cppcontrib/factory_tools.h +24 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +83 -30
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +36 -4
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +6 -0
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -1
- data/vendor/faiss/faiss/gpu/GpuIndex.h +2 -8
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +282 -0
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +6 -0
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +2 -0
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +25 -0
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +26 -21
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +6 -0
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +8 -5
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +65 -0
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +4 -1
- data/vendor/faiss/faiss/gpu/utils/Timer.h +1 -1
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +25 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +9 -1
- data/vendor/faiss/faiss/impl/DistanceComputer.h +46 -0
- data/vendor/faiss/faiss/impl/FaissAssert.h +4 -2
- data/vendor/faiss/faiss/impl/HNSW.cpp +358 -190
- data/vendor/faiss/faiss/impl/HNSW.h +43 -22
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +8 -8
- data/vendor/faiss/faiss/impl/LookupTableScaler.h +34 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +13 -8
- data/vendor/faiss/faiss/impl/NSG.cpp +0 -29
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +1 -0
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +5 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +151 -32
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +719 -102
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +3 -0
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +5 -0
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx512.h +248 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +29 -15
- data/vendor/faiss/faiss/impl/index_read_utils.h +37 -0
- data/vendor/faiss/faiss/impl/index_write.cpp +28 -10
- data/vendor/faiss/faiss/impl/io.cpp +13 -5
- data/vendor/faiss/faiss/impl/io.h +4 -4
- data/vendor/faiss/faiss/impl/io_macros.h +6 -0
- data/vendor/faiss/faiss/impl/platform_macros.h +22 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +11 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +1 -1
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +448 -1
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +5 -5
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +143 -59
- data/vendor/faiss/faiss/index_factory.cpp +31 -13
- data/vendor/faiss/faiss/index_io.h +12 -5
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +28 -8
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +3 -0
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +9 -1
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +55 -17
- data/vendor/faiss/faiss/invlists/InvertedLists.h +18 -9
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +21 -6
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +2 -1
- data/vendor/faiss/faiss/python/python_callbacks.cpp +3 -3
- data/vendor/faiss/faiss/utils/Heap.h +105 -0
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +342 -0
- data/vendor/faiss/faiss/utils/NeuralNet.h +147 -0
- data/vendor/faiss/faiss/utils/bf16.h +36 -0
- data/vendor/faiss/faiss/utils/distances.cpp +58 -88
- data/vendor/faiss/faiss/utils/distances.h +5 -5
- data/vendor/faiss/faiss/utils/distances_simd.cpp +997 -9
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +70 -0
- data/vendor/faiss/faiss/utils/extra_distances.cpp +85 -137
- data/vendor/faiss/faiss/utils/extra_distances.h +3 -2
- data/vendor/faiss/faiss/utils/hamming.cpp +1 -1
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +4 -1
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +2 -1
- data/vendor/faiss/faiss/utils/random.cpp +43 -0
- data/vendor/faiss/faiss/utils/random.h +25 -0
- data/vendor/faiss/faiss/utils/simdlib.h +10 -1
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +296 -0
- data/vendor/faiss/faiss/utils/simdlib_neon.h +5 -2
- data/vendor/faiss/faiss/utils/simdlib_ppc64.h +1084 -0
- data/vendor/faiss/faiss/utils/transpose/transpose-avx512-inl.h +176 -0
- data/vendor/faiss/faiss/utils/utils.cpp +10 -3
- data/vendor/faiss/faiss/utils/utils.h +3 -0
- metadata +16 -4
- data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +0 -102
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp

@@ -6,6 +6,7 @@
  */

 #include <faiss/gpu/impl/InterleavedCodes.h>
+#include <faiss/gpu/utils/DeviceUtils.h>
 #include <faiss/gpu/utils/StaticUtils.h>
 #include <faiss/impl/FaissAssert.h>

@@ -166,15 +167,16 @@ void unpackInterleavedWord(
         int numVecs,
         int dims,
         int bitsPerCode) {
-    int
+    int warpSize = getWarpSizeCurrentDevice();
+    int wordsPerDimBlock = (size_t)warpSize * bitsPerCode / (8 * sizeof(T));
     int wordsPerBlock = wordsPerDimBlock * dims;
-    int numBlocks = utils::divUp(numVecs,
+    int numBlocks = utils::divUp(numVecs, warpSize);

 #pragma omp parallel for
     for (int i = 0; i < numVecs; ++i) {
-        int block = i /
+        int block = i / warpSize;
         FAISS_ASSERT(block < numBlocks);
-        int lane = i %
+        int lane = i % warpSize;

         for (int j = 0; j < dims; ++j) {
             int srcOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
@@ -188,9 +190,10 @@ std::vector<uint8_t> unpackInterleaved(
         int numVecs,
         int dims,
         int bitsPerCode) {
-    int
+    int warpSize = getWarpSizeCurrentDevice();
+    int bytesPerDimBlock = warpSize * bitsPerCode / 8;
     int bytesPerBlock = bytesPerDimBlock * dims;
-    int numBlocks = utils::divUp(numVecs,
+    int numBlocks = utils::divUp(numVecs, warpSize);
     size_t totalSize = (size_t)bytesPerBlock * numBlocks;
     FAISS_ASSERT(data.size() == totalSize);

@@ -217,8 +220,8 @@ std::vector<uint8_t> unpackInterleaved(
     } else if (bitsPerCode == 4) {
 #pragma omp parallel for
         for (int i = 0; i < numVecs; ++i) {
-            int block = i /
-            int lane = i %
+            int block = i / warpSize;
+            int lane = i % warpSize;

             int word = lane / 2;
             int subWord = lane % 2;
@@ -235,8 +238,8 @@ std::vector<uint8_t> unpackInterleaved(
     } else if (bitsPerCode == 5) {
 #pragma omp parallel for
         for (int i = 0; i < numVecs; ++i) {
-            int block = i /
-            int blockVector = i %
+            int block = i / warpSize;
+            int blockVector = i % warpSize;

             for (int j = 0; j < dims; ++j) {
                 uint8_t* dimBlock =
@@ -257,8 +260,8 @@ std::vector<uint8_t> unpackInterleaved(
     } else if (bitsPerCode == 6) {
 #pragma omp parallel for
         for (int i = 0; i < numVecs; ++i) {
-            int block = i /
-            int blockVector = i %
+            int block = i / warpSize;
+            int blockVector = i % warpSize;

             for (int j = 0; j < dims; ++j) {
                 uint8_t* dimBlock =
@@ -442,17 +445,18 @@ void packInterleavedWord(
         int numVecs,
         int dims,
         int bitsPerCode) {
-    int
+    int warpSize = getWarpSizeCurrentDevice();
+    int wordsPerDimBlock = (size_t)warpSize * bitsPerCode / (8 * sizeof(T));
     int wordsPerBlock = wordsPerDimBlock * dims;
-    int numBlocks = utils::divUp(numVecs,
+    int numBlocks = utils::divUp(numVecs, warpSize);

     // We're guaranteed that all other slots not filled by the vectors present
     // are initialized to zero (from the vector constructor in packInterleaved)
 #pragma omp parallel for
     for (int i = 0; i < numVecs; ++i) {
-        int block = i /
+        int block = i / warpSize;
         FAISS_ASSERT(block < numBlocks);
-        int lane = i %
+        int lane = i % warpSize;

         for (int j = 0; j < dims; ++j) {
             int dstOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
@@ -466,9 +470,10 @@ std::vector<uint8_t> packInterleaved(
         int numVecs,
         int dims,
         int bitsPerCode) {
-    int
+    int warpSize = getWarpSizeCurrentDevice();
+    int bytesPerDimBlock = warpSize * bitsPerCode / 8;
     int bytesPerBlock = bytesPerDimBlock * dims;
-    int numBlocks = utils::divUp(numVecs,
+    int numBlocks = utils::divUp(numVecs, warpSize);
     size_t totalSize = (size_t)bytesPerBlock * numBlocks;

     // bit codes padded to whole bytes
@@ -499,7 +504,7 @@ std::vector<uint8_t> packInterleaved(
         for (int i = 0; i < numBlocks; ++i) {
             for (int j = 0; j < dims; ++j) {
                 for (int k = 0; k < bytesPerDimBlock; ++k) {
-                    int loVec = i *
+                    int loVec = i * warpSize + k * 2;
                     int hiVec = loVec + 1;

                     uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
@@ -516,7 +521,7 @@ std::vector<uint8_t> packInterleaved(
             for (int j = 0; j < dims; ++j) {
                 for (int k = 0; k < bytesPerDimBlock; ++k) {
                     // What input vectors we are pulling from
-                    int loVec = i *
+                    int loVec = i * warpSize + (k * 8) / 5;
                     int hiVec = loVec + 1;
                     int hiVec2 = hiVec + 1;

@@ -536,7 +541,7 @@ std::vector<uint8_t> packInterleaved(
             for (int j = 0; j < dims; ++j) {
                 for (int k = 0; k < bytesPerDimBlock; ++k) {
                     // What input vectors we are pulling from
-                    int loVec = i *
+                    int loVec = i * warpSize + (k * 8) / 6;
                     int hiVec = loVec + 1;

                     uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
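These hunks replace a compile-time warp-size constant with the runtime value returned by getWarpSizeCurrentDevice(), so the interleaved code layout also works on devices whose warp is not 32 lanes wide. Below is a minimal standalone sketch of the addressing arithmetic for whole-byte (8-bit) codes; the helper name interleavedByteOffset is made up for illustration and is not part of the library.

#include <cstddef>

// Sketch: byte offset of code (vector i, dimension j) in the interleaved
// layout handled by packInterleaved/unpackInterleaved when bitsPerCode == 8.
// Vectors are grouped into blocks of warpSize; inside a block the codes are
// stored dimension-major, warpSize codes per dimension.
size_t interleavedByteOffset(int i, int j, int dims, int warpSize) {
    int bytesPerDimBlock = warpSize;                 // 1 byte per 8-bit code
    int bytesPerBlock = bytesPerDimBlock * dims;
    int block = i / warpSize;                        // which group of vectors
    int lane = i % warpSize;                         // slot within the group
    return (size_t)block * bytesPerBlock + (size_t)j * bytesPerDimBlock + lane;
}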
data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp

@@ -17,6 +17,7 @@
 #include <vector>

 #include <cuda_profiler_api.h>
+#include <faiss/impl/AuxIndexStructures.h>

 DEFINE_int32(num, 10000, "# of vecs");
 DEFINE_int32(k, 100, "# of clusters");
@@ -34,6 +35,7 @@ DEFINE_int64(
         "minimum size to use CPU -> GPU paged copies");
 DEFINE_int64(pinned_mem, -1, "pinned memory allocation to use");
 DEFINE_int32(max_points, -1, "max points per centroid");
+DEFINE_double(timeout, 0, "timeout in seconds");

 using namespace faiss::gpu;

@@ -99,10 +101,14 @@ int main(int argc, char** argv) {
         cp.max_points_per_centroid = FLAGS_max_points;
     }

+    auto tc = new faiss::TimeoutCallback();
+    faiss::InterruptCallback::instance.reset(tc);
+
     faiss::Clustering kmeans(FLAGS_dim, FLAGS_k, cp);

     // Time k-means
     {
+        tc->set_timeout(FLAGS_timeout);
         CpuTimer timer;

         kmeans.train(FLAGS_num, vecs.data(), *(gpuIndex.getIndex()));
data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp

@@ -7,6 +7,7 @@

 #include <faiss/gpu/impl/InterleavedCodes.h>
 #include <faiss/gpu/test/TestUtils.h>
+#include <faiss/gpu/utils/DeviceUtils.h>
 #include <faiss/gpu/utils/StaticUtils.h>
 #include <gtest/gtest.h>
 #include <cmath>
@@ -119,8 +120,9 @@ TEST(TestCodePacking, InterleavedCodes_UnpackPack) {
                 std::cout << bitsPerCode << " " << dims << " " << numVecs
                           << "\n";

-                int
-                int
+                int warpSize = getWarpSizeCurrentDevice();
+                int blocks = utils::divUp(numVecs, warpSize);
+                int bytesPerDimBlock = warpSize * bitsPerCode / 8;
                 int bytesPerBlock = bytesPerDimBlock * dims;
                 int size = blocks * bytesPerBlock;

@@ -132,9 +134,9 @@ TEST(TestCodePacking, InterleavedCodes_UnpackPack) {

                 for (int i = 0; i < blocks; ++i) {
                     for (int j = 0; j < dims; ++j) {
-                        for (int k = 0; k <
+                        for (int k = 0; k < warpSize; ++k) {
                             for (int l = 0; l < bytesPerCode; ++l) {
-                                int vec = i *
+                                int vec = i * warpSize + k;
                                 if (vec < numVecs) {
                                     data[i * bytesPerBlock +
                                          j * bytesPerDimBlock +
@@ -148,7 +150,8 @@ TEST(TestCodePacking, InterleavedCodes_UnpackPack) {
                 for (int i = 0; i < blocks; ++i) {
                     for (int j = 0; j < dims; ++j) {
                         for (int k = 0; k < bytesPerDimBlock; ++k) {
-                            int loVec =
+                            int loVec =
+                                    i * warpSize + (k * 8) / bitsPerCode;
                             int hiVec = loVec + 1;
                             int hiVec2 = hiVec + 1;

data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp

@@ -842,6 +842,71 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) {
 #endif
 }

+TEST(TestGpuIndexIVFFlat, Reconstruct_n) {
+    Options opt;
+
+    std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
+    std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
+
+    faiss::IndexFlatL2 cpuQuantizer(opt.dim);
+    faiss::IndexIVFFlat cpuIndex(
+            &cpuQuantizer, opt.dim, opt.numCentroids, faiss::METRIC_L2);
+    cpuIndex.nprobe = opt.nprobe;
+    cpuIndex.train(opt.numTrain, trainVecs.data());
+    cpuIndex.add(opt.numAdd, addVecs.data());
+
+    faiss::gpu::StandardGpuResources res;
+    res.noTempMemory();
+
+    faiss::gpu::GpuIndexIVFFlatConfig config;
+    config.device = opt.device;
+    config.indicesOptions = faiss::gpu::INDICES_64_BIT;
+    config.use_raft = false;
+
+    faiss::gpu::GpuIndexIVFFlat gpuIndex(
+            &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
+    gpuIndex.nprobe = opt.nprobe;
+
+    gpuIndex.train(opt.numTrain, trainVecs.data());
+    gpuIndex.add(opt.numAdd, addVecs.data());
+
+    std::vector<float> gpuVals(opt.numAdd * opt.dim);
+
+    gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data());
+
+    std::vector<float> cpuVals(opt.numAdd * opt.dim);
+
+    cpuIndex.reconstruct_n(0, cpuIndex.ntotal, cpuVals.data());
+
+    EXPECT_EQ(gpuVals, cpuVals);
+
+    config.indicesOptions = faiss::gpu::INDICES_32_BIT;
+
+    faiss::gpu::GpuIndexIVFFlat gpuIndex1(
+            &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
+    gpuIndex1.nprobe = opt.nprobe;
+
+    gpuIndex1.train(opt.numTrain, trainVecs.data());
+    gpuIndex1.add(opt.numAdd, addVecs.data());
+
+    gpuIndex1.reconstruct_n(0, gpuIndex1.ntotal, gpuVals.data());
+
+    EXPECT_EQ(gpuVals, cpuVals);
+
+    config.indicesOptions = faiss::gpu::INDICES_CPU;
+
+    faiss::gpu::GpuIndexIVFFlat gpuIndex2(
+            &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
+    gpuIndex2.nprobe = opt.nprobe;
+
+    gpuIndex2.train(opt.numTrain, trainVecs.data());
+    gpuIndex2.add(opt.numAdd, addVecs.data());
+
+    gpuIndex2.reconstruct_n(0, gpuIndex2.ntotal, gpuVals.data());
+
+    EXPECT_EQ(gpuVals, cpuVals);
+}
+
 int main(int argc, char** argv) {
     testing::InitGoogleTest(&argc, argv);

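The new test checks that reconstruct_n on GpuIndexIVFFlat returns the same vectors as the CPU index for the 64-bit, 32-bit, and CPU-side indices options. A small usage sketch, assuming an already trained and populated index; the wrapper function itself is illustrative.

#include <vector>
#include <faiss/gpu/GpuIndexIVFFlat.h>

// Copy every stored vector of a trained, populated GPU IVF-flat index back to
// host memory; reconstruct_n(i0, ni, out) writes ni * d floats starting at i0.
std::vector<float> reconstructAll(faiss::gpu::GpuIndexIVFFlat& index) {
    std::vector<float> vecs(index.ntotal * index.d);
    index.reconstruct_n(0, index.ntotal, vecs.data());
    return vecs;
}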
data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h

@@ -76,6 +76,12 @@ bool getTensorCoreSupport(int device);
 /// Equivalent to getTensorCoreSupport(getCurrentDevice())
 bool getTensorCoreSupportCurrentDevice();

+/// Returns the warp size of the given GPU device
+int getWarpSize(int device);
+
+/// Equivalent to getWarpSize(getCurrentDevice())
+int getWarpSizeCurrentDevice();
+
 /// Returns the amount of currently available memory on the given device
 size_t getFreeMemory(int device);

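These declarations expose the device warp size at runtime instead of assuming 32 lanes. A sketch of how the packing code above sizes its buffers with them; the wrapper function is illustrative, while the arithmetic mirrors the hunks shown earlier.

#include <cstddef>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/StaticUtils.h>

// Total bytes needed to hold numVecs interleaved codes of bitsPerCode bits
// over dims dimensions on the currently active device (32 lanes on NVIDIA
// hardware, possibly 64 elsewhere).
size_t interleavedBufferBytes(int numVecs, int dims, int bitsPerCode) {
    int warpSize = faiss::gpu::getWarpSizeCurrentDevice();
    int bytesPerDimBlock = warpSize * bitsPerCode / 8;
    int bytesPerBlock = bytesPerDimBlock * dims;
    int numBlocks = faiss::gpu::utils::divUp(numVecs, warpSize);
    return (size_t)bytesPerBlock * numBlocks;
}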
data/vendor/faiss/faiss/gpu/utils/Timer.cpp

@@ -14,7 +14,10 @@ namespace faiss {
 namespace gpu {

 KernelTimer::KernelTimer(cudaStream_t stream)
-        : startEvent_(
+        : startEvent_(nullptr),
+          stopEvent_(nullptr),
+          stream_(stream),
+          valid_(true) {
     CUDA_VERIFY(cudaEventCreate(&startEvent_));
     CUDA_VERIFY(cudaEventCreate(&stopEvent_));

data/vendor/faiss/faiss/gpu/utils/Timer.h

@@ -18,7 +18,7 @@ class KernelTimer {
    public:
     /// Constructor starts the timer and adds an event into the current
     /// device stream
-    KernelTimer(cudaStream_t stream =
+    KernelTimer(cudaStream_t stream = nullptr);

     /// Destructor releases event resources
     ~KernelTimer();
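The constructor change only swaps the default stream argument to nullptr and makes the member initialization explicit. A usage sketch; the elapsedMilliseconds() accessor is assumed from the rest of Timer.h, which this excerpt does not show.

#include <faiss/gpu/utils/Timer.h>

// Time a stretch of GPU work on the default stream. The constructor records a
// start event; elapsedMilliseconds() (assumed accessor, see above) records the
// stop event and returns the elapsed time.
float timeGpuRegion() {
    faiss::gpu::KernelTimer timer; // stream now defaults to nullptr
    // ... enqueue kernels on the default stream here ...
    return timer.elapsedMilliseconds();
}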
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp

@@ -236,4 +236,29 @@ size_t InterruptCallback::get_period_hint(size_t flops) {
     return std::max((size_t)10 * 10 * 1000 * 1000 / (flops + 1), (size_t)1);
 }

+void TimeoutCallback::set_timeout(double timeout_in_seconds) {
+    timeout = timeout_in_seconds;
+    start = std::chrono::steady_clock::now();
+}
+
+bool TimeoutCallback::want_interrupt() {
+    if (timeout == 0) {
+        return false;
+    }
+    auto end = std::chrono::steady_clock::now();
+    std::chrono::duration<float, std::milli> duration = end - start;
+    float elapsed_in_seconds = duration.count() / 1000.0;
+    if (elapsed_in_seconds > timeout) {
+        timeout = 0;
+        return true;
+    }
+    return false;
+}
+
+void TimeoutCallback::reset(double timeout_in_seconds) {
+    auto tc(new faiss::TimeoutCallback());
+    faiss::InterruptCallback::instance.reset(tc);
+    tc->set_timeout(timeout_in_seconds);
+}
+
 } // namespace faiss

data/vendor/faiss/faiss/impl/AuxIndexStructures.h

@@ -122,7 +122,7 @@ struct RangeSearchPartialResult : BufferList {
     void copy_result(bool incremental = false);

     /// merge a set of PartialResult's into one RangeSearchResult
-    /// on
+    /// on output the partialresults are empty!
     static void merge(
             std::vector<RangeSearchPartialResult*>& partial_results,
             bool do_delete = true);
@@ -161,6 +161,14 @@ struct FAISS_API InterruptCallback {
     static size_t get_period_hint(size_t flops);
 };

+struct TimeoutCallback : InterruptCallback {
+    std::chrono::time_point<std::chrono::steady_clock> start;
+    double timeout;
+    bool want_interrupt() override;
+    void set_timeout(double timeout_in_seconds);
+    static void reset(double timeout_in_seconds);
+};
+
 /// set implementation optimized for fast access.
 struct VisitedTable {
     std::vector<uint8_t> visited;
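TimeoutCallback hooks into the existing InterruptCallback mechanism (see PerfClustering.cpp above), so any loop that already polls the interrupt callback stops once the deadline passes. A minimal usage sketch; treating the interrupt as a faiss::FaissException is an assumption based on how InterruptCallback is reported elsewhere in the library, not something shown in this diff.

#include <faiss/Clustering.h>
#include <faiss/IndexFlat.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissException.h>

// Train k-means but give up after 60 seconds of wall-clock time.
void trainWithTimeout(int d, int k, size_t n, const float* x) {
    // Installs a process-wide TimeoutCallback as the InterruptCallback.
    faiss::TimeoutCallback::reset(60.0);

    faiss::IndexFlatL2 quantizer(d);
    faiss::Clustering clus(d, k);
    try {
        clus.train(n, x, quantizer);
    } catch (const faiss::FaissException&) {
        // Assumed: an interrupt surfaces as a FaissException once
        // want_interrupt() returns true.
    }
}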
data/vendor/faiss/faiss/impl/DistanceComputer.h

@@ -59,6 +59,52 @@ struct DistanceComputer {
     virtual ~DistanceComputer() {}
 };

+/* Wrap the distance computer into one that negates the
+   distances. This makes supporting INNER_PRODUCE search easier */
+
+struct NegativeDistanceComputer : DistanceComputer {
+    /// owned by this
+    DistanceComputer* basedis;
+
+    explicit NegativeDistanceComputer(DistanceComputer* basedis)
+            : basedis(basedis) {}
+
+    void set_query(const float* x) override {
+        basedis->set_query(x);
+    }
+
+    /// compute distance of vector i to current query
+    float operator()(idx_t i) override {
+        return -(*basedis)(i);
+    }
+
+    void distances_batch_4(
+            const idx_t idx0,
+            const idx_t idx1,
+            const idx_t idx2,
+            const idx_t idx3,
+            float& dis0,
+            float& dis1,
+            float& dis2,
+            float& dis3) override {
+        basedis->distances_batch_4(
+                idx0, idx1, idx2, idx3, dis0, dis1, dis2, dis3);
+        dis0 = -dis0;
+        dis1 = -dis1;
+        dis2 = -dis2;
+        dis3 = -dis3;
+    }
+
+    /// compute distance between two stored vectors
+    float symmetric_dis(idx_t i, idx_t j) override {
+        return -basedis->symmetric_dis(i, j);
+    }
+
+    virtual ~NegativeDistanceComputer() {
+        delete basedis;
+    }
+};
+
 /*************************************************************
  * Specialized version of the DistanceComputer when we know that codes are
  * laid out in a flat index.
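NegativeDistanceComputer lets code written to minimize distances serve similarity (inner-product) metrics by negating every value it reports. A short sketch of wrapping an index's computer this way; Index::get_distance_computer() returning a heap-allocated object that the wrapper then owns is an assumption drawn from the "owned by this" comment, not from this excerpt.

#include <faiss/IndexFlat.h>
#include <faiss/impl/DistanceComputer.h>

// Negated inner product of the query against stored vector i, so that a
// smaller value means a better match, as with L2 distances.
float negatedInnerProduct(
        faiss::IndexFlatIP& index,
        const float* query,
        faiss::idx_t i) {
    // The wrapper takes ownership and deletes the base computer on scope exit.
    faiss::NegativeDistanceComputer dc(index.get_distance_computer());
    dc.set_query(query);
    return dc(i);
}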
data/vendor/faiss/faiss/impl/FaissAssert.h

@@ -94,13 +94,15 @@
     } \
 } while (false)

-#define
+#define FAISS_THROW_IF_MSG(X, MSG) \
     do { \
-        if (
+        if (X) { \
             FAISS_THROW_FMT("Error: '%s' failed: " MSG, #X); \
         } \
     } while (false)

+#define FAISS_THROW_IF_NOT_MSG(X, MSG) FAISS_THROW_IF_MSG(!(X), MSG)
+
 #define FAISS_THROW_IF_NOT_FMT(X, FMT, ...) \
     do { \
         if (!(X)) { \
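The refactor introduces a positive-condition FAISS_THROW_IF_MSG and re-expresses FAISS_THROW_IF_NOT_MSG in terms of it. A usage sketch; the checking function is illustrative.

#include <faiss/impl/FaissAssert.h>

// Both macros throw a FaissException whose message embeds the stringified
// condition plus the custom text.
void checkArguments(int d, int k) {
    FAISS_THROW_IF_MSG(d <= 0, "dimension must be positive");
    FAISS_THROW_IF_NOT_MSG(k > 0, "number of clusters must be positive");
}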
|