faiss 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +292 -291
- data/vendor/faiss/faiss/AutoTune.h +55 -56
- data/vendor/faiss/faiss/Clustering.cpp +334 -195
- data/vendor/faiss/faiss/Clustering.h +88 -35
- data/vendor/faiss/faiss/IVFlib.cpp +171 -195
- data/vendor/faiss/faiss/IVFlib.h +48 -51
- data/vendor/faiss/faiss/Index.cpp +85 -103
- data/vendor/faiss/faiss/Index.h +54 -48
- data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
- data/vendor/faiss/faiss/Index2Layer.h +22 -22
- data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
- data/vendor/faiss/faiss/IndexBinary.h +140 -132
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
- data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
- data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
- data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
- data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
- data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
- data/vendor/faiss/faiss/IndexFlat.h +35 -46
- data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
- data/vendor/faiss/faiss/IndexHNSW.h +57 -41
- data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
- data/vendor/faiss/faiss/IndexIVF.h +146 -113
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
- data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
- data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
- data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
- data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
- data/vendor/faiss/faiss/IndexLSH.h +21 -26
- data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
- data/vendor/faiss/faiss/IndexLattice.h +11 -16
- data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
- data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
- data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
- data/vendor/faiss/faiss/IndexNSG.h +85 -0
- data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
- data/vendor/faiss/faiss/IndexPQ.h +64 -67
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
- data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
- data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
- data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
- data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
- data/vendor/faiss/faiss/IndexRefine.h +22 -23
- data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
- data/vendor/faiss/faiss/IndexReplicas.h +62 -56
- data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
- data/vendor/faiss/faiss/IndexResidual.h +152 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
- data/vendor/faiss/faiss/IndexShards.cpp +256 -240
- data/vendor/faiss/faiss/IndexShards.h +85 -73
- data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
- data/vendor/faiss/faiss/MatrixStats.h +7 -10
- data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
- data/vendor/faiss/faiss/MetaIndexes.h +40 -34
- data/vendor/faiss/faiss/MetricType.h +7 -7
- data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
- data/vendor/faiss/faiss/VectorTransform.h +61 -89
- data/vendor/faiss/faiss/clone_index.cpp +77 -73
- data/vendor/faiss/faiss/clone_index.h +4 -9
- data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
- data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
- data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
- data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
- data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
- data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
- data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
- data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
- data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
- data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
- data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
- data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
- data/vendor/faiss/faiss/impl/FaissException.h +41 -29
- data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
- data/vendor/faiss/faiss/impl/HNSW.h +179 -200
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
- data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
- data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
- data/vendor/faiss/faiss/impl/NSG.h +199 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
- data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
- data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
- data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
- data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
- data/vendor/faiss/faiss/impl/io.cpp +75 -94
- data/vendor/faiss/faiss/impl/io.h +31 -41
- data/vendor/faiss/faiss/impl/io_macros.h +40 -29
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
- data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
- data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
- data/vendor/faiss/faiss/index_factory.cpp +269 -218
- data/vendor/faiss/faiss/index_factory.h +6 -7
- data/vendor/faiss/faiss/index_io.h +23 -26
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
- data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
- data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
- data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
- data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
- data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
- data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
- data/vendor/faiss/faiss/utils/Heap.h +186 -209
- data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
- data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
- data/vendor/faiss/faiss/utils/distances.cpp +301 -310
- data/vendor/faiss/faiss/utils/distances.h +133 -118
- data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
- data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
- data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
- data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
- data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
- data/vendor/faiss/faiss/utils/hamming.h +62 -85
- data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
- data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
- data/vendor/faiss/faiss/utils/partitioning.h +26 -21
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
- data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
- data/vendor/faiss/faiss/utils/random.cpp +39 -63
- data/vendor/faiss/faiss/utils/random.h +13 -16
- data/vendor/faiss/faiss/utils/simdlib.h +4 -2
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
- data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
- data/vendor/faiss/faiss/utils/utils.cpp +304 -287
- data/vendor/faiss/faiss/utils/utils.h +53 -48
- metadata +20 -2
|
@@ -5,7 +5,6 @@
|
|
|
5
5
|
* LICENSE file in the root directory of this source tree.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
-
|
|
9
8
|
#pragma once
|
|
10
9
|
|
|
11
10
|
#include <cuda.h>
|
|
@@ -16,43 +15,45 @@
|
|
|
16
15
|
#define __device__
|
|
17
16
|
#endif
|
|
18
17
|
|
|
19
|
-
namespace faiss {
|
|
18
|
+
namespace faiss {
|
|
19
|
+
namespace gpu {
|
|
20
|
+
namespace utils {
|
|
20
21
|
|
|
21
22
|
template <typename U, typename V>
|
|
22
23
|
constexpr __host__ __device__ auto divDown(U a, V b) -> decltype(a + b) {
|
|
23
|
-
|
|
24
|
+
return (a / b);
|
|
24
25
|
}
|
|
25
26
|
|
|
26
27
|
template <typename U, typename V>
|
|
27
28
|
constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) {
|
|
28
|
-
|
|
29
|
+
return (a + b - 1) / b;
|
|
29
30
|
}
|
|
30
31
|
|
|
31
32
|
template <typename U, typename V>
|
|
32
33
|
constexpr __host__ __device__ auto roundDown(U a, V b) -> decltype(a + b) {
|
|
33
|
-
|
|
34
|
+
return divDown(a, b) * b;
|
|
34
35
|
}
|
|
35
36
|
|
|
36
37
|
template <typename U, typename V>
|
|
37
38
|
constexpr __host__ __device__ auto roundUp(U a, V b) -> decltype(a + b) {
|
|
38
|
-
|
|
39
|
+
return divUp(a, b) * b;
|
|
39
40
|
}
|
|
40
41
|
|
|
41
42
|
template <class T>
|
|
42
43
|
constexpr __host__ __device__ T pow(T n, T power) {
|
|
43
|
-
|
|
44
|
+
return (power > 0 ? n * pow(n, power - 1) : 1);
|
|
44
45
|
}
|
|
45
46
|
|
|
46
47
|
template <class T>
|
|
47
48
|
constexpr __host__ __device__ T pow2(T n) {
|
|
48
|
-
|
|
49
|
+
return pow(2, (T)n);
|
|
49
50
|
}
|
|
50
51
|
|
|
51
52
|
static_assert(pow2(8) == 256, "pow2");
|
|
52
53
|
|
|
53
54
|
template <typename T>
|
|
54
55
|
constexpr __host__ __device__ int log2(T n, int p = 0) {
|
|
55
|
-
|
|
56
|
+
return (n <= 1) ? p : log2(n / 2, p + 1);
|
|
56
57
|
}
|
|
57
58
|
|
|
58
59
|
static_assert(log2(2) == 1, "log2");
|
|
@@ -61,7 +62,7 @@ static_assert(log2(4) == 2, "log2");
|
|
|
61
62
|
|
|
62
63
|
template <typename T>
|
|
63
64
|
constexpr __host__ __device__ bool isPowerOf2(T v) {
|
|
64
|
-
|
|
65
|
+
return (v && !(v & (v - 1)));
|
|
65
66
|
}
|
|
66
67
|
|
|
67
68
|
static_assert(isPowerOf2(2048), "isPowerOf2");
|
|
@@ -69,7 +70,7 @@ static_assert(!isPowerOf2(3333), "isPowerOf2");
|
|
|
69
70
|
|
|
70
71
|
template <typename T>
|
|
71
72
|
constexpr __host__ __device__ T nextHighestPowerOf2(T v) {
|
|
72
|
-
|
|
73
|
+
return (isPowerOf2(v) ? (T)2 * v : ((T)1 << (log2(v) + 1)));
|
|
73
74
|
}
|
|
74
75
|
|
|
75
76
|
static_assert(nextHighestPowerOf2(1) == 2, "nextHighestPowerOf2");
|
|
@@ -81,9 +82,13 @@ static_assert(nextHighestPowerOf2(15) == 16, "nextHighestPowerOf2");
|
|
|
81
82
|
static_assert(nextHighestPowerOf2(16) == 32, "nextHighestPowerOf2");
|
|
82
83
|
static_assert(nextHighestPowerOf2(17) == 32, "nextHighestPowerOf2");
|
|
83
84
|
|
|
84
|
-
static_assert(
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
85
|
+
static_assert(
|
|
86
|
+
nextHighestPowerOf2(1536000000u) == 2147483648u,
|
|
87
|
+
"nextHighestPowerOf2");
|
|
88
|
+
static_assert(
|
|
89
|
+
nextHighestPowerOf2((size_t)2147483648ULL) == (size_t)4294967296ULL,
|
|
90
|
+
"nextHighestPowerOf2");
|
|
88
91
|
|
|
89
|
-
}
|
|
92
|
+
} // namespace utils
|
|
93
|
+
} // namespace gpu
|
|
94
|
+
} // namespace faiss
|
|
@@ -5,55 +5,51 @@
|
|
|
5
5
|
* LICENSE file in the root directory of this source tree.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
#include <faiss/gpu/utils/Timer.h>
|
|
10
8
|
#include <faiss/gpu/utils/DeviceUtils.h>
|
|
9
|
+
#include <faiss/gpu/utils/Timer.h>
|
|
11
10
|
#include <faiss/impl/FaissAssert.h>
|
|
12
11
|
#include <chrono>
|
|
13
12
|
|
|
14
|
-
namespace faiss {
|
|
13
|
+
namespace faiss {
|
|
14
|
+
namespace gpu {
|
|
15
15
|
|
|
16
16
|
KernelTimer::KernelTimer(cudaStream_t stream)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
CUDA_VERIFY(cudaEventCreate(&stopEvent_));
|
|
23
|
-
|
|
24
|
-
CUDA_VERIFY(cudaEventRecord(startEvent_, stream_));
|
|
17
|
+
: startEvent_(0), stopEvent_(0), stream_(stream), valid_(true) {
|
|
18
|
+
CUDA_VERIFY(cudaEventCreate(&startEvent_));
|
|
19
|
+
CUDA_VERIFY(cudaEventCreate(&stopEvent_));
|
|
20
|
+
|
|
21
|
+
CUDA_VERIFY(cudaEventRecord(startEvent_, stream_));
|
|
25
22
|
}
|
|
26
23
|
|
|
27
24
|
KernelTimer::~KernelTimer() {
|
|
28
|
-
|
|
29
|
-
|
|
25
|
+
CUDA_VERIFY(cudaEventDestroy(startEvent_));
|
|
26
|
+
CUDA_VERIFY(cudaEventDestroy(stopEvent_));
|
|
30
27
|
}
|
|
31
28
|
|
|
32
|
-
float
|
|
33
|
-
|
|
34
|
-
FAISS_ASSERT(valid_);
|
|
29
|
+
float KernelTimer::elapsedMilliseconds() {
|
|
30
|
+
FAISS_ASSERT(valid_);
|
|
35
31
|
|
|
36
|
-
|
|
37
|
-
|
|
32
|
+
CUDA_VERIFY(cudaEventRecord(stopEvent_, stream_));
|
|
33
|
+
CUDA_VERIFY(cudaEventSynchronize(stopEvent_));
|
|
38
34
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
35
|
+
auto time = 0.0f;
|
|
36
|
+
CUDA_VERIFY(cudaEventElapsedTime(&time, startEvent_, stopEvent_));
|
|
37
|
+
valid_ = false;
|
|
42
38
|
|
|
43
|
-
|
|
39
|
+
return time;
|
|
44
40
|
}
|
|
45
41
|
|
|
46
42
|
CpuTimer::CpuTimer() {
|
|
47
|
-
|
|
43
|
+
start_ = std::chrono::steady_clock::now();
|
|
48
44
|
}
|
|
49
45
|
|
|
50
|
-
float
|
|
51
|
-
|
|
52
|
-
auto end = std::chrono::steady_clock::now();
|
|
46
|
+
float CpuTimer::elapsedMilliseconds() {
|
|
47
|
+
auto end = std::chrono::steady_clock::now();
|
|
53
48
|
|
|
54
|
-
|
|
49
|
+
std::chrono::duration<float, std::milli> duration = end - start_;
|
|
55
50
|
|
|
56
|
-
|
|
51
|
+
return duration.count();
|
|
57
52
|
}
|
|
58
53
|
|
|
59
|
-
}
|
|
54
|
+
} // namespace gpu
|
|
55
|
+
} // namespace faiss
|
|
@@ -5,48 +5,49 @@
|
|
|
5
5
|
* LICENSE file in the root directory of this source tree.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
-
|
|
9
8
|
#pragma once
|
|
10
9
|
|
|
11
10
|
#include <cuda_runtime.h>
|
|
12
11
|
#include <chrono>
|
|
13
12
|
|
|
14
|
-
namespace faiss {
|
|
13
|
+
namespace faiss {
|
|
14
|
+
namespace gpu {
|
|
15
15
|
|
|
16
16
|
/// Utility class for timing execution of a kernel
|
|
17
17
|
class KernelTimer {
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
18
|
+
public:
|
|
19
|
+
/// Constructor starts the timer and adds an event into the current
|
|
20
|
+
/// device stream
|
|
21
|
+
KernelTimer(cudaStream_t stream = 0);
|
|
22
|
+
|
|
23
|
+
/// Destructor releases event resources
|
|
24
|
+
~KernelTimer();
|
|
25
|
+
|
|
26
|
+
/// Adds a stop event then synchronizes on the stop event to get the
|
|
27
|
+
/// actual GPU-side kernel timings for any kernels launched in the
|
|
28
|
+
/// current stream. Returns the number of milliseconds elapsed.
|
|
29
|
+
/// Can only be called once.
|
|
30
|
+
float elapsedMilliseconds();
|
|
31
|
+
|
|
32
|
+
private:
|
|
33
|
+
cudaEvent_t startEvent_;
|
|
34
|
+
cudaEvent_t stopEvent_;
|
|
35
|
+
cudaStream_t stream_;
|
|
36
|
+
bool valid_;
|
|
37
37
|
};
|
|
38
38
|
|
|
39
39
|
/// CPU wallclock elapsed timer
|
|
40
40
|
class CpuTimer {
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
41
|
+
public:
|
|
42
|
+
/// Creates and starts a new timer
|
|
43
|
+
CpuTimer();
|
|
44
44
|
|
|
45
|
-
|
|
46
|
-
|
|
45
|
+
/// Returns elapsed time in milliseconds
|
|
46
|
+
float elapsedMilliseconds();
|
|
47
47
|
|
|
48
|
-
|
|
49
|
-
|
|
48
|
+
private:
|
|
49
|
+
std::chrono::time_point<std::chrono::steady_clock> start_;
|
|
50
50
|
};
|
|
51
51
|
|
|
52
|
-
}
|
|
52
|
+
} // namespace gpu
|
|
53
|
+
} // namespace faiss
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// -*- c++ -*-
|
|
9
|
+
|
|
10
|
+
#include <faiss/impl/AdditiveQuantizer.h>
|
|
11
|
+
#include <faiss/impl/FaissAssert.h>
|
|
12
|
+
|
|
13
|
+
#include <cstddef>
|
|
14
|
+
#include <cstdio>
|
|
15
|
+
#include <cstring>
|
|
16
|
+
#include <memory>
|
|
17
|
+
#include <random>
|
|
18
|
+
|
|
19
|
+
#include <algorithm>
|
|
20
|
+
|
|
21
|
+
#include <faiss/utils/Heap.h>
|
|
22
|
+
#include <faiss/utils/distances.h>
|
|
23
|
+
#include <faiss/utils/hamming.h> // BitstringWriter
|
|
24
|
+
#include <faiss/utils/utils.h>
|
|
25
|
+
|
|
26
|
+
extern "C" {
|
|
27
|
+
|
|
28
|
+
// general matrix multiplication
|
|
29
|
+
int sgemm_(
|
|
30
|
+
const char* transa,
|
|
31
|
+
const char* transb,
|
|
32
|
+
FINTEGER* m,
|
|
33
|
+
FINTEGER* n,
|
|
34
|
+
FINTEGER* k,
|
|
35
|
+
const float* alpha,
|
|
36
|
+
const float* a,
|
|
37
|
+
FINTEGER* lda,
|
|
38
|
+
const float* b,
|
|
39
|
+
FINTEGER* ldb,
|
|
40
|
+
float* beta,
|
|
41
|
+
float* c,
|
|
42
|
+
FINTEGER* ldc);
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
namespace {
|
|
46
|
+
|
|
47
|
+
// c and a and b can overlap
|
|
48
|
+
void fvec_add(size_t d, const float* a, const float* b, float* c) {
|
|
49
|
+
for (size_t i = 0; i < d; i++) {
|
|
50
|
+
c[i] = a[i] + b[i];
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
void fvec_add(size_t d, const float* a, float b, float* c) {
|
|
55
|
+
for (size_t i = 0; i < d; i++) {
|
|
56
|
+
c[i] = a[i] + b;
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
} // namespace
|
|
61
|
+
|
|
62
|
+
namespace faiss {
|
|
63
|
+
|
|
64
|
+
void AdditiveQuantizer::set_derived_values() {
|
|
65
|
+
tot_bits = 0;
|
|
66
|
+
is_byte_aligned = true;
|
|
67
|
+
codebook_offsets.resize(M + 1, 0);
|
|
68
|
+
for (int i = 0; i < M; i++) {
|
|
69
|
+
int nbit = nbits[i];
|
|
70
|
+
size_t k = 1 << nbit;
|
|
71
|
+
codebook_offsets[i + 1] = codebook_offsets[i] + k;
|
|
72
|
+
tot_bits += nbit;
|
|
73
|
+
if (nbit % 8 != 0) {
|
|
74
|
+
is_byte_aligned = false;
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
total_codebook_size = codebook_offsets[M];
|
|
78
|
+
// convert bits to bytes
|
|
79
|
+
code_size = (tot_bits + 7) / 8;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
void AdditiveQuantizer::pack_codes(
|
|
83
|
+
size_t n,
|
|
84
|
+
const int32_t* codes,
|
|
85
|
+
uint8_t* packed_codes,
|
|
86
|
+
int64_t ld_codes) const {
|
|
87
|
+
if (ld_codes == -1) {
|
|
88
|
+
ld_codes = M;
|
|
89
|
+
}
|
|
90
|
+
#pragma omp parallel for if (n > 1000)
|
|
91
|
+
for (int64_t i = 0; i < n; i++) {
|
|
92
|
+
const int32_t* codes1 = codes + i * ld_codes;
|
|
93
|
+
BitstringWriter bsw(packed_codes + i * code_size, code_size);
|
|
94
|
+
for (int m = 0; m < M; m++) {
|
|
95
|
+
bsw.write(codes1[m], nbits[m]);
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
void AdditiveQuantizer::decode(const uint8_t* code, float* x, size_t n) const {
|
|
101
|
+
FAISS_THROW_IF_NOT_MSG(
|
|
102
|
+
is_trained, "The additive quantizer is not trained yet.");
|
|
103
|
+
|
|
104
|
+
// standard additive quantizer decoding
|
|
105
|
+
#pragma omp parallel for if (n > 1000)
|
|
106
|
+
for (int64_t i = 0; i < n; i++) {
|
|
107
|
+
BitstringReader bsr(code + i * code_size, code_size);
|
|
108
|
+
float* xi = x + i * d;
|
|
109
|
+
for (int m = 0; m < M; m++) {
|
|
110
|
+
int idx = bsr.read(nbits[m]);
|
|
111
|
+
const float* c = codebooks.data() + d * (codebook_offsets[m] + idx);
|
|
112
|
+
if (m == 0) {
|
|
113
|
+
memcpy(xi, c, sizeof(*x) * d);
|
|
114
|
+
} else {
|
|
115
|
+
fvec_add(d, xi, c, xi);
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
AdditiveQuantizer::~AdditiveQuantizer() {}
|
|
122
|
+
|
|
123
|
+
/****************************************************************************
|
|
124
|
+
* Support for fast distance computations and search with additive quantizer
|
|
125
|
+
****************************************************************************/
|
|
126
|
+
|
|
127
|
+
void AdditiveQuantizer::compute_centroid_norms(float* norms) const {
|
|
128
|
+
size_t ntotal = (size_t)1 << tot_bits;
|
|
129
|
+
// TODO: make tree of partial sums
|
|
130
|
+
#pragma omp parallel
|
|
131
|
+
{
|
|
132
|
+
std::vector<float> tmp(d);
|
|
133
|
+
#pragma omp for
|
|
134
|
+
for (int64_t i = 0; i < ntotal; i++) {
|
|
135
|
+
decode_64bit(i, tmp.data());
|
|
136
|
+
norms[i] = fvec_norm_L2sqr(tmp.data(), d);
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
void AdditiveQuantizer::decode_64bit(idx_t bits, float* xi) const {
|
|
142
|
+
for (int m = 0; m < M; m++) {
|
|
143
|
+
idx_t idx = bits & (((size_t)1 << nbits[m]) - 1);
|
|
144
|
+
bits >>= nbits[m];
|
|
145
|
+
const float* c = codebooks.data() + d * (codebook_offsets[m] + idx);
|
|
146
|
+
if (m == 0) {
|
|
147
|
+
memcpy(xi, c, sizeof(*xi) * d);
|
|
148
|
+
} else {
|
|
149
|
+
fvec_add(d, xi, c, xi);
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
void AdditiveQuantizer::compute_LUT(size_t n, const float* xq, float* LUT)
|
|
155
|
+
const {
|
|
156
|
+
// in all cases, it is large matrix multiplication
|
|
157
|
+
|
|
158
|
+
FINTEGER ncenti = total_codebook_size;
|
|
159
|
+
FINTEGER di = d;
|
|
160
|
+
FINTEGER nqi = n;
|
|
161
|
+
float one = 1, zero = 0;
|
|
162
|
+
|
|
163
|
+
sgemm_("Transposed",
|
|
164
|
+
"Not transposed",
|
|
165
|
+
&ncenti,
|
|
166
|
+
&nqi,
|
|
167
|
+
&di,
|
|
168
|
+
&one,
|
|
169
|
+
codebooks.data(),
|
|
170
|
+
&di,
|
|
171
|
+
xq,
|
|
172
|
+
&di,
|
|
173
|
+
&zero,
|
|
174
|
+
LUT,
|
|
175
|
+
&ncenti);
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
namespace {
|
|
179
|
+
|
|
180
|
+
void compute_inner_prod_with_LUT(
|
|
181
|
+
const AdditiveQuantizer& aq,
|
|
182
|
+
const float* LUT,
|
|
183
|
+
float* ips) {
|
|
184
|
+
size_t prev_size = 1;
|
|
185
|
+
for (int m = 0; m < aq.M; m++) {
|
|
186
|
+
const float* LUTm = LUT + aq.codebook_offsets[m];
|
|
187
|
+
int nb = aq.nbits[m];
|
|
188
|
+
size_t nc = (size_t)1 << nb;
|
|
189
|
+
|
|
190
|
+
if (m == 0) {
|
|
191
|
+
memcpy(ips, LUT, sizeof(*ips) * nc);
|
|
192
|
+
} else {
|
|
193
|
+
for (int64_t i = nc - 1; i >= 0; i--) {
|
|
194
|
+
float v = LUTm[i];
|
|
195
|
+
fvec_add(prev_size, ips, v, ips + i * prev_size);
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
prev_size *= nc;
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
} // anonymous namespace
|
|
203
|
+
|
|
204
|
+
void AdditiveQuantizer::knn_exact_inner_product(
|
|
205
|
+
idx_t n,
|
|
206
|
+
const float* xq,
|
|
207
|
+
idx_t k,
|
|
208
|
+
float* distances,
|
|
209
|
+
idx_t* labels) const {
|
|
210
|
+
std::unique_ptr<float[]> LUT(new float[n * total_codebook_size]);
|
|
211
|
+
compute_LUT(n, xq, LUT.get());
|
|
212
|
+
size_t ntotal = (size_t)1 << tot_bits;
|
|
213
|
+
|
|
214
|
+
#pragma omp parallel if (n > 100)
|
|
215
|
+
{
|
|
216
|
+
std::vector<float> dis(ntotal);
|
|
217
|
+
#pragma omp for
|
|
218
|
+
for (idx_t i = 0; i < n; i++) {
|
|
219
|
+
const float* LUTi = LUT.get() + i * total_codebook_size;
|
|
220
|
+
compute_inner_prod_with_LUT(*this, LUTi, dis.data());
|
|
221
|
+
float* distances_i = distances + i * k;
|
|
222
|
+
idx_t* labels_i = labels + i * k;
|
|
223
|
+
minheap_heapify(k, distances_i, labels_i);
|
|
224
|
+
minheap_addn(k, distances_i, labels_i, dis.data(), nullptr, ntotal);
|
|
225
|
+
minheap_reorder(k, distances_i, labels_i);
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
void AdditiveQuantizer::knn_exact_L2(
|
|
231
|
+
idx_t n,
|
|
232
|
+
const float* xq,
|
|
233
|
+
idx_t k,
|
|
234
|
+
float* distances,
|
|
235
|
+
idx_t* labels,
|
|
236
|
+
const float* norms) const {
|
|
237
|
+
std::unique_ptr<float[]> LUT(new float[n * total_codebook_size]);
|
|
238
|
+
compute_LUT(n, xq, LUT.get());
|
|
239
|
+
std::unique_ptr<float[]> q_norms(new float[n]);
|
|
240
|
+
fvec_norms_L2sqr(q_norms.get(), xq, d, n);
|
|
241
|
+
size_t ntotal = (size_t)1 << tot_bits;
|
|
242
|
+
|
|
243
|
+
#pragma omp parallel if (n > 100)
|
|
244
|
+
{
|
|
245
|
+
std::vector<float> dis(ntotal);
|
|
246
|
+
#pragma omp for
|
|
247
|
+
for (idx_t i = 0; i < n; i++) {
|
|
248
|
+
const float* LUTi = LUT.get() + i * total_codebook_size;
|
|
249
|
+
float* distances_i = distances + i * k;
|
|
250
|
+
idx_t* labels_i = labels + i * k;
|
|
251
|
+
|
|
252
|
+
compute_inner_prod_with_LUT(*this, LUTi, dis.data());
|
|
253
|
+
|
|
254
|
+
// update distances using
|
|
255
|
+
// ||x - y||^2 = ||x||^2 + ||y||^2 - 2 * <x,y>
|
|
256
|
+
|
|
257
|
+
maxheap_heapify(k, distances_i, labels_i);
|
|
258
|
+
for (idx_t j = 0; j < ntotal; j++) {
|
|
259
|
+
float disj = q_norms[i] + norms[j] - 2 * dis[j];
|
|
260
|
+
if (disj < distances_i[0]) {
|
|
261
|
+
heap_replace_top<CMax<float, int64_t>>(
|
|
262
|
+
k, distances_i, labels_i, disj, j);
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
maxheap_reorder(k, distances_i, labels_i);
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
} // namespace faiss
|