faiss 0.1.5 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/README.md +12 -0
- data/ext/faiss/ext.cpp +1 -1
- data/ext/faiss/extconf.rb +6 -2
- data/ext/faiss/index.cpp +114 -43
- data/ext/faiss/index_binary.cpp +24 -30
- data/ext/faiss/kmeans.cpp +20 -16
- data/ext/faiss/numo.hpp +867 -0
- data/ext/faiss/pca_matrix.cpp +13 -14
- data/ext/faiss/product_quantizer.cpp +23 -24
- data/ext/faiss/utils.cpp +10 -37
- data/ext/faiss/utils.h +2 -13
- data/lib/faiss.rb +0 -5
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +292 -291
- data/vendor/faiss/faiss/AutoTune.h +55 -56
- data/vendor/faiss/faiss/Clustering.cpp +334 -195
- data/vendor/faiss/faiss/Clustering.h +88 -35
- data/vendor/faiss/faiss/IVFlib.cpp +171 -195
- data/vendor/faiss/faiss/IVFlib.h +48 -51
- data/vendor/faiss/faiss/Index.cpp +85 -103
- data/vendor/faiss/faiss/Index.h +54 -48
- data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
- data/vendor/faiss/faiss/Index2Layer.h +22 -22
- data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
- data/vendor/faiss/faiss/IndexBinary.h +140 -132
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
- data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
- data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
- data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
- data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
- data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
- data/vendor/faiss/faiss/IndexFlat.h +35 -46
- data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
- data/vendor/faiss/faiss/IndexHNSW.h +57 -41
- data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
- data/vendor/faiss/faiss/IndexIVF.h +146 -113
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
- data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
- data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
- data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
- data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
- data/vendor/faiss/faiss/IndexLSH.h +21 -26
- data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
- data/vendor/faiss/faiss/IndexLattice.h +11 -16
- data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
- data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
- data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
- data/vendor/faiss/faiss/IndexNSG.h +85 -0
- data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
- data/vendor/faiss/faiss/IndexPQ.h +64 -67
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
- data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
- data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
- data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
- data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
- data/vendor/faiss/faiss/IndexRefine.h +22 -23
- data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
- data/vendor/faiss/faiss/IndexReplicas.h +62 -56
- data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
- data/vendor/faiss/faiss/IndexResidual.h +152 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
- data/vendor/faiss/faiss/IndexShards.cpp +256 -240
- data/vendor/faiss/faiss/IndexShards.h +85 -73
- data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
- data/vendor/faiss/faiss/MatrixStats.h +7 -10
- data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
- data/vendor/faiss/faiss/MetaIndexes.h +40 -34
- data/vendor/faiss/faiss/MetricType.h +7 -7
- data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
- data/vendor/faiss/faiss/VectorTransform.h +61 -89
- data/vendor/faiss/faiss/clone_index.cpp +77 -73
- data/vendor/faiss/faiss/clone_index.h +4 -9
- data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
- data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
- data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
- data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
- data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
- data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
- data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
- data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
- data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
- data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
- data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
- data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
- data/vendor/faiss/faiss/impl/FaissException.h +41 -29
- data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
- data/vendor/faiss/faiss/impl/HNSW.h +179 -200
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
- data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
- data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
- data/vendor/faiss/faiss/impl/NSG.h +199 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
- data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
- data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
- data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
- data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
- data/vendor/faiss/faiss/impl/io.cpp +75 -94
- data/vendor/faiss/faiss/impl/io.h +31 -41
- data/vendor/faiss/faiss/impl/io_macros.h +40 -29
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
- data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
- data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
- data/vendor/faiss/faiss/index_factory.cpp +269 -218
- data/vendor/faiss/faiss/index_factory.h +6 -7
- data/vendor/faiss/faiss/index_io.h +23 -26
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
- data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
- data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
- data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
- data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
- data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
- data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
- data/vendor/faiss/faiss/utils/Heap.h +186 -209
- data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
- data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
- data/vendor/faiss/faiss/utils/distances.cpp +301 -310
- data/vendor/faiss/faiss/utils/distances.h +133 -118
- data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
- data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
- data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
- data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
- data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
- data/vendor/faiss/faiss/utils/hamming.h +62 -85
- data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
- data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
- data/vendor/faiss/faiss/utils/partitioning.h +26 -21
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
- data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
- data/vendor/faiss/faiss/utils/random.cpp +39 -63
- data/vendor/faiss/faiss/utils/random.h +13 -16
- data/vendor/faiss/faiss/utils/simdlib.h +4 -2
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
- data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
- data/vendor/faiss/faiss/utils/utils.cpp +304 -287
- data/vendor/faiss/faiss/utils/utils.h +53 -48
- metadata +24 -10
- data/lib/faiss/index.rb +0 -20
- data/lib/faiss/index_binary.rb +0 -20
- data/lib/faiss/kmeans.rb +0 -15
- data/lib/faiss/pca_matrix.rb +0 -15
- data/lib/faiss/product_quantizer.rb +0 -22
--- data/vendor/faiss/faiss/gpu/GpuResources.h
+++ data/vendor/faiss/faiss/gpu/GpuResources.h
@@ -5,55 +5,59 @@
  * LICENSE file in the root directory of this source tree.
  */

 #pragma once

-#include <faiss/impl/FaissAssert.h>
-#include <cuda_runtime.h>
 #include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <faiss/impl/FaissAssert.h>
 #include <memory>
 #include <utility>
 #include <vector>

-namespace faiss {
+namespace faiss {
+namespace gpu {

 class GpuResources;

 enum AllocType {
+    /// Unknown allocation type or miscellaneous (not currently categorized)
+    Other = 0,
+
+    /// Primary data storage for GpuIndexFlat (the raw matrix of vectors and
+    /// vector norms if needed)
+    FlatData = 1,
+
+    /// Primary data storage for GpuIndexIVF* (the storage for each individual
+    /// IVF
+    /// list)
+    IVFLists = 2,
+
+    /// Quantizer (PQ, SQ) dictionary information
+    Quantizer = 3,
+
+    /// For GpuIndexIVFPQ, "precomputed codes" for more efficient PQ lookup
+    /// require the use of possibly large tables. These are marked separately
+    /// from
+    /// Quantizer as these can frequently be 100s - 1000s of MiB in size
+    QuantizerPrecomputedCodes = 4,
+
+    ///
+    /// StandardGpuResources implementation specific types
+    ///
+
+    /// When using StandardGpuResources, temporary memory allocations
+    /// (MemorySpace::Temporary) come out of a stack region of memory that is
+    /// allocated up front for each gpu (e.g., 1.5 GiB upon initialization).
+    /// This
+    /// allocation by StandardGpuResources is marked with this AllocType.
+    TemporaryMemoryBuffer = 10,
+
+    /// When using StandardGpuResources, any MemorySpace::Temporary allocations
+    /// that cannot be satisfied within the TemporaryMemoryBuffer region fall
+    /// back
+    /// to calling cudaMalloc which are sized to just the request at hand. These
+    /// "overflow" temporary allocations are marked with this AllocType.
+    TemporaryMemoryOverflow = 11,
 };

 /// Convert an AllocType to string
@@ -61,16 +65,17 @@ std::string allocTypeToString(AllocType t);

 /// Memory regions accessible to the GPU
 enum MemorySpace {
+    /// Temporary device memory (guaranteed to no longer be used upon exit of a
+    /// top-level index call, and where the streams using it have completed GPU
+    /// work). Typically backed by Device memory (cudaMalloc/cudaFree).
+    Temporary = 0,

+    /// Managed using cudaMalloc/cudaFree (typical GPU device memory)
+    Device = 1,

+    /// Managed using cudaMallocManaged/cudaFree (typical Unified CPU/GPU
+    /// memory)
+    Unified = 2,
 };

 /// Convert a MemorySpace to string
@@ -78,44 +83,36 @@ std::string memorySpaceToString(MemorySpace s);

 /// Information on what/where an allocation is
 struct AllocInfo {
+    inline AllocInfo()
+            : type(AllocType::Other),
+              device(0),
+              space(MemorySpace::Device),
+              stream(nullptr) {}
+
+    inline AllocInfo(AllocType at, int dev, MemorySpace sp, cudaStream_t st)
+            : type(at), device(dev), space(sp), stream(st) {}
+
+    /// Returns a string representation of this info
+    std::string toString() const;
+
+    /// The internal category of the allocation
+    AllocType type;
+
+    /// The device on which the allocation is happening
+    int device;
+
+    /// The memory space of the allocation
+    MemorySpace space;
+
+    /// The stream on which new work on the memory will be ordered (e.g., if a
+    /// piece of memory cached and to be returned for this call was last used on
+    /// stream 3 and a new memory request is for stream 4, the memory manager
+    /// will synchronize stream 4 to wait for the completion of stream 3 via
+    /// events or other stream synchronization.
+    ///
+    /// The memory manager guarantees that the returned memory is free to use
+    /// without data races on this stream specified.
+    cudaStream_t stream;
 };

 /// Create an AllocInfo for the current device with MemorySpace::Device
@@ -129,140 +126,139 @@ AllocInfo makeSpaceAlloc(AllocType at, MemorySpace sp, cudaStream_t st);

 /// Information on what/where an allocation is, along with how big it should be
 struct AllocRequest : public AllocInfo {
+    inline AllocRequest() : AllocInfo(), size(0) {}
+
+    inline AllocRequest(const AllocInfo& info, size_t sz)
+            : AllocInfo(info), size(sz) {}
+
+    inline AllocRequest(
+            AllocType at,
+            int dev,
+            MemorySpace sp,
+            cudaStream_t st,
+            size_t sz)
+            : AllocInfo(at, dev, sp, st), size(sz) {}
+
+    /// Returns a string representation of this request
+    std::string toString() const;
+
+    /// The size in bytes of the allocation
+    size_t size;
 };

 /// A RAII object that manages a temporary memory request
 struct GpuMemoryReservation {
+    GpuMemoryReservation();
+    GpuMemoryReservation(
+            GpuResources* r,
+            int dev,
+            cudaStream_t str,
+            void* p,
+            size_t sz);
+    GpuMemoryReservation(GpuMemoryReservation&& m) noexcept;
+    ~GpuMemoryReservation();
+
+    GpuMemoryReservation& operator=(GpuMemoryReservation&& m);
+
+    inline void* get() {
+        return data;
+    }
+
+    void release();
+
+    GpuResources* res;
+    int device;
+    cudaStream_t stream;
+    void* data;
+    size_t size;
 };

 /// Base class of GPU-side resource provider; hides provision of
 /// cuBLAS handles, CUDA streams and all device memory allocation performed
 class GpuResources {
+   public:
+    virtual ~GpuResources();

+    /// Call to pre-allocate resources for a particular device. If this is
+    /// not called, then resources will be allocated at the first time
+    /// of demand
+    virtual void initializeForDevice(int device) = 0;

+    /// Returns the cuBLAS handle that we use for the given device
+    virtual cublasHandle_t getBlasHandle(int device) = 0;

+    /// Returns the stream that we order all computation on for the
+    /// given device
+    virtual cudaStream_t getDefaultStream(int device) = 0;

+    /// Overrides the default stream for a device to the user-supplied stream.
+    /// The resources object does not own this stream (i.e., it will not destroy
+    /// it).
+    virtual void setDefaultStream(int device, cudaStream_t stream) = 0;

+    /// Returns the set of alternative streams that we use for the given device
+    virtual std::vector<cudaStream_t> getAlternateStreams(int device) = 0;

+    /// Memory management
+    /// Returns an allocation from the given memory space, ordered with respect
+    /// to the given stream (i.e., the first user will be a kernel in this
+    /// stream). All allocations are sized internally to be the next highest
+    /// multiple of 16 bytes, and all allocations returned are guaranteed to be
+    /// 16 byte aligned.
+    virtual void* allocMemory(const AllocRequest& req) = 0;

+    /// Returns a previous allocation
+    virtual void deallocMemory(int device, void* in) = 0;

+    /// For MemorySpace::Temporary, how much space is immediately available
+    /// without cudaMalloc allocation?
+    virtual size_t getTempMemoryAvailable(int device) const = 0;

+    /// Returns the available CPU pinned memory buffer
+    virtual std::pair<void*, size_t> getPinnedMemory() = 0;

+    /// Returns the stream on which we perform async CPU <-> GPU copies
+    virtual cudaStream_t getAsyncCopyStream(int device) = 0;

+    ///
+    /// Functions provided by default
+    ///

+    /// Calls getBlasHandle with the current device
+    cublasHandle_t getBlasHandleCurrentDevice();

+    /// Calls getDefaultStream with the current device
+    cudaStream_t getDefaultStreamCurrentDevice();

+    /// Calls getTempMemoryAvailable with the current device
+    size_t getTempMemoryAvailableCurrentDevice() const;

+    /// Returns a temporary memory allocation via a RAII object
+    GpuMemoryReservation allocMemoryHandle(const AllocRequest& req);

+    /// Synchronizes the CPU with respect to the default stream for the
+    /// given device
+    // equivalent to cudaDeviceSynchronize(getDefaultStream(device))
+    void syncDefaultStream(int device);

+    /// Calls syncDefaultStream for the current device
+    void syncDefaultStreamCurrentDevice();

+    /// Calls getAlternateStreams for the current device
+    std::vector<cudaStream_t> getAlternateStreamsCurrentDevice();

+    /// Calls getAsyncCopyStream for the current device
+    cudaStream_t getAsyncCopyStreamCurrentDevice();
 };

 /// Interface for a provider of a shared resources object
 class GpuResourcesProvider {
+   public:
+    virtual ~GpuResourcesProvider();

+    /// Returns the shared resources object
+    virtual std::shared_ptr<GpuResources> getResources() = 0;
 };

-}
+} // namespace gpu
+} // namespace faiss
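For orientation, the block below is a minimal usage sketch (not part of the packaged sources) of the AllocRequest / GpuMemoryReservation API that this version introduces in GpuResources.h; the resource pointer, device id, and sizes are illustrative assumptions.

// Sketch only: exercising the allocation API declared above.
// `res` is assumed to be a valid GpuResources* (e.g. obtained from
// StandardGpuResources::getResources()).
#include <faiss/gpu/GpuResources.h>

void tempAllocSketch(faiss::gpu::GpuResources* res, int device) {
    cudaStream_t stream = res->getDefaultStream(device);

    // Describe a 1 MiB temporary allocation ordered on the default stream.
    faiss::gpu::AllocRequest req(
            faiss::gpu::AllocType::Other,
            device,
            faiss::gpu::MemorySpace::Temporary,
            stream,
            (size_t)1024 * 1024);

    // RAII reservation: the memory is returned when `mem` goes out of scope.
    faiss::gpu::GpuMemoryReservation mem = res->allocMemoryHandle(req);
    void* p = mem.get();
    (void)p; // hand p to kernels launched on `stream`
}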
--- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp
+++ data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp
@@ -5,16 +5,16 @@
  * LICENSE file in the root directory of this source tree.
  */

 #include <faiss/gpu/StandardGpuResources.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
 #include <faiss/gpu/utils/StaticUtils.h>
 #include <faiss/impl/FaissAssert.h>
-#include <limits>
 #include <iostream>
+#include <limits>
 #include <sstream>

-namespace faiss {
+namespace faiss {
+namespace gpu {

 namespace {

@@ -22,513 +22,536 @@ namespace {
 constexpr int kNumStreams = 2;

 // Use 256 MiB of pinned memory for async CPU <-> GPU copies by default
+constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024;

 // Default temporary memory allocation for <= 4 GiB memory GPUs
+constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024;

 // Default temporary memory allocation for <= 8 GiB memory GPUs
+constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024;

 // Maximum temporary memory allocation for all GPUs
+constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024;

 std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
+    // Produce a sorted list of all outstanding allocations by type
+    std::unordered_map<AllocType, std::pair<int, size_t>> stats;
+
+    for (auto& entry : map) {
+        auto& a = entry.second;
+
+        auto it = stats.find(a.type);
+        if (it != stats.end()) {
+            stats[a.type].first++;
+            stats[a.type].second += a.size;
+        } else {
+            stats[a.type] = std::make_pair(1, a.size);
+        }
     }

+    std::stringstream ss;
+    for (auto& entry : stats) {
+        ss << "Alloc type " << allocTypeToString(entry.first) << ": "
+           << entry.second.first << " allocations, " << entry.second.second
+           << " bytes\n";
+    }

+    return ss.str();
 }

-}
+} // namespace

 //
 // StandardGpuResourcesImpl
 //

-StandardGpuResourcesImpl::StandardGpuResourcesImpl()
+StandardGpuResourcesImpl::StandardGpuResourcesImpl()
+        : pinnedMemAlloc_(nullptr),
+          pinnedMemAllocSize_(0),
+          // let the adjustment function determine the memory size for us by
+          // passing in a huge value that will then be adjusted
+          tempMemSize_(getDefaultTempMemForGPU(
+                  -1,
+                  std::numeric_limits<size_t>::max())),
+          pinnedMemSize_(kDefaultPinnedMemoryAllocation),
+          allocLogging_(false) {}

 StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
+    // The temporary memory allocator has allocated memory through us, so clean
+    // that up before we finish fully de-initializing ourselves
+    tempMemory_.clear();
+
+    // Make sure all allocations have been freed
+    bool allocError = false;
+
+    for (auto& entry : allocs_) {
+        auto& map = entry.second;
+
+        if (!map.empty()) {
+            std::cerr
+                    << "StandardGpuResources destroyed with allocations outstanding:\n"
+                    << "Device " << entry.first
+                    << " outstanding allocations:\n";
+            std::cerr << allocsToString(map);
+            allocError = true;
+        }
     }

+    FAISS_ASSERT_MSG(
+            !allocError, "GPU memory allocations not properly cleaned up");

+    for (auto& entry : defaultStreams_) {
+        DeviceScope scope(entry.first);

+        // We created these streams, so are responsible for destroying them
+        CUDA_VERIFY(cudaStreamDestroy(entry.second));
+    }

+    for (auto& entry : alternateStreams_) {
+        DeviceScope scope(entry.first);

+        for (auto stream : entry.second) {
+            CUDA_VERIFY(cudaStreamDestroy(stream));
+        }
     }

+    for (auto& entry : asyncCopyStreams_) {
+        DeviceScope scope(entry.first);

+        CUDA_VERIFY(cudaStreamDestroy(entry.second));
+    }

+    for (auto& entry : blasHandles_) {
+        DeviceScope scope(entry.first);

+        auto blasStatus = cublasDestroy(entry.second);
+        FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+    }

+    if (pinnedMemAlloc_) {
+        auto err = cudaFreeHost(pinnedMemAlloc_);
+        FAISS_ASSERT_FMT(
+                err == cudaSuccess,
+                "Failed to cudaFreeHost pointer %p (error %d %s)",
+                pinnedMemAlloc_,
+                (int)err,
+                cudaGetErrorString(err));
+    }
 }

+size_t StandardGpuResourcesImpl::getDefaultTempMemForGPU(
+        int device,
+        size_t requested) {
+    auto totalMem = device != -1 ? getDeviceProperties(device).totalGlobalMem
+                                 : std::numeric_limits<size_t>::max();

+    if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) {
+        // If the GPU has <= 4 GiB of memory, reserve 512 MiB

+        if (requested > k4GiBTempMem) {
+            return k4GiBTempMem;
+        }
+    } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) {
+        // If the GPU has <= 8 GiB of memory, reserve 1 GiB

+        if (requested > k8GiBTempMem) {
+            return k8GiBTempMem;
+        }
+    } else {
+        // Never use more than 1.5 GiB
+        if (requested > kMaxTempMem) {
+            return kMaxTempMem;
+        }
     }
+
+    // use whatever lower limit the user requested
+    return requested;
+}
+
+void StandardGpuResourcesImpl::noTempMemory() {
+    setTempMemory(0);
+}
+
+void StandardGpuResourcesImpl::setTempMemory(size_t size) {
+    if (tempMemSize_ != size) {
+        // adjust based on general limits
+        tempMemSize_ = getDefaultTempMemForGPU(-1, size);
+
+        // We need to re-initialize memory resources for all current devices
+        // that have been initialized. This should be safe to do, even if we are
+        // currently running work, because the cudaFree call that this implies
+        // will force-synchronize all GPUs with the CPU
+        for (auto& p : tempMemory_) {
+            int device = p.first;
+            // Free the existing memory first
+            p.second.reset();
+
+            // Allocate new
+            p.second = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
+                    this,
+                    p.first,
+                    // adjust for this specific device
+                    getDefaultTempMemForGPU(device, tempMemSize_)));
+        }
     }
 }

-  FAISS_ASSERT(!pinnedMemAlloc_);
+void StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
+    // Should not call this after devices have been initialized
+    FAISS_ASSERT(defaultStreams_.size() == 0);
+    FAISS_ASSERT(!pinnedMemAlloc_);

+    pinnedMemSize_ = size;
 }

+void StandardGpuResourcesImpl::setDefaultStream(
+        int device,
+        cudaStream_t stream) {
+    if (isInitialized(device)) {
+        // A new series of calls may not be ordered with what was the previous
+        // stream, so if the stream being specified is different, then we need
+        // to ensure ordering between the two (new stream waits on old).
+        auto it = userDefaultStreams_.find(device);
+        cudaStream_t prevStream = nullptr;

+        if (it != userDefaultStreams_.end()) {
+            prevStream = it->second;
+        } else {
+            FAISS_ASSERT(defaultStreams_.count(device));
+            prevStream = defaultStreams_[device];
+        }

+        if (prevStream != stream) {
+            streamWait({stream}, {prevStream});
+        }
     }

+    userDefaultStreams_[device] = stream;
 }

+void StandardGpuResourcesImpl::revertDefaultStream(int device) {
+    if (isInitialized(device)) {
+        auto it = userDefaultStreams_.find(device);

+        if (it != userDefaultStreams_.end()) {
+            // There was a user stream set that we need to synchronize against
+            cudaStream_t prevStream = userDefaultStreams_[device];

+            FAISS_ASSERT(defaultStreams_.count(device));
+            cudaStream_t newStream = defaultStreams_[device];

+            streamWait({newStream}, {prevStream});
+        }
     }

+    userDefaultStreams_.erase(device);
 }

+void StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
+    for (int dev = 0; dev < getNumDevices(); ++dev) {
+        setDefaultStream(dev, nullptr);
+    }
 }

-  allocLogging_ = enable;
+void StandardGpuResourcesImpl::setLogMemoryAllocations(bool enable) {
+    allocLogging_ = enable;
 }

-  return defaultStreams_.count(device) != 0;
+bool StandardGpuResourcesImpl::isInitialized(int device) const {
+    // Use default streams as a marker for whether or not a certain
+    // device has been initialized
+    return defaultStreams_.count(device) != 0;
 }

-  // If this is the first device that we're initializing, create our
-  // pinned memory allocation
-  if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
-    auto err =
-      cudaHostAlloc(&pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
+void StandardGpuResourcesImpl::initializeForDevice(int device) {
+    if (isInitialized(device)) {
+        return;
+    }

+    // If this is the first device that we're initializing, create our
+    // pinned memory allocation
+    if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
+        auto err = cudaHostAlloc(
+                &pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
+
+        FAISS_THROW_IF_NOT_FMT(
+                err == cudaSuccess,
+                "failed to cudaHostAlloc %zu bytes for CPU <-> GPU "
+                "async copy buffer (error %d %s)",
+                pinnedMemSize_,
+                (int)err,
+                cudaGetErrorString(err));
+
+        pinnedMemAllocSize_ = pinnedMemSize_;
+    }

+    FAISS_ASSERT(device < getNumDevices());
+    DeviceScope scope(device);

+    // Make sure that device properties for all devices are cached
+    auto& prop = getDeviceProperties(device);

+    // Also check to make sure we meet our minimum compute capability (3.0)
+    FAISS_ASSERT_FMT(
+            prop.major >= 3,
+            "Device id %d with CC %d.%d not supported, "
+            "need 3.0+ compute capability",
+            device,
+            prop.major,
+            prop.minor);

+    // Our code is pre-built with and expects warpSize == 32, validate that
+    FAISS_ASSERT_FMT(
+            prop.warpSize == 32,
+            "Device id %d does not have expected warpSize of 32",
+            device);

+    // Create streams
+    cudaStream_t defaultStream = 0;
+    CUDA_VERIFY(
+            cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking));

+    defaultStreams_[device] = defaultStream;

+    cudaStream_t asyncCopyStream = 0;
+    CUDA_VERIFY(
+            cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking));

+    asyncCopyStreams_[device] = asyncCopyStream;

+    std::vector<cudaStream_t> deviceStreams;
+    for (int j = 0; j < kNumStreams; ++j) {
+        cudaStream_t stream = 0;
+        CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

+        deviceStreams.push_back(stream);
+    }

+    alternateStreams_[device] = std::move(deviceStreams);

+    // Create cuBLAS handle
+    cublasHandle_t blasHandle = 0;
+    auto blasStatus = cublasCreate(&blasHandle);
+    FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+    blasHandles_[device] = blasHandle;

+    // For CUDA 10 on V100, enabling tensor core usage would enable automatic
+    // rounding down of inputs to f16 (though accumulate in f32) which results
+    // in unacceptable loss of precision in general. For CUDA 11 / A100, only
+    // enable tensor core support if it doesn't result in a loss of precision.
 #if CUDA_VERSION >= 11000
+    cublasSetMathMode(
+            blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
 #endif

+    FAISS_ASSERT(allocs_.count(device) == 0);
+    allocs_[device] = std::unordered_map<void*, AllocRequest>();

+    FAISS_ASSERT(tempMemory_.count(device) == 0);
+    auto mem = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
+            this,
+            device,
+            // adjust for this specific device
+            getDefaultTempMemForGPU(device, tempMemSize_)));

+    tempMemory_.emplace(device, std::move(mem));
 }

-  return blasHandles_[device];
+cublasHandle_t StandardGpuResourcesImpl::getBlasHandle(int device) {
+    initializeForDevice(device);
+    return blasHandles_[device];
 }

-  initializeForDevice(device);
+cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) {
+    initializeForDevice(device);

+    auto it = userDefaultStreams_.find(device);
+    if (it != userDefaultStreams_.end()) {
+        // There is a user override stream set
+        return it->second;
+    }

+    // Otherwise, our base default stream
+    return defaultStreams_[device];
 }

+std::vector<cudaStream_t> StandardGpuResourcesImpl::getAlternateStreams(
+        int device) {
+    initializeForDevice(device);
+    return alternateStreams_[device];
 }

-  return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
+std::pair<void*, size_t> StandardGpuResourcesImpl::getPinnedMemory() {
+    return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
 }

-  return asyncCopyStreams_[device];
+cudaStream_t StandardGpuResourcesImpl::getAsyncCopyStream(int device) {
+    initializeForDevice(device);
+    return asyncCopyStreams_[device];
 }

-  initializeForDevice(req.device);
-
-  // We don't allocate a placeholder for zero-sized allocations
-  if (req.size == 0) {
-    return nullptr;
-  }
-
-  // Make sure that the allocation is a multiple of 16 bytes for alignment
-  // purposes
-  auto adjReq = req;
-  adjReq.size = utils::roundUp(adjReq.size, (size_t) 16);
-
-  void* p = nullptr;
-
-  if (allocLogging_) {
-    std::cout << "StandardGpuResources: alloc " << adjReq.toString() << "\n";
-  }
-
-  if (adjReq.space == MemorySpace::Temporary) {
-    // If we don't have enough space in our temporary memory manager, we need
-    // to allocate this request separately
-    auto& tempMem = tempMemory_[adjReq.device];
-
-    if (adjReq.size > tempMem->getSizeAvailable()) {
-      // We need to allocate this ourselves
-      AllocRequest newReq = adjReq;
-      newReq.space = MemorySpace::Device;
-      newReq.type = AllocType::TemporaryMemoryOverflow;
-  } else if (adjReq.space == MemorySpace::Unified) {
-    auto err = cudaMallocManaged(&p, adjReq.size);
-
-    if (err != cudaSuccess) {
-      auto& map = allocs_[req.device];
-        << "\nOutstanding allocations:\n" << allocsToString(map);
-      auto str = ss.str();
-      FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
-    }
-  } else {
-    FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) adjReq.space);
-  }
+void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
+    initializeForDevice(req.device);

+    // We don't allocate a placeholder for zero-sized allocations
+    if (req.size == 0) {
+        return nullptr;
     }

+    // cudaMalloc guarantees allocation alignment to 256 bytes; do the same here
+    // for alignment purposes (to reduce memory transaction overhead etc)
+    auto adjReq = req;
+    adjReq.size = utils::roundUp(adjReq.size, (size_t)256);
+
+    void* p = nullptr;
+
+    if (adjReq.space == MemorySpace::Temporary) {
+        // If we don't have enough space in our temporary memory manager, we
+        // need to allocate this request separately
+        auto& tempMem = tempMemory_[adjReq.device];
+
+        if (adjReq.size > tempMem->getSizeAvailable()) {
+            // We need to allocate this ourselves
+            AllocRequest newReq = adjReq;
+            newReq.space = MemorySpace::Device;
+            newReq.type = AllocType::TemporaryMemoryOverflow;
+
+            if (allocLogging_) {
+                std::cout
+                        << "StandardGpuResources: alloc fail "
+                        << adjReq.toString()
+                        << " (no temp space); retrying as MemorySpace::Device\n";
+            }
+
+            return allocMemory(newReq);
+        }
+
+        // Otherwise, we can handle this locally
+        p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
+
+    } else if (adjReq.space == MemorySpace::Device) {
+        auto err = cudaMalloc(&p, adjReq.size);
+
+        // Throw if we fail to allocate
+        if (err != cudaSuccess) {
+            // FIXME: as of CUDA 11, a memory allocation error appears to be
+            // presented via cudaGetLastError as well, and needs to be cleared.
+            // Just call the function to clear it
+            cudaGetLastError();
+
+            std::stringstream ss;
+            ss << "StandardGpuResources: alloc fail " << adjReq.toString()
+               << " (cudaMalloc error " << cudaGetErrorString(err) << " ["
+               << (int)err << "])\n";
+            auto str = ss.str();
+
+            if (allocLogging_) {
+                std::cout << str;
+            }
+
+            FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+        }
+    } else if (adjReq.space == MemorySpace::Unified) {
+        auto err = cudaMallocManaged(&p, adjReq.size);
+
+        if (err != cudaSuccess) {
+            // FIXME: as of CUDA 11, a memory allocation error appears to be
+            // presented via cudaGetLastError as well, and needs to be cleared.
+            // Just call the function to clear it
+            cudaGetLastError();
+
+            std::stringstream ss;
+            ss << "StandardGpuResources: alloc fail " << adjReq.toString()
+               << " failed (cudaMallocManaged error " << cudaGetErrorString(err)
+               << " [" << (int)err << "])\n";
+            auto str = ss.str();
+
+            if (allocLogging_) {
+                std::cout << str;
+            }
+
+            FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+        }
+    } else {
+        FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space);
     }

+    if (allocLogging_) {
+        std::cout << "StandardGpuResources: alloc ok " << adjReq.toString()
+                  << " ptr 0x" << p << "\n";
     }

+    allocs_[adjReq.device][p] = adjReq;

+    return p;
 }

-  FAISS_ASSERT(isInitialized(device));
+void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
+    FAISS_ASSERT(isInitialized(device));

+    if (!p) {
+        return;
+    }

+    auto& a = allocs_[device];
+    auto it = a.find(p);
+    FAISS_ASSERT(it != a.end());

+    auto& req = it->second;

+    if (allocLogging_) {
+        std::cout << "StandardGpuResources: dealloc " << req.toString() << "\n";
+    }

+    if (req.space == MemorySpace::Temporary) {
+        tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);

+    } else if (
+            req.space == MemorySpace::Device ||
+            req.space == MemorySpace::Unified) {
+        auto err = cudaFree(p);
+        FAISS_ASSERT_FMT(
+                err == cudaSuccess,
+                "Failed to cudaFree pointer %p (error %d %s)",
+                p,
+                (int)err,
+                cudaGetErrorString(err));

+    } else {
+        FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space);
+    }

+    a.erase(it);
 }

-  FAISS_ASSERT(isInitialized(device));
+size_t StandardGpuResourcesImpl::getTempMemoryAvailable(int device) const {
+    FAISS_ASSERT(isInitialized(device));

+    auto it = tempMemory_.find(device);
+    FAISS_ASSERT(it != tempMemory_.end());

+    return it->second->getSizeAvailable();
 }

 std::map<int, std::map<std::string, std::pair<int, size_t>>>
 StandardGpuResourcesImpl::getMemoryInfo() const {
+    using AT = std::map<std::string, std::pair<int, size_t>>;

+    std::map<int, AT> out;

+    for (auto& entry : allocs_) {
+        AT outDevice;

+        for (auto& a : entry.second) {
+            auto& v = outDevice[allocTypeToString(a.second.type)];
+            v.first++;
+            v.second += a.second.size;
+        }

+        out[entry.first] = std::move(outDevice);
+    }

+    return out;
 }

 //
@@ -536,70 +559,58 @@ StandardGpuResourcesImpl::getMemoryInfo() const {
 //

 StandardGpuResources::StandardGpuResources()
-}
+        : res_(new StandardGpuResourcesImpl) {}

-StandardGpuResources::~StandardGpuResources() {
-}
+StandardGpuResources::~StandardGpuResources() {}

-  return res_;
+std::shared_ptr<GpuResources> StandardGpuResources::getResources() {
+    return res_;
 }

-  res_->noTempMemory();
+void StandardGpuResources::noTempMemory() {
+    res_->noTempMemory();
 }

-  res_->setTempMemory(size);
+void StandardGpuResources::setTempMemory(size_t size) {
+    res_->setTempMemory(size);
 }

-  res_->setPinnedMemory(size);
+void StandardGpuResources::setPinnedMemory(size_t size) {
+    res_->setPinnedMemory(size);
 }

-  res_->setDefaultStream(device, stream);
+void StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
+    res_->setDefaultStream(device, stream);
 }

-  res_->revertDefaultStream(device);
+void StandardGpuResources::revertDefaultStream(int device) {
+    res_->revertDefaultStream(device);
 }

-  res_->setDefaultNullStreamAllDevices();
+void StandardGpuResources::setDefaultNullStreamAllDevices() {
+    res_->setDefaultNullStreamAllDevices();
 }

 std::map<int, std::map<std::string, std::pair<int, size_t>>>
 StandardGpuResources::getMemoryInfo() const {
+    return res_->getMemoryInfo();
 }

-  return res_->getDefaultStream(device);
+cudaStream_t StandardGpuResources::getDefaultStream(int device) {
+    return res_->getDefaultStream(device);
 }

-  return res_->getTempMemoryAvailable(device);
+size_t StandardGpuResources::getTempMemoryAvailable(int device) const {
+    return res_->getTempMemoryAvailable(device);
 }

-  res_->syncDefaultStreamCurrentDevice();
+void StandardGpuResources::syncDefaultStreamCurrentDevice() {
+    res_->syncDefaultStreamCurrentDevice();
 }

-  res_->setLogMemoryAllocations(enable);
+void StandardGpuResources::setLogMemoryAllocations(bool enable) {
+    res_->setLogMemoryAllocations(enable);
 }

-}
+} // namespace gpu
+} // namespace faiss
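As a usage-level counterpart to the implementation above, the short sketch below (not part of the packaged sources; the 256 MiB figure is just an example) shows the StandardGpuResources knobs whose behaviour the diffed code defines.

// Sketch only: configuring the temporary-memory and logging behaviour
// implemented by StandardGpuResourcesImpl above.
#include <faiss/gpu/StandardGpuResources.h>

int main() {
    faiss::gpu::StandardGpuResources res;

    // Request a 256 MiB temporary memory stack; larger requests are clamped
    // per GPU by getDefaultTempMemForGPU (512 MiB / 1 GiB / 1.5 GiB tiers).
    res.setTempMemory((size_t)256 * 1024 * 1024);

    // Or disable the up-front temporary stack entirely:
    // res.noTempMemory();

    // Print each allocation and deallocation handled by the resource object.
    res.setLogMemoryAllocations(true);
    return 0;
}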