faiss 0.2.0 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE.txt +1 -1
- data/README.md +7 -7
- data/ext/faiss/extconf.rb +6 -3
- data/ext/faiss/numo.hpp +4 -4
- data/ext/faiss/utils.cpp +1 -1
- data/ext/faiss/utils.h +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +292 -291
- data/vendor/faiss/faiss/AutoTune.h +55 -56
- data/vendor/faiss/faiss/Clustering.cpp +365 -194
- data/vendor/faiss/faiss/Clustering.h +102 -35
- data/vendor/faiss/faiss/IVFlib.cpp +171 -195
- data/vendor/faiss/faiss/IVFlib.h +48 -51
- data/vendor/faiss/faiss/Index.cpp +85 -103
- data/vendor/faiss/faiss/Index.h +54 -48
- data/vendor/faiss/faiss/Index2Layer.cpp +126 -224
- data/vendor/faiss/faiss/Index2Layer.h +22 -36
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +407 -0
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +195 -0
- data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
- data/vendor/faiss/faiss/IndexBinary.h +140 -132
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
- data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
- data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
- data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
- data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
- data/vendor/faiss/faiss/IndexFlat.cpp +115 -176
- data/vendor/faiss/faiss/IndexFlat.h +42 -59
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +67 -0
- data/vendor/faiss/faiss/IndexFlatCodes.h +47 -0
- data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
- data/vendor/faiss/faiss/IndexHNSW.h +57 -41
- data/vendor/faiss/faiss/IndexIVF.cpp +545 -453
- data/vendor/faiss/faiss/IndexIVF.h +169 -118
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +316 -0
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +121 -0
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +247 -252
- data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +459 -517
- data/vendor/faiss/faiss/IndexIVFPQ.h +75 -67
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
- data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +163 -150
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +38 -25
- data/vendor/faiss/faiss/IndexLSH.cpp +66 -113
- data/vendor/faiss/faiss/IndexLSH.h +20 -38
- data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
- data/vendor/faiss/faiss/IndexLattice.h +11 -16
- data/vendor/faiss/faiss/IndexNNDescent.cpp +229 -0
- data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
- data/vendor/faiss/faiss/IndexNSG.cpp +301 -0
- data/vendor/faiss/faiss/IndexNSG.h +85 -0
- data/vendor/faiss/faiss/IndexPQ.cpp +387 -495
- data/vendor/faiss/faiss/IndexPQ.h +64 -82
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
- data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
- data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
- data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
- data/vendor/faiss/faiss/IndexRefine.cpp +139 -127
- data/vendor/faiss/faiss/IndexRefine.h +32 -23
- data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
- data/vendor/faiss/faiss/IndexReplicas.h +62 -56
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +111 -172
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -59
- data/vendor/faiss/faiss/IndexShards.cpp +256 -240
- data/vendor/faiss/faiss/IndexShards.h +85 -73
- data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
- data/vendor/faiss/faiss/MatrixStats.h +7 -10
- data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
- data/vendor/faiss/faiss/MetaIndexes.h +40 -34
- data/vendor/faiss/faiss/MetricType.h +7 -7
- data/vendor/faiss/faiss/VectorTransform.cpp +654 -475
- data/vendor/faiss/faiss/VectorTransform.h +64 -89
- data/vendor/faiss/faiss/clone_index.cpp +78 -73
- data/vendor/faiss/faiss/clone_index.h +4 -9
- data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
- data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +198 -171
- data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
- data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
- data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
- data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
- data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
- data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
- data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
- data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
- data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +503 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +175 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
- data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
- data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
- data/vendor/faiss/faiss/impl/FaissException.h +41 -29
- data/vendor/faiss/faiss/impl/HNSW.cpp +606 -617
- data/vendor/faiss/faiss/impl/HNSW.h +179 -200
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +855 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +244 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
- data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
- data/vendor/faiss/faiss/impl/NSG.cpp +679 -0
- data/vendor/faiss/faiss/impl/NSG.h +199 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +758 -0
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +188 -0
- data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +647 -707
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
- data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
- data/vendor/faiss/faiss/impl/index_read.cpp +631 -480
- data/vendor/faiss/faiss/impl/index_write.cpp +547 -407
- data/vendor/faiss/faiss/impl/io.cpp +76 -95
- data/vendor/faiss/faiss/impl/io.h +31 -41
- data/vendor/faiss/faiss/impl/io_macros.h +60 -29
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +301 -0
- data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
- data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
- data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
- data/vendor/faiss/faiss/index_factory.cpp +619 -397
- data/vendor/faiss/faiss/index_factory.h +8 -6
- data/vendor/faiss/faiss/index_io.h +23 -26
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
- data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
- data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
- data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
- data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
- data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
- data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
- data/vendor/faiss/faiss/utils/Heap.h +186 -209
- data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
- data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
- data/vendor/faiss/faiss/utils/distances.cpp +305 -312
- data/vendor/faiss/faiss/utils/distances.h +170 -122
- data/vendor/faiss/faiss/utils/distances_simd.cpp +498 -508
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
- data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
- data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
- data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
- data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
- data/vendor/faiss/faiss/utils/hamming.h +62 -85
- data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
- data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
- data/vendor/faiss/faiss/utils/partitioning.h +26 -21
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
- data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
- data/vendor/faiss/faiss/utils/random.cpp +39 -63
- data/vendor/faiss/faiss/utils/random.h +13 -16
- data/vendor/faiss/faiss/utils/simdlib.h +4 -2
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
- data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
- data/vendor/faiss/faiss/utils/utils.cpp +304 -287
- data/vendor/faiss/faiss/utils/utils.h +54 -49
- metadata +29 -4
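
The newly added vendor sources above (IndexNSG, IndexNNDescent, IndexAdditiveQuantizer, ResidualQuantizer, LocalSearchQuantizer, kmeans1d, simdlib_neon) correspond to an upstream faiss 1.7.x bump of the vendored library. As a hedged illustration only, the sketch below exercises one of the newly vendored index types through the C++ headers listed above; it assumes the upstream faiss 1.7.x API (the `IndexNSGFlat` constructor and `Index::search` signature) and does not show this gem's Ruby surface.

```cpp
// Hedged sketch: exercises the newly vendored graph-based NSG index via the
// C++ headers added above (assumes the upstream faiss 1.7.x API).
#include <faiss/IndexNSG.h>

#include <cstdint>
#include <random>
#include <vector>

int main() {
    int d = 32;        // vector dimensionality
    int64_t nb = 1000; // database size

    std::mt19937 rng(123);
    std::uniform_real_distribution<float> dist(0.0f, 1.0f);
    std::vector<float> xb(nb * d);
    for (auto& v : xb) {
        v = dist(rng);
    }

    // IndexNSGFlat(d, R): NSG graph with out-degree R over flat (uncompressed)
    // vectors; flat storage needs no training phase.
    faiss::IndexNSGFlat index(d, 32);
    index.add(nb, xb.data());

    // Query the first vector against the graph; the k nearest neighbors come
    // back as parallel arrays of distances and ids.
    int64_t k = 5;
    std::vector<float> distances(k);
    std::vector<faiss::Index::idx_t> labels(k);
    index.search(1, xb.data(), k, distances.data(), labels.data());

    return 0;
}
```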
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp, hunks `@@ -5,16 +5,16 @@`, `@@ -22,513 +22,536 @@` and `@@ -536,70 +559,58 @@`. The 0.2.4 side of the changed region reads:

```cpp
#include <faiss/gpu/StandardGpuResources.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/StaticUtils.h>
#include <faiss/impl/FaissAssert.h>
#include <iostream>
#include <limits>
#include <sstream>

namespace faiss {
namespace gpu {

namespace {

// ...
constexpr int kNumStreams = 2;

// Use 256 MiB of pinned memory for async CPU <-> GPU copies by default
constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024;

// Default temporary memory allocation for <= 4 GiB memory GPUs
constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024;

// Default temporary memory allocation for <= 8 GiB memory GPUs
constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024;

// Maximum temporary memory allocation for all GPUs
constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024;

std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
    // Produce a sorted list of all outstanding allocations by type
    std::unordered_map<AllocType, std::pair<int, size_t>> stats;

    for (auto& entry : map) {
        auto& a = entry.second;

        auto it = stats.find(a.type);
        if (it != stats.end()) {
            stats[a.type].first++;
            stats[a.type].second += a.size;
        } else {
            stats[a.type] = std::make_pair(1, a.size);
        }
    }

    std::stringstream ss;
    for (auto& entry : stats) {
        ss << "Alloc type " << allocTypeToString(entry.first) << ": "
           << entry.second.first << " allocations, " << entry.second.second
           << " bytes\n";
    }

    return ss.str();
}

} // namespace

//
// StandardGpuResourcesImpl
//

StandardGpuResourcesImpl::StandardGpuResourcesImpl()
        : pinnedMemAlloc_(nullptr),
          pinnedMemAllocSize_(0),
          // let the adjustment function determine the memory size for us by
          // passing in a huge value that will then be adjusted
          tempMemSize_(getDefaultTempMemForGPU(
                  -1,
                  std::numeric_limits<size_t>::max())),
          pinnedMemSize_(kDefaultPinnedMemoryAllocation),
          allocLogging_(false) {}

StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
    // The temporary memory allocator has allocated memory through us, so clean
    // that up before we finish fully de-initializing ourselves
    tempMemory_.clear();

    // Make sure all allocations have been freed
    bool allocError = false;

    for (auto& entry : allocs_) {
        auto& map = entry.second;

        if (!map.empty()) {
            std::cerr
                    << "StandardGpuResources destroyed with allocations outstanding:\n"
                    << "Device " << entry.first
                    << " outstanding allocations:\n";
            std::cerr << allocsToString(map);
            allocError = true;
        }
    }

    FAISS_ASSERT_MSG(
            !allocError, "GPU memory allocations not properly cleaned up");

    for (auto& entry : defaultStreams_) {
        DeviceScope scope(entry.first);

        // We created these streams, so are responsible for destroying them
        CUDA_VERIFY(cudaStreamDestroy(entry.second));
    }

    for (auto& entry : alternateStreams_) {
        DeviceScope scope(entry.first);

        for (auto stream : entry.second) {
            CUDA_VERIFY(cudaStreamDestroy(stream));
        }
    }

    for (auto& entry : asyncCopyStreams_) {
        DeviceScope scope(entry.first);

        CUDA_VERIFY(cudaStreamDestroy(entry.second));
    }

    for (auto& entry : blasHandles_) {
        DeviceScope scope(entry.first);

        auto blasStatus = cublasDestroy(entry.second);
        FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
    }

    if (pinnedMemAlloc_) {
        auto err = cudaFreeHost(pinnedMemAlloc_);
        FAISS_ASSERT_FMT(
                err == cudaSuccess,
                "Failed to cudaFreeHost pointer %p (error %d %s)",
                pinnedMemAlloc_,
                (int)err,
                cudaGetErrorString(err));
    }
}

size_t StandardGpuResourcesImpl::getDefaultTempMemForGPU(
        int device,
        size_t requested) {
    auto totalMem = device != -1 ? getDeviceProperties(device).totalGlobalMem
                                 : std::numeric_limits<size_t>::max();

    if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) {
        // If the GPU has <= 4 GiB of memory, reserve 512 MiB
        if (requested > k4GiBTempMem) {
            return k4GiBTempMem;
        }
    } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) {
        // If the GPU has <= 8 GiB of memory, reserve 1 GiB
        if (requested > k8GiBTempMem) {
            return k8GiBTempMem;
        }
    } else {
        // Never use more than 1.5 GiB
        if (requested > kMaxTempMem) {
            return kMaxTempMem;
        }
    }

    // use whatever lower limit the user requested
    return requested;
}

void StandardGpuResourcesImpl::noTempMemory() {
    setTempMemory(0);
}

void StandardGpuResourcesImpl::setTempMemory(size_t size) {
    if (tempMemSize_ != size) {
        // adjust based on general limits
        tempMemSize_ = getDefaultTempMemForGPU(-1, size);

        // We need to re-initialize memory resources for all current devices
        // that have been initialized. This should be safe to do, even if we are
        // currently running work, because the cudaFree call that this implies
        // will force-synchronize all GPUs with the CPU
        for (auto& p : tempMemory_) {
            int device = p.first;
            // Free the existing memory first
            p.second.reset();

            // Allocate new
            p.second = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
                    this,
                    p.first,
                    // adjust for this specific device
                    getDefaultTempMemForGPU(device, tempMemSize_)));
        }
    }
}

void StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
    // Should not call this after devices have been initialized
    FAISS_ASSERT(defaultStreams_.size() == 0);
    FAISS_ASSERT(!pinnedMemAlloc_);

    pinnedMemSize_ = size;
}

void StandardGpuResourcesImpl::setDefaultStream(
        int device,
        cudaStream_t stream) {
    if (isInitialized(device)) {
        // A new series of calls may not be ordered with what was the previous
        // stream, so if the stream being specified is different, then we need
        // to ensure ordering between the two (new stream waits on old).
        auto it = userDefaultStreams_.find(device);
        cudaStream_t prevStream = nullptr;

        if (it != userDefaultStreams_.end()) {
            prevStream = it->second;
        } else {
            FAISS_ASSERT(defaultStreams_.count(device));
            prevStream = defaultStreams_[device];
        }

        if (prevStream != stream) {
            streamWait({stream}, {prevStream});
        }
    }

    userDefaultStreams_[device] = stream;
}

void StandardGpuResourcesImpl::revertDefaultStream(int device) {
    if (isInitialized(device)) {
        auto it = userDefaultStreams_.find(device);

        if (it != userDefaultStreams_.end()) {
            // There was a user stream set that we need to synchronize against
            cudaStream_t prevStream = userDefaultStreams_[device];

            FAISS_ASSERT(defaultStreams_.count(device));
            cudaStream_t newStream = defaultStreams_[device];

            streamWait({newStream}, {prevStream});
        }
    }

    userDefaultStreams_.erase(device);
}

void StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
    for (int dev = 0; dev < getNumDevices(); ++dev) {
        setDefaultStream(dev, nullptr);
    }
}

void StandardGpuResourcesImpl::setLogMemoryAllocations(bool enable) {
    allocLogging_ = enable;
}

bool StandardGpuResourcesImpl::isInitialized(int device) const {
    // Use default streams as a marker for whether or not a certain
    // device has been initialized
    return defaultStreams_.count(device) != 0;
}

void StandardGpuResourcesImpl::initializeForDevice(int device) {
    if (isInitialized(device)) {
        return;
    }

    // If this is the first device that we're initializing, create our
    // pinned memory allocation
    if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
        auto err = cudaHostAlloc(
                &pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);

        FAISS_THROW_IF_NOT_FMT(
                err == cudaSuccess,
                "failed to cudaHostAlloc %zu bytes for CPU <-> GPU "
                "async copy buffer (error %d %s)",
                pinnedMemSize_,
                (int)err,
                cudaGetErrorString(err));

        pinnedMemAllocSize_ = pinnedMemSize_;
    }

    FAISS_ASSERT(device < getNumDevices());
    DeviceScope scope(device);

    // Make sure that device properties for all devices are cached
    auto& prop = getDeviceProperties(device);

    // Also check to make sure we meet our minimum compute capability (3.0)
    FAISS_ASSERT_FMT(
            prop.major >= 3,
            "Device id %d with CC %d.%d not supported, "
            "need 3.0+ compute capability",
            device,
            prop.major,
            prop.minor);

    // Our code is pre-built with and expects warpSize == 32, validate that
    FAISS_ASSERT_FMT(
            prop.warpSize == 32,
            "Device id %d does not have expected warpSize of 32",
            device);

    // Create streams
    cudaStream_t defaultStream = 0;
    CUDA_VERIFY(
            cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking));

    defaultStreams_[device] = defaultStream;

    cudaStream_t asyncCopyStream = 0;
    CUDA_VERIFY(
            cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking));

    asyncCopyStreams_[device] = asyncCopyStream;

    std::vector<cudaStream_t> deviceStreams;
    for (int j = 0; j < kNumStreams; ++j) {
        cudaStream_t stream = 0;
        CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

        deviceStreams.push_back(stream);
    }

    alternateStreams_[device] = std::move(deviceStreams);

    // Create cuBLAS handle
    cublasHandle_t blasHandle = 0;
    auto blasStatus = cublasCreate(&blasHandle);
    FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
    blasHandles_[device] = blasHandle;

    // For CUDA 10 on V100, enabling tensor core usage would enable automatic
    // rounding down of inputs to f16 (though accumulate in f32) which results
    // in unacceptable loss of precision in general. For CUDA 11 / A100, only
    // enable tensor core support if it doesn't result in a loss of precision.
#if CUDA_VERSION >= 11000
    cublasSetMathMode(
            blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
#endif

    FAISS_ASSERT(allocs_.count(device) == 0);
    allocs_[device] = std::unordered_map<void*, AllocRequest>();

    FAISS_ASSERT(tempMemory_.count(device) == 0);
    auto mem = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
            this,
            device,
            // adjust for this specific device
            getDefaultTempMemForGPU(device, tempMemSize_)));

    tempMemory_.emplace(device, std::move(mem));
}

cublasHandle_t StandardGpuResourcesImpl::getBlasHandle(int device) {
    initializeForDevice(device);
    return blasHandles_[device];
}

cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) {
    initializeForDevice(device);

    auto it = userDefaultStreams_.find(device);
    if (it != userDefaultStreams_.end()) {
        // There is a user override stream set
        return it->second;
    }

    // Otherwise, our base default stream
    return defaultStreams_[device];
}

std::vector<cudaStream_t> StandardGpuResourcesImpl::getAlternateStreams(
        int device) {
    initializeForDevice(device);
    return alternateStreams_[device];
}

std::pair<void*, size_t> StandardGpuResourcesImpl::getPinnedMemory() {
    return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
}

cudaStream_t StandardGpuResourcesImpl::getAsyncCopyStream(int device) {
    initializeForDevice(device);
    return asyncCopyStreams_[device];
}

void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
    initializeForDevice(req.device);

    // We don't allocate a placeholder for zero-sized allocations
    if (req.size == 0) {
        return nullptr;
    }

    // cudaMalloc guarantees allocation alignment to 256 bytes; do the same here
    // for alignment purposes (to reduce memory transaction overhead etc)
    auto adjReq = req;
    adjReq.size = utils::roundUp(adjReq.size, (size_t)256);

    void* p = nullptr;

    if (adjReq.space == MemorySpace::Temporary) {
        // If we don't have enough space in our temporary memory manager, we
        // need to allocate this request separately
        auto& tempMem = tempMemory_[adjReq.device];

        if (adjReq.size > tempMem->getSizeAvailable()) {
            // We need to allocate this ourselves
            AllocRequest newReq = adjReq;
            newReq.space = MemorySpace::Device;
            newReq.type = AllocType::TemporaryMemoryOverflow;

            if (allocLogging_) {
                std::cout
                        << "StandardGpuResources: alloc fail "
                        << adjReq.toString()
                        << " (no temp space); retrying as MemorySpace::Device\n";
            }

            return allocMemory(newReq);
        }

        // Otherwise, we can handle this locally
        p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);

    } else if (adjReq.space == MemorySpace::Device) {
        auto err = cudaMalloc(&p, adjReq.size);

        // Throw if we fail to allocate
        if (err != cudaSuccess) {
            // FIXME: as of CUDA 11, a memory allocation error appears to be
            // presented via cudaGetLastError as well, and needs to be cleared.
            // Just call the function to clear it
            cudaGetLastError();

            std::stringstream ss;
            ss << "StandardGpuResources: alloc fail " << adjReq.toString()
               << " (cudaMalloc error " << cudaGetErrorString(err) << " ["
               << (int)err << "])\n";
            auto str = ss.str();

            if (allocLogging_) {
                std::cout << str;
            }

            FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
        }
    } else if (adjReq.space == MemorySpace::Unified) {
        auto err = cudaMallocManaged(&p, adjReq.size);

        if (err != cudaSuccess) {
            // FIXME: as of CUDA 11, a memory allocation error appears to be
            // presented via cudaGetLastError as well, and needs to be cleared.
            // Just call the function to clear it
            cudaGetLastError();

            std::stringstream ss;
            ss << "StandardGpuResources: alloc fail " << adjReq.toString()
               << " failed (cudaMallocManaged error " << cudaGetErrorString(err)
               << " [" << (int)err << "])\n";
            auto str = ss.str();

            if (allocLogging_) {
                std::cout << str;
            }

            FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
        }
    } else {
        FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space);
    }

    if (allocLogging_) {
        std::cout << "StandardGpuResources: alloc ok " << adjReq.toString()
                  << " ptr 0x" << p << "\n";
    }

    allocs_[adjReq.device][p] = adjReq;

    return p;
}

void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
    FAISS_ASSERT(isInitialized(device));

    if (!p) {
        return;
    }

    auto& a = allocs_[device];
    auto it = a.find(p);
    FAISS_ASSERT(it != a.end());

    auto& req = it->second;

    if (allocLogging_) {
        std::cout << "StandardGpuResources: dealloc " << req.toString() << "\n";
    }

    if (req.space == MemorySpace::Temporary) {
        tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);

    } else if (
            req.space == MemorySpace::Device ||
            req.space == MemorySpace::Unified) {
        auto err = cudaFree(p);
        FAISS_ASSERT_FMT(
                err == cudaSuccess,
                "Failed to cudaFree pointer %p (error %d %s)",
                p,
                (int)err,
                cudaGetErrorString(err));

    } else {
        FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space);
    }

    a.erase(it);
}

size_t StandardGpuResourcesImpl::getTempMemoryAvailable(int device) const {
    FAISS_ASSERT(isInitialized(device));

    auto it = tempMemory_.find(device);
    FAISS_ASSERT(it != tempMemory_.end());

    return it->second->getSizeAvailable();
}

std::map<int, std::map<std::string, std::pair<int, size_t>>>
StandardGpuResourcesImpl::getMemoryInfo() const {
    using AT = std::map<std::string, std::pair<int, size_t>>;

    std::map<int, AT> out;

    for (auto& entry : allocs_) {
        AT outDevice;

        for (auto& a : entry.second) {
            auto& v = outDevice[allocTypeToString(a.second.type)];
            v.first++;
            v.second += a.second.size;
        }

        out[entry.first] = std::move(outDevice);
    }

    return out;
}

//
// StandardGpuResources
//

StandardGpuResources::StandardGpuResources()
        : res_(new StandardGpuResourcesImpl) {}

StandardGpuResources::~StandardGpuResources() {}

std::shared_ptr<GpuResources> StandardGpuResources::getResources() {
    return res_;
}

void StandardGpuResources::noTempMemory() {
    res_->noTempMemory();
}

void StandardGpuResources::setTempMemory(size_t size) {
    res_->setTempMemory(size);
}

void StandardGpuResources::setPinnedMemory(size_t size) {
    res_->setPinnedMemory(size);
}

void StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
    res_->setDefaultStream(device, stream);
}

void StandardGpuResources::revertDefaultStream(int device) {
    res_->revertDefaultStream(device);
}

void StandardGpuResources::setDefaultNullStreamAllDevices() {
    res_->setDefaultNullStreamAllDevices();
}

std::map<int, std::map<std::string, std::pair<int, size_t>>>
StandardGpuResources::getMemoryInfo() const {
    return res_->getMemoryInfo();
}

cudaStream_t StandardGpuResources::getDefaultStream(int device) {
    return res_->getDefaultStream(device);
}

size_t StandardGpuResources::getTempMemoryAvailable(int device) const {
    return res_->getTempMemoryAvailable(device);
}

void StandardGpuResources::syncDefaultStreamCurrentDevice() {
    res_->syncDefaultStreamCurrentDevice();
}

void StandardGpuResources::setLogMemoryAllocations(bool enable) {
    res_->setLogMemoryAllocations(enable);
}

} // namespace gpu
} // namespace faiss
```