faiss 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +292 -291
- data/vendor/faiss/faiss/AutoTune.h +55 -56
- data/vendor/faiss/faiss/Clustering.cpp +334 -195
- data/vendor/faiss/faiss/Clustering.h +88 -35
- data/vendor/faiss/faiss/IVFlib.cpp +171 -195
- data/vendor/faiss/faiss/IVFlib.h +48 -51
- data/vendor/faiss/faiss/Index.cpp +85 -103
- data/vendor/faiss/faiss/Index.h +54 -48
- data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
- data/vendor/faiss/faiss/Index2Layer.h +22 -22
- data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
- data/vendor/faiss/faiss/IndexBinary.h +140 -132
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
- data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
- data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
- data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
- data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
- data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
- data/vendor/faiss/faiss/IndexFlat.h +35 -46
- data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
- data/vendor/faiss/faiss/IndexHNSW.h +57 -41
- data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
- data/vendor/faiss/faiss/IndexIVF.h +146 -113
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
- data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
- data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
- data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
- data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
- data/vendor/faiss/faiss/IndexLSH.h +21 -26
- data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
- data/vendor/faiss/faiss/IndexLattice.h +11 -16
- data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
- data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
- data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
- data/vendor/faiss/faiss/IndexNSG.h +85 -0
- data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
- data/vendor/faiss/faiss/IndexPQ.h +64 -67
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
- data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
- data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
- data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
- data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
- data/vendor/faiss/faiss/IndexRefine.h +22 -23
- data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
- data/vendor/faiss/faiss/IndexReplicas.h +62 -56
- data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
- data/vendor/faiss/faiss/IndexResidual.h +152 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
- data/vendor/faiss/faiss/IndexShards.cpp +256 -240
- data/vendor/faiss/faiss/IndexShards.h +85 -73
- data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
- data/vendor/faiss/faiss/MatrixStats.h +7 -10
- data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
- data/vendor/faiss/faiss/MetaIndexes.h +40 -34
- data/vendor/faiss/faiss/MetricType.h +7 -7
- data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
- data/vendor/faiss/faiss/VectorTransform.h +61 -89
- data/vendor/faiss/faiss/clone_index.cpp +77 -73
- data/vendor/faiss/faiss/clone_index.h +4 -9
- data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
- data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
- data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
- data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
- data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
- data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
- data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
- data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
- data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
- data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
- data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
- data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
- data/vendor/faiss/faiss/impl/FaissException.h +41 -29
- data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
- data/vendor/faiss/faiss/impl/HNSW.h +179 -200
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
- data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
- data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
- data/vendor/faiss/faiss/impl/NSG.h +199 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
- data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
- data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
- data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
- data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
- data/vendor/faiss/faiss/impl/io.cpp +75 -94
- data/vendor/faiss/faiss/impl/io.h +31 -41
- data/vendor/faiss/faiss/impl/io_macros.h +40 -29
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
- data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
- data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
- data/vendor/faiss/faiss/index_factory.cpp +269 -218
- data/vendor/faiss/faiss/index_factory.h +6 -7
- data/vendor/faiss/faiss/index_io.h +23 -26
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
- data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
- data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
- data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
- data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
- data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
- data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
- data/vendor/faiss/faiss/utils/Heap.h +186 -209
- data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
- data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
- data/vendor/faiss/faiss/utils/distances.cpp +301 -310
- data/vendor/faiss/faiss/utils/distances.h +133 -118
- data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
- data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
- data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
- data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
- data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
- data/vendor/faiss/faiss/utils/hamming.h +62 -85
- data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
- data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
- data/vendor/faiss/faiss/utils/partitioning.h +26 -21
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
- data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
- data/vendor/faiss/faiss/utils/random.cpp +39 -63
- data/vendor/faiss/faiss/utils/random.h +13 -16
- data/vendor/faiss/faiss/utils/simdlib.h +4 -2
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
- data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
- data/vendor/faiss/faiss/utils/utils.cpp +304 -287
- data/vendor/faiss/faiss/utils/utils.h +53 -48
- metadata +20 -2
@@ -5,210 +5,204 @@
|
|
5
5
|
* LICENSE file in the root directory of this source tree.
|
6
6
|
*/
|
7
7
|
|
8
|
-
|
9
|
-
#include <faiss/gpu/utils/StackDeviceMemory.h>
|
10
8
|
#include <faiss/gpu/utils/DeviceUtils.h>
|
9
|
+
#include <faiss/gpu/utils/StackDeviceMemory.h>
|
11
10
|
#include <faiss/gpu/utils/StaticUtils.h>
|
12
11
|
#include <faiss/impl/FaissAssert.h>
|
13
12
|
#include <algorithm>
|
14
13
|
#include <sstream>
|
15
14
|
|
16
|
-
namespace faiss {
|
15
|
+
namespace faiss {
|
16
|
+
namespace gpu {
|
17
17
|
|
18
18
|
namespace {
|
19
19
|
|
20
20
|
size_t adjustStackSize(size_t sz) {
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
21
|
+
if (sz == 0) {
|
22
|
+
return 0;
|
23
|
+
} else {
|
24
|
+
// ensure that we have at least 16 bytes, as all allocations are bumped
|
25
|
+
// up to 16
|
26
|
+
return utils::roundUp(sz, (size_t)16);
|
27
|
+
}
|
28
28
|
}
|
29
29
|
|
30
30
|
} // namespace
|
31
31
|
|
32
32
|
StackDeviceMemory::Stack::Stack(GpuResources* res, int d, size_t sz)
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
33
|
+
: res_(res),
|
34
|
+
device_(d),
|
35
|
+
alloc_(nullptr),
|
36
|
+
allocSize_(adjustStackSize(sz)),
|
37
|
+
start_(nullptr),
|
38
|
+
end_(nullptr),
|
39
|
+
head_(nullptr),
|
40
|
+
highWaterMemoryUsed_(0) {
|
41
|
+
if (allocSize_ == 0) {
|
42
|
+
return;
|
43
|
+
}
|
44
|
+
|
45
|
+
DeviceScope s(device_);
|
46
|
+
auto req = AllocRequest(
|
47
|
+
AllocType::TemporaryMemoryBuffer,
|
48
|
+
device_,
|
49
|
+
MemorySpace::Device,
|
50
|
+
res_->getDefaultStream(device_),
|
51
|
+
allocSize_);
|
52
|
+
|
53
|
+
alloc_ = (char*)res_->allocMemory(req);
|
54
|
+
FAISS_ASSERT_FMT(
|
55
|
+
alloc_,
|
56
|
+
"could not reserve temporary memory region of size %zu",
|
57
|
+
allocSize_);
|
58
|
+
|
59
|
+
// In order to disambiguate between our entire region of temporary memory
|
60
|
+
// versus the first allocation in the temporary memory region, ensure that
|
61
|
+
// the first address returned is +16 bytes from the beginning
|
62
|
+
start_ = alloc_ + 16;
|
63
|
+
head_ = start_;
|
64
|
+
end_ = alloc_ + allocSize_;
|
63
65
|
}
|
64
66
|
|
65
67
|
StackDeviceMemory::Stack::~Stack() {
|
66
|
-
|
68
|
+
DeviceScope s(device_);
|
67
69
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
70
|
+
// FIXME: make sure there are no outstanding memory allocations?
|
71
|
+
if (alloc_) {
|
72
|
+
res_->deallocMemory(device_, alloc_);
|
73
|
+
}
|
72
74
|
}
|
73
75
|
|
74
|
-
size_t
|
75
|
-
|
76
|
-
return (end_ - head_);
|
76
|
+
size_t StackDeviceMemory::Stack::getSizeAvailable() const {
|
77
|
+
return (end_ - head_);
|
77
78
|
}
|
78
79
|
|
79
|
-
char*
|
80
|
-
|
81
|
-
|
82
|
-
// The user must check to see that the allocation fit within us
|
83
|
-
auto sizeRemaining = getSizeAvailable();
|
80
|
+
char* StackDeviceMemory::Stack::getAlloc(size_t size, cudaStream_t stream) {
|
81
|
+
// The user must check to see that the allocation fit within us
|
82
|
+
auto sizeRemaining = getSizeAvailable();
|
84
83
|
|
85
|
-
|
84
|
+
FAISS_ASSERT(size <= sizeRemaining);
|
86
85
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
86
|
+
// We can make the allocation out of our stack
|
87
|
+
// Find all the ranges that we overlap that may have been
|
88
|
+
// previously allocated; our allocation will be [head, endAlloc)
|
89
|
+
char* startAlloc = head_;
|
90
|
+
char* endAlloc = head_ + size;
|
92
91
|
|
93
|
-
|
94
|
-
|
92
|
+
while (lastUsers_.size() > 0) {
|
93
|
+
auto& prevUser = lastUsers_.back();
|
95
94
|
|
96
|
-
|
97
|
-
|
95
|
+
// Because there is a previous user, we must overlap it
|
96
|
+
FAISS_ASSERT(
|
97
|
+
prevUser.start_ <= endAlloc && prevUser.end_ >= startAlloc);
|
98
98
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
99
|
+
if (stream != prevUser.stream_) {
|
100
|
+
// Synchronization required
|
101
|
+
streamWait({stream}, {prevUser.stream_});
|
102
|
+
}
|
103
103
|
|
104
|
-
|
105
|
-
|
106
|
-
|
104
|
+
if (endAlloc < prevUser.end_) {
|
105
|
+
// Update the previous user info
|
106
|
+
prevUser.start_ = endAlloc;
|
107
107
|
|
108
|
-
|
109
|
-
|
108
|
+
break;
|
109
|
+
}
|
110
110
|
|
111
|
-
|
112
|
-
|
113
|
-
|
111
|
+
// If we're the exact size of the previous request, then we
|
112
|
+
// don't need to continue
|
113
|
+
bool done = (prevUser.end_ == endAlloc);
|
114
114
|
|
115
|
-
|
115
|
+
lastUsers_.pop_back();
|
116
116
|
|
117
|
-
|
118
|
-
|
117
|
+
if (done) {
|
118
|
+
break;
|
119
|
+
}
|
119
120
|
}
|
120
|
-
}
|
121
121
|
|
122
|
-
|
123
|
-
|
122
|
+
head_ = endAlloc;
|
123
|
+
FAISS_ASSERT(head_ <= end_);
|
124
124
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
125
|
+
highWaterMemoryUsed_ =
|
126
|
+
std::max(highWaterMemoryUsed_, (size_t)(head_ - start_));
|
127
|
+
FAISS_ASSERT(startAlloc);
|
128
|
+
return startAlloc;
|
129
129
|
}
|
130
130
|
|
131
|
-
void
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
131
|
+
void StackDeviceMemory::Stack::returnAlloc(
|
132
|
+
char* p,
|
133
|
+
size_t size,
|
134
|
+
cudaStream_t stream) {
|
135
|
+
// This allocation should be within ourselves
|
136
|
+
FAISS_ASSERT(p >= start_ && p < end_);
|
137
137
|
|
138
|
-
|
139
|
-
|
138
|
+
// All allocations should have been adjusted to a multiple of 16 bytes
|
139
|
+
FAISS_ASSERT(size % 16 == 0);
|
140
140
|
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
141
|
+
// This is on our stack
|
142
|
+
// Allocations should be freed in the reverse order they are made
|
143
|
+
if (p + size != head_) {
|
144
|
+
FAISS_ASSERT(p + size == head_);
|
145
|
+
}
|
146
146
|
|
147
|
-
|
148
|
-
|
147
|
+
head_ = p;
|
148
|
+
lastUsers_.push_back(Range(p, p + size, stream));
|
149
149
|
}
|
150
150
|
|
151
|
-
std::string
|
152
|
-
|
153
|
-
std::stringstream s;
|
151
|
+
std::string StackDeviceMemory::Stack::toString() const {
|
152
|
+
std::stringstream s;
|
154
153
|
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
154
|
+
s << "SDM device " << device_ << ": Total memory " << allocSize_ << " ["
|
155
|
+
<< (void*)start_ << ", " << (void*)end_ << ")\n";
|
156
|
+
s << " Available memory " << (size_t)(end_ - head_) << " ["
|
157
|
+
<< (void*)head_ << ", " << (void*)end_ << ")\n";
|
158
|
+
s << " High water temp alloc " << highWaterMemoryUsed_ << "\n";
|
160
159
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
160
|
+
int i = lastUsers_.size();
|
161
|
+
for (auto it = lastUsers_.rbegin(); it != lastUsers_.rend(); ++it) {
|
162
|
+
s << i-- << ": size " << (size_t)(it->end_ - it->start_) << " stream "
|
163
|
+
<< it->stream_ << " [" << (void*)it->start_ << ", " << (void*)it->end_
|
164
|
+
<< ")\n";
|
165
|
+
}
|
167
166
|
|
168
|
-
|
167
|
+
return s.str();
|
169
168
|
}
|
170
169
|
|
171
|
-
StackDeviceMemory::StackDeviceMemory(
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
}
|
170
|
+
StackDeviceMemory::StackDeviceMemory(
|
171
|
+
GpuResources* res,
|
172
|
+
int device,
|
173
|
+
size_t allocPerDevice)
|
174
|
+
: device_(device), stack_(res, device, allocPerDevice) {}
|
177
175
|
|
178
|
-
StackDeviceMemory::~StackDeviceMemory() {
|
179
|
-
}
|
176
|
+
StackDeviceMemory::~StackDeviceMemory() {}
|
180
177
|
|
181
|
-
int
|
182
|
-
|
183
|
-
return device_;
|
178
|
+
int StackDeviceMemory::getDevice() const {
|
179
|
+
return device_;
|
184
180
|
}
|
185
181
|
|
186
|
-
size_t
|
187
|
-
|
188
|
-
return stack_.getSizeAvailable();
|
182
|
+
size_t StackDeviceMemory::getSizeAvailable() const {
|
183
|
+
return stack_.getSizeAvailable();
|
189
184
|
}
|
190
185
|
|
191
|
-
std::string
|
192
|
-
|
193
|
-
return stack_.toString();
|
186
|
+
std::string StackDeviceMemory::toString() const {
|
187
|
+
return stack_.toString();
|
194
188
|
}
|
195
189
|
|
196
|
-
void*
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
return stack_.getAlloc(size, stream);
|
190
|
+
void* StackDeviceMemory::allocMemory(cudaStream_t stream, size_t size) {
|
191
|
+
// All allocations should have been adjusted to a multiple of 16 bytes
|
192
|
+
FAISS_ASSERT(size % 16 == 0);
|
193
|
+
return stack_.getAlloc(size, stream);
|
201
194
|
}
|
202
195
|
|
203
|
-
void
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
196
|
+
void StackDeviceMemory::deallocMemory(
|
197
|
+
int device,
|
198
|
+
cudaStream_t stream,
|
199
|
+
size_t size,
|
200
|
+
void* p) {
|
201
|
+
FAISS_ASSERT(p);
|
202
|
+
FAISS_ASSERT(device == device_);
|
210
203
|
|
211
|
-
|
204
|
+
stack_.returnAlloc((char*)p, size, stream);
|
212
205
|
}
|
213
206
|
|
214
|
-
}
|
207
|
+
} // namespace gpu
|
208
|
+
} // namespace faiss
|
@@ -5,110 +5,108 @@
|
|
5
5
|
* LICENSE file in the root directory of this source tree.
|
6
6
|
*/
|
7
7
|
|
8
|
-
|
9
8
|
#pragma once
|
10
9
|
|
11
|
-
#include <faiss/gpu/GpuResources.h>
|
12
10
|
#include <cuda_runtime.h>
|
11
|
+
#include <faiss/gpu/GpuResources.h>
|
13
12
|
#include <list>
|
14
13
|
#include <memory>
|
15
|
-
#include <unordered_map>
|
16
14
|
#include <tuple>
|
15
|
+
#include <unordered_map>
|
17
16
|
|
18
|
-
namespace faiss {
|
17
|
+
namespace faiss {
|
18
|
+
namespace gpu {
|
19
19
|
|
20
20
|
/// Device memory manager that provides temporary memory allocations
|
21
21
|
/// out of a region of memory, for a single device
|
22
22
|
class StackDeviceMemory {
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
int device,
|
27
|
-
size_t allocPerDevice);
|
23
|
+
public:
|
24
|
+
/// Allocate a new region of memory that we manage
|
25
|
+
StackDeviceMemory(GpuResources* res, int device, size_t allocPerDevice);
|
28
26
|
|
29
|
-
|
30
|
-
|
31
|
-
|
27
|
+
/// Manage a region of memory for a particular device, with or
|
28
|
+
/// without ownership
|
29
|
+
StackDeviceMemory(int device, void* p, size_t size, bool isOwner);
|
32
30
|
|
33
|
-
|
31
|
+
~StackDeviceMemory();
|
34
32
|
|
35
|
-
|
33
|
+
int getDevice() const;
|
36
34
|
|
37
|
-
|
38
|
-
|
39
|
-
|
35
|
+
/// All allocations requested should be a multiple of 16 bytes
|
36
|
+
void* allocMemory(cudaStream_t stream, size_t size);
|
37
|
+
void deallocMemory(int device, cudaStream_t, size_t size, void* p);
|
40
38
|
|
41
|
-
|
42
|
-
|
39
|
+
size_t getSizeAvailable() const;
|
40
|
+
std::string toString() const;
|
43
41
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
}
|
42
|
+
protected:
|
43
|
+
/// Previous allocation ranges and the streams for which
|
44
|
+
/// synchronization is required
|
45
|
+
struct Range {
|
46
|
+
inline Range(char* s, char* e, cudaStream_t str)
|
47
|
+
: start_(s), end_(e), stream_(str) {}
|
51
48
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
49
|
+
// References a memory range [start, end)
|
50
|
+
char* start_;
|
51
|
+
char* end_;
|
52
|
+
cudaStream_t stream_;
|
53
|
+
};
|
57
54
|
|
58
|
-
|
59
|
-
|
60
|
-
|
55
|
+
struct Stack {
|
56
|
+
/// Constructor that allocates memory via cudaMalloc
|
57
|
+
Stack(GpuResources* res, int device, size_t size);
|
61
58
|
|
62
|
-
|
59
|
+
~Stack();
|
63
60
|
|
64
|
-
|
65
|
-
|
66
|
-
|
61
|
+
/// Returns how much size is available for an allocation without
|
62
|
+
/// calling cudaMalloc
|
63
|
+
size_t getSizeAvailable() const;
|
67
64
|
|
68
|
-
|
69
|
-
|
70
|
-
|
65
|
+
/// Obtains an allocation; all allocations are guaranteed to be 16
|
66
|
+
/// byte aligned
|
67
|
+
char* getAlloc(size_t size, cudaStream_t stream);
|
71
68
|
|
72
|
-
|
73
|
-
|
69
|
+
/// Returns an allocation
|
70
|
+
void returnAlloc(char* p, size_t size, cudaStream_t stream);
|
74
71
|
|
75
|
-
|
76
|
-
|
72
|
+
/// Returns the stack state
|
73
|
+
std::string toString() const;
|
77
74
|
|
78
|
-
|
79
|
-
|
75
|
+
/// Our GpuResources object
|
76
|
+
GpuResources* res_;
|
80
77
|
|
81
|
-
|
82
|
-
|
78
|
+
/// Device this allocation is on
|
79
|
+
int device_;
|
83
80
|
|
84
|
-
|
85
|
-
|
86
|
-
|
81
|
+
/// Where our temporary memory buffer is allocated; we allocate starting
|
82
|
+
/// 16 bytes into this
|
83
|
+
char* alloc_;
|
87
84
|
|
88
|
-
|
89
|
-
|
85
|
+
/// Total size of our allocation
|
86
|
+
size_t allocSize_;
|
90
87
|
|
91
|
-
|
92
|
-
|
93
|
-
|
88
|
+
/// Our temporary memory region; [start_, end_) is valid
|
89
|
+
char* start_;
|
90
|
+
char* end_;
|
94
91
|
|
95
|
-
|
96
|
-
|
92
|
+
/// Stack head within [start, end)
|
93
|
+
char* head_;
|
97
94
|
|
98
|
-
|
99
|
-
|
100
|
-
|
95
|
+
/// List of previous last users of allocations on our stack, for
|
96
|
+
/// possible synchronization purposes
|
97
|
+
std::list<Range> lastUsers_;
|
101
98
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
99
|
+
/// What's the high water mark in terms of memory used from the
|
100
|
+
/// temporary buffer?
|
101
|
+
size_t highWaterMemoryUsed_;
|
102
|
+
};
|
106
103
|
|
107
|
-
|
108
|
-
|
104
|
+
/// Our device
|
105
|
+
int device_;
|
109
106
|
|
110
|
-
|
111
|
-
|
107
|
+
/// Memory stack
|
108
|
+
Stack stack_;
|
112
109
|
};
|
113
110
|
|
114
|
-
}
|
111
|
+
} // namespace gpu
|
112
|
+
} // namespace faiss
|