faiss 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +103 -3
- data/ext/faiss/ext.cpp +99 -32
- data/ext/faiss/extconf.rb +12 -2
- data/lib/faiss/ext.bundle +0 -0
- data/lib/faiss/index.rb +3 -3
- data/lib/faiss/index_binary.rb +3 -3
- data/lib/faiss/kmeans.rb +1 -1
- data/lib/faiss/pca_matrix.rb +2 -2
- data/lib/faiss/product_quantizer.rb +3 -3
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/AutoTune.cpp +719 -0
- data/vendor/faiss/AutoTune.h +212 -0
- data/vendor/faiss/Clustering.cpp +261 -0
- data/vendor/faiss/Clustering.h +101 -0
- data/vendor/faiss/IVFlib.cpp +339 -0
- data/vendor/faiss/IVFlib.h +132 -0
- data/vendor/faiss/Index.cpp +171 -0
- data/vendor/faiss/Index.h +261 -0
- data/vendor/faiss/Index2Layer.cpp +437 -0
- data/vendor/faiss/Index2Layer.h +85 -0
- data/vendor/faiss/IndexBinary.cpp +77 -0
- data/vendor/faiss/IndexBinary.h +163 -0
- data/vendor/faiss/IndexBinaryFlat.cpp +83 -0
- data/vendor/faiss/IndexBinaryFlat.h +54 -0
- data/vendor/faiss/IndexBinaryFromFloat.cpp +78 -0
- data/vendor/faiss/IndexBinaryFromFloat.h +52 -0
- data/vendor/faiss/IndexBinaryHNSW.cpp +325 -0
- data/vendor/faiss/IndexBinaryHNSW.h +56 -0
- data/vendor/faiss/IndexBinaryIVF.cpp +671 -0
- data/vendor/faiss/IndexBinaryIVF.h +211 -0
- data/vendor/faiss/IndexFlat.cpp +508 -0
- data/vendor/faiss/IndexFlat.h +175 -0
- data/vendor/faiss/IndexHNSW.cpp +1090 -0
- data/vendor/faiss/IndexHNSW.h +170 -0
- data/vendor/faiss/IndexIVF.cpp +909 -0
- data/vendor/faiss/IndexIVF.h +353 -0
- data/vendor/faiss/IndexIVFFlat.cpp +502 -0
- data/vendor/faiss/IndexIVFFlat.h +118 -0
- data/vendor/faiss/IndexIVFPQ.cpp +1207 -0
- data/vendor/faiss/IndexIVFPQ.h +161 -0
- data/vendor/faiss/IndexIVFPQR.cpp +219 -0
- data/vendor/faiss/IndexIVFPQR.h +65 -0
- data/vendor/faiss/IndexIVFSpectralHash.cpp +331 -0
- data/vendor/faiss/IndexIVFSpectralHash.h +75 -0
- data/vendor/faiss/IndexLSH.cpp +225 -0
- data/vendor/faiss/IndexLSH.h +87 -0
- data/vendor/faiss/IndexLattice.cpp +143 -0
- data/vendor/faiss/IndexLattice.h +68 -0
- data/vendor/faiss/IndexPQ.cpp +1188 -0
- data/vendor/faiss/IndexPQ.h +199 -0
- data/vendor/faiss/IndexPreTransform.cpp +288 -0
- data/vendor/faiss/IndexPreTransform.h +91 -0
- data/vendor/faiss/IndexReplicas.cpp +123 -0
- data/vendor/faiss/IndexReplicas.h +76 -0
- data/vendor/faiss/IndexScalarQuantizer.cpp +317 -0
- data/vendor/faiss/IndexScalarQuantizer.h +127 -0
- data/vendor/faiss/IndexShards.cpp +317 -0
- data/vendor/faiss/IndexShards.h +100 -0
- data/vendor/faiss/InvertedLists.cpp +623 -0
- data/vendor/faiss/InvertedLists.h +334 -0
- data/vendor/faiss/LICENSE +21 -0
- data/vendor/faiss/MatrixStats.cpp +252 -0
- data/vendor/faiss/MatrixStats.h +62 -0
- data/vendor/faiss/MetaIndexes.cpp +351 -0
- data/vendor/faiss/MetaIndexes.h +126 -0
- data/vendor/faiss/OnDiskInvertedLists.cpp +674 -0
- data/vendor/faiss/OnDiskInvertedLists.h +127 -0
- data/vendor/faiss/VectorTransform.cpp +1157 -0
- data/vendor/faiss/VectorTransform.h +322 -0
- data/vendor/faiss/c_api/AutoTune_c.cpp +83 -0
- data/vendor/faiss/c_api/AutoTune_c.h +64 -0
- data/vendor/faiss/c_api/Clustering_c.cpp +139 -0
- data/vendor/faiss/c_api/Clustering_c.h +117 -0
- data/vendor/faiss/c_api/IndexFlat_c.cpp +140 -0
- data/vendor/faiss/c_api/IndexFlat_c.h +115 -0
- data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +64 -0
- data/vendor/faiss/c_api/IndexIVFFlat_c.h +58 -0
- data/vendor/faiss/c_api/IndexIVF_c.cpp +92 -0
- data/vendor/faiss/c_api/IndexIVF_c.h +135 -0
- data/vendor/faiss/c_api/IndexLSH_c.cpp +37 -0
- data/vendor/faiss/c_api/IndexLSH_c.h +40 -0
- data/vendor/faiss/c_api/IndexShards_c.cpp +44 -0
- data/vendor/faiss/c_api/IndexShards_c.h +42 -0
- data/vendor/faiss/c_api/Index_c.cpp +105 -0
- data/vendor/faiss/c_api/Index_c.h +183 -0
- data/vendor/faiss/c_api/MetaIndexes_c.cpp +49 -0
- data/vendor/faiss/c_api/MetaIndexes_c.h +49 -0
- data/vendor/faiss/c_api/clone_index_c.cpp +23 -0
- data/vendor/faiss/c_api/clone_index_c.h +32 -0
- data/vendor/faiss/c_api/error_c.h +42 -0
- data/vendor/faiss/c_api/error_impl.cpp +27 -0
- data/vendor/faiss/c_api/error_impl.h +16 -0
- data/vendor/faiss/c_api/faiss_c.h +58 -0
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +96 -0
- data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +56 -0
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +52 -0
- data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +68 -0
- data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +17 -0
- data/vendor/faiss/c_api/gpu/GpuIndex_c.h +30 -0
- data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +38 -0
- data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +86 -0
- data/vendor/faiss/c_api/gpu/GpuResources_c.h +66 -0
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +54 -0
- data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +53 -0
- data/vendor/faiss/c_api/gpu/macros_impl.h +42 -0
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +220 -0
- data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +149 -0
- data/vendor/faiss/c_api/index_factory_c.cpp +26 -0
- data/vendor/faiss/c_api/index_factory_c.h +30 -0
- data/vendor/faiss/c_api/index_io_c.cpp +42 -0
- data/vendor/faiss/c_api/index_io_c.h +50 -0
- data/vendor/faiss/c_api/macros_impl.h +110 -0
- data/vendor/faiss/clone_index.cpp +147 -0
- data/vendor/faiss/clone_index.h +38 -0
- data/vendor/faiss/demos/demo_imi_flat.cpp +151 -0
- data/vendor/faiss/demos/demo_imi_pq.cpp +199 -0
- data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +146 -0
- data/vendor/faiss/demos/demo_sift1M.cpp +252 -0
- data/vendor/faiss/gpu/GpuAutoTune.cpp +95 -0
- data/vendor/faiss/gpu/GpuAutoTune.h +27 -0
- data/vendor/faiss/gpu/GpuCloner.cpp +403 -0
- data/vendor/faiss/gpu/GpuCloner.h +82 -0
- data/vendor/faiss/gpu/GpuClonerOptions.cpp +28 -0
- data/vendor/faiss/gpu/GpuClonerOptions.h +53 -0
- data/vendor/faiss/gpu/GpuDistance.h +52 -0
- data/vendor/faiss/gpu/GpuFaissAssert.h +29 -0
- data/vendor/faiss/gpu/GpuIndex.h +148 -0
- data/vendor/faiss/gpu/GpuIndexBinaryFlat.h +89 -0
- data/vendor/faiss/gpu/GpuIndexFlat.h +190 -0
- data/vendor/faiss/gpu/GpuIndexIVF.h +89 -0
- data/vendor/faiss/gpu/GpuIndexIVFFlat.h +85 -0
- data/vendor/faiss/gpu/GpuIndexIVFPQ.h +143 -0
- data/vendor/faiss/gpu/GpuIndexIVFScalarQuantizer.h +100 -0
- data/vendor/faiss/gpu/GpuIndicesOptions.h +30 -0
- data/vendor/faiss/gpu/GpuResources.cpp +52 -0
- data/vendor/faiss/gpu/GpuResources.h +73 -0
- data/vendor/faiss/gpu/StandardGpuResources.cpp +295 -0
- data/vendor/faiss/gpu/StandardGpuResources.h +114 -0
- data/vendor/faiss/gpu/impl/RemapIndices.cpp +43 -0
- data/vendor/faiss/gpu/impl/RemapIndices.h +24 -0
- data/vendor/faiss/gpu/perf/IndexWrapper-inl.h +71 -0
- data/vendor/faiss/gpu/perf/IndexWrapper.h +39 -0
- data/vendor/faiss/gpu/perf/PerfClustering.cpp +115 -0
- data/vendor/faiss/gpu/perf/PerfIVFPQAdd.cpp +139 -0
- data/vendor/faiss/gpu/perf/WriteIndex.cpp +102 -0
- data/vendor/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +130 -0
- data/vendor/faiss/gpu/test/TestGpuIndexFlat.cpp +371 -0
- data/vendor/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +550 -0
- data/vendor/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +450 -0
- data/vendor/faiss/gpu/test/TestGpuMemoryException.cpp +84 -0
- data/vendor/faiss/gpu/test/TestUtils.cpp +315 -0
- data/vendor/faiss/gpu/test/TestUtils.h +93 -0
- data/vendor/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +159 -0
- data/vendor/faiss/gpu/utils/DeviceMemory.cpp +77 -0
- data/vendor/faiss/gpu/utils/DeviceMemory.h +71 -0
- data/vendor/faiss/gpu/utils/DeviceUtils.h +185 -0
- data/vendor/faiss/gpu/utils/MemorySpace.cpp +89 -0
- data/vendor/faiss/gpu/utils/MemorySpace.h +44 -0
- data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +239 -0
- data/vendor/faiss/gpu/utils/StackDeviceMemory.h +129 -0
- data/vendor/faiss/gpu/utils/StaticUtils.h +83 -0
- data/vendor/faiss/gpu/utils/Timer.cpp +60 -0
- data/vendor/faiss/gpu/utils/Timer.h +52 -0
- data/vendor/faiss/impl/AuxIndexStructures.cpp +305 -0
- data/vendor/faiss/impl/AuxIndexStructures.h +246 -0
- data/vendor/faiss/impl/FaissAssert.h +95 -0
- data/vendor/faiss/impl/FaissException.cpp +66 -0
- data/vendor/faiss/impl/FaissException.h +71 -0
- data/vendor/faiss/impl/HNSW.cpp +818 -0
- data/vendor/faiss/impl/HNSW.h +275 -0
- data/vendor/faiss/impl/PolysemousTraining.cpp +953 -0
- data/vendor/faiss/impl/PolysemousTraining.h +158 -0
- data/vendor/faiss/impl/ProductQuantizer.cpp +876 -0
- data/vendor/faiss/impl/ProductQuantizer.h +242 -0
- data/vendor/faiss/impl/ScalarQuantizer.cpp +1628 -0
- data/vendor/faiss/impl/ScalarQuantizer.h +120 -0
- data/vendor/faiss/impl/ThreadedIndex-inl.h +192 -0
- data/vendor/faiss/impl/ThreadedIndex.h +80 -0
- data/vendor/faiss/impl/index_read.cpp +793 -0
- data/vendor/faiss/impl/index_write.cpp +558 -0
- data/vendor/faiss/impl/io.cpp +142 -0
- data/vendor/faiss/impl/io.h +98 -0
- data/vendor/faiss/impl/lattice_Zn.cpp +712 -0
- data/vendor/faiss/impl/lattice_Zn.h +199 -0
- data/vendor/faiss/index_factory.cpp +392 -0
- data/vendor/faiss/index_factory.h +25 -0
- data/vendor/faiss/index_io.h +75 -0
- data/vendor/faiss/misc/test_blas.cpp +84 -0
- data/vendor/faiss/tests/test_binary_flat.cpp +64 -0
- data/vendor/faiss/tests/test_dealloc_invlists.cpp +183 -0
- data/vendor/faiss/tests/test_ivfpq_codec.cpp +67 -0
- data/vendor/faiss/tests/test_ivfpq_indexing.cpp +98 -0
- data/vendor/faiss/tests/test_lowlevel_ivf.cpp +566 -0
- data/vendor/faiss/tests/test_merge.cpp +258 -0
- data/vendor/faiss/tests/test_omp_threads.cpp +14 -0
- data/vendor/faiss/tests/test_ondisk_ivf.cpp +220 -0
- data/vendor/faiss/tests/test_pairs_decoding.cpp +189 -0
- data/vendor/faiss/tests/test_params_override.cpp +231 -0
- data/vendor/faiss/tests/test_pq_encoding.cpp +98 -0
- data/vendor/faiss/tests/test_sliding_ivf.cpp +240 -0
- data/vendor/faiss/tests/test_threaded_index.cpp +253 -0
- data/vendor/faiss/tests/test_transfer_invlists.cpp +159 -0
- data/vendor/faiss/tutorial/cpp/1-Flat.cpp +98 -0
- data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +81 -0
- data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +93 -0
- data/vendor/faiss/tutorial/cpp/4-GPU.cpp +119 -0
- data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +99 -0
- data/vendor/faiss/utils/Heap.cpp +122 -0
- data/vendor/faiss/utils/Heap.h +495 -0
- data/vendor/faiss/utils/WorkerThread.cpp +126 -0
- data/vendor/faiss/utils/WorkerThread.h +61 -0
- data/vendor/faiss/utils/distances.cpp +765 -0
- data/vendor/faiss/utils/distances.h +243 -0
- data/vendor/faiss/utils/distances_simd.cpp +809 -0
- data/vendor/faiss/utils/extra_distances.cpp +336 -0
- data/vendor/faiss/utils/extra_distances.h +54 -0
- data/vendor/faiss/utils/hamming-inl.h +472 -0
- data/vendor/faiss/utils/hamming.cpp +792 -0
- data/vendor/faiss/utils/hamming.h +220 -0
- data/vendor/faiss/utils/random.cpp +192 -0
- data/vendor/faiss/utils/random.h +60 -0
- data/vendor/faiss/utils/utils.cpp +783 -0
- data/vendor/faiss/utils/utils.h +181 -0
- metadata +216 -2
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
#include <faiss/gpu/utils/StackDeviceMemory.h>
|
|
10
|
+
#include <faiss/gpu/utils/DeviceUtils.h>
|
|
11
|
+
#include <faiss/gpu/utils/MemorySpace.h>
|
|
12
|
+
#include <faiss/gpu/utils/StaticUtils.h>
|
|
13
|
+
#include <faiss/impl/FaissAssert.h>
|
|
14
|
+
#include <stdio.h>
|
|
15
|
+
#include <sstream>
|
|
16
|
+
|
|
17
|
+
namespace faiss { namespace gpu {
|
|
18
|
+
|
|
19
|
+
/// Builds a stack that owns a freshly allocated region of `sz` bytes on
/// device `d`. The region is obtained through allocMemorySpace while the
/// target device is made current via DeviceScope.
StackDeviceMemory::Stack::Stack(int d, size_t sz)
    : device_(d),
      isOwner_(true),
      start_(nullptr),
      end_(nullptr),
      size_(sz),
      head_(nullptr),
      mallocCurrent_(0),
      highWaterMemoryUsed_(0),
      highWaterMalloc_(0),
      cudaMallocWarning_(true) {
  DeviceScope scope(device_);

  allocMemorySpace(MemorySpace::Device, &start_, size_);

  // The managed range is [start_, end_); nothing is handed out yet, so
  // the head sits at the very beginning
  end_ = start_ + size_;
  head_ = start_;
}
|
|
37
|
+
|
|
38
|
+
/// Builds a stack on top of an existing region [p, p + sz) for device
/// `d`. `isOwner` records whether the destructor should release `p`.
StackDeviceMemory::Stack::Stack(int d, void* p, size_t sz, bool isOwner)
    : device_(d),
      isOwner_(isOwner),
      start_(static_cast<char*>(p)),
      end_(static_cast<char*>(p) + sz),
      size_(sz),
      head_(static_cast<char*>(p)),
      mallocCurrent_(0),
      highWaterMemoryUsed_(0),
      highWaterMalloc_(0),
      cudaMallocWarning_(true) {
  // Nothing to allocate; we only reference the caller's region
}
|
|
50
|
+
|
|
51
|
+
/// Releases the managed region, but only if this stack owns it.
StackDeviceMemory::Stack::~Stack() {
  if (!isOwner_) {
    // The region belongs to someone else; leave it alone
    return;
  }

  DeviceScope scope(device_);

  freeMemorySpace(MemorySpace::Device, start_);
}
|
|
58
|
+
|
|
59
|
+
size_t
|
|
60
|
+
StackDeviceMemory::Stack::getSizeAvailable() const {
|
|
61
|
+
return (end_ - head_);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
char*
|
|
65
|
+
StackDeviceMemory::Stack::getAlloc(size_t size,
|
|
66
|
+
cudaStream_t stream) {
|
|
67
|
+
if (size > (end_ - head_)) {
|
|
68
|
+
// Too large for our stack
|
|
69
|
+
DeviceScope s(device_);
|
|
70
|
+
|
|
71
|
+
if (cudaMallocWarning_) {
|
|
72
|
+
// Print our requested size before we attempt the allocation
|
|
73
|
+
fprintf(stderr, "WARN: increase temp memory to avoid cudaMalloc, "
|
|
74
|
+
"or decrease query/add size (alloc %zu B, highwater %zu B)\n",
|
|
75
|
+
size, highWaterMalloc_);
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
char* p = nullptr;
|
|
79
|
+
allocMemorySpace(MemorySpace::Device, &p, size);
|
|
80
|
+
|
|
81
|
+
mallocCurrent_ += size;
|
|
82
|
+
highWaterMalloc_ = std::max(highWaterMalloc_, mallocCurrent_);
|
|
83
|
+
|
|
84
|
+
return p;
|
|
85
|
+
} else {
|
|
86
|
+
// We can make the allocation out of our stack
|
|
87
|
+
// Find all the ranges that we overlap that may have been
|
|
88
|
+
// previously allocated; our allocation will be [head, endAlloc)
|
|
89
|
+
char* startAlloc = head_;
|
|
90
|
+
char* endAlloc = head_ + size;
|
|
91
|
+
|
|
92
|
+
while (lastUsers_.size() > 0) {
|
|
93
|
+
auto& prevUser = lastUsers_.back();
|
|
94
|
+
|
|
95
|
+
// Because there is a previous user, we must overlap it
|
|
96
|
+
FAISS_ASSERT(prevUser.start_ <= endAlloc && prevUser.end_ >= startAlloc);
|
|
97
|
+
|
|
98
|
+
if (stream != prevUser.stream_) {
|
|
99
|
+
// Synchronization required
|
|
100
|
+
// FIXME
|
|
101
|
+
FAISS_ASSERT(false);
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
if (endAlloc < prevUser.end_) {
|
|
105
|
+
// Update the previous user info
|
|
106
|
+
prevUser.start_ = endAlloc;
|
|
107
|
+
|
|
108
|
+
break;
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
// If we're the exact size of the previous request, then we
|
|
112
|
+
// don't need to continue
|
|
113
|
+
bool done = (prevUser.end_ == endAlloc);
|
|
114
|
+
|
|
115
|
+
lastUsers_.pop_back();
|
|
116
|
+
|
|
117
|
+
if (done) {
|
|
118
|
+
break;
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
head_ = endAlloc;
|
|
123
|
+
FAISS_ASSERT(head_ <= end_);
|
|
124
|
+
|
|
125
|
+
highWaterMemoryUsed_ = std::max(highWaterMemoryUsed_,
|
|
126
|
+
(size_t) (head_ - start_));
|
|
127
|
+
return startAlloc;
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
void
|
|
132
|
+
StackDeviceMemory::Stack::returnAlloc(char* p,
|
|
133
|
+
size_t size,
|
|
134
|
+
cudaStream_t stream) {
|
|
135
|
+
if (p < start_ || p >= end_) {
|
|
136
|
+
// This is not on our stack; it was a one-off allocation
|
|
137
|
+
DeviceScope s(device_);
|
|
138
|
+
|
|
139
|
+
freeMemorySpace(MemorySpace::Device, p);
|
|
140
|
+
|
|
141
|
+
FAISS_ASSERT(mallocCurrent_ >= size);
|
|
142
|
+
mallocCurrent_ -= size;
|
|
143
|
+
} else {
|
|
144
|
+
// This is on our stack
|
|
145
|
+
// Allocations should be freed in the reverse order they are made
|
|
146
|
+
FAISS_ASSERT(p + size == head_);
|
|
147
|
+
|
|
148
|
+
head_ = p;
|
|
149
|
+
lastUsers_.push_back(Range(p, p + size, stream));
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
std::string
|
|
154
|
+
StackDeviceMemory::Stack::toString() const {
|
|
155
|
+
std::stringstream s;
|
|
156
|
+
|
|
157
|
+
s << "SDM device " << device_ << ": Total memory " << size_ << " ["
|
|
158
|
+
<< (void*) start_ << ", " << (void*) end_ << ")\n";
|
|
159
|
+
s << " Available memory " << (size_t) (end_ - head_)
|
|
160
|
+
<< " [" << (void*) head_ << ", " << (void*) end_ << ")\n";
|
|
161
|
+
s << " High water temp alloc " << highWaterMemoryUsed_ << "\n";
|
|
162
|
+
s << " High water cudaMalloc " << highWaterMalloc_ << "\n";
|
|
163
|
+
|
|
164
|
+
int i = lastUsers_.size();
|
|
165
|
+
for (auto it = lastUsers_.rbegin(); it != lastUsers_.rend(); ++it) {
|
|
166
|
+
s << i-- << ": size " << (size_t) (it->end_ - it->start_)
|
|
167
|
+
<< " stream " << it->stream_
|
|
168
|
+
<< " [" << (void*) it->start_ << ", " << (void*) it->end_ << ")\n";
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
return s.str();
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
size_t
|
|
175
|
+
StackDeviceMemory::Stack::getHighWaterCudaMalloc() const {
|
|
176
|
+
return highWaterMalloc_;
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
/// Creates a manager for `device` whose stack allocates and owns
/// `allocPerDevice` bytes.
StackDeviceMemory::StackDeviceMemory(int device, size_t allocPerDevice)
    : device_(device),
      stack_(device, allocPerDevice) {
  // All work happens in the Stack constructor
}
|
|
183
|
+
|
|
184
|
+
/// Creates a manager for `device` on top of the caller-provided region
/// [p, p + size); `isOwner` is forwarded to the stack.
StackDeviceMemory::StackDeviceMemory(int device,
                                     void* p,
                                     size_t size,
                                     bool isOwner)
    : device_(device),
      stack_(device, p, size, isOwner) {
  // All work happens in the Stack constructor
}
|
|
189
|
+
|
|
190
|
+
/// Nothing to do explicitly; destroying stack_ takes care of the region.
StackDeviceMemory::~StackDeviceMemory() = default;
|
|
192
|
+
|
|
193
|
+
void
|
|
194
|
+
StackDeviceMemory::setCudaMallocWarning(bool b) {
|
|
195
|
+
stack_.cudaMallocWarning_ = b;
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
int
|
|
199
|
+
StackDeviceMemory::getDevice() const {
|
|
200
|
+
return device_;
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
/// Reserves temporary memory for use on `stream`. The requested size is
/// rounded up to a multiple of 16 bytes before it is passed to the stack,
/// and the rounded size is what the reservation records.
DeviceMemoryReservation
StackDeviceMemory::getMemory(cudaStream_t stream, size_t size) {
  // We guarantee 16 byte alignment for allocations, so every request is
  // padded to the next multiple of 16
  size = utils::roundUp(size, (size_t) 16);

  char* data = stack_.getAlloc(size, stream);

  return DeviceMemoryReservation(this, device_, data, size, stream);
}
|
|
215
|
+
|
|
216
|
+
size_t
|
|
217
|
+
StackDeviceMemory::getSizeAvailable() const {
|
|
218
|
+
return stack_.getSizeAvailable();
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
std::string
|
|
222
|
+
StackDeviceMemory::toString() const {
|
|
223
|
+
return stack_.toString();
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
size_t
|
|
227
|
+
StackDeviceMemory::getHighWaterCudaMalloc() const {
|
|
228
|
+
return stack_.getHighWaterCudaMalloc();
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
void
|
|
232
|
+
StackDeviceMemory::returnAllocation(DeviceMemoryReservation& m) {
|
|
233
|
+
FAISS_ASSERT(m.get());
|
|
234
|
+
FAISS_ASSERT(device_ == m.device());
|
|
235
|
+
|
|
236
|
+
stack_.returnAlloc((char*) m.get(), m.size(), m.stream());
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
} } // namespace
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
#pragma once
|
|
10
|
+
|
|
11
|
+
#include <faiss/gpu/utils/DeviceMemory.h>
|
|
12
|
+
#include <list>
|
|
13
|
+
#include <memory>
|
|
14
|
+
#include <unordered_map>
|
|
15
|
+
|
|
16
|
+
namespace faiss { namespace gpu {
|
|
17
|
+
|
|
18
|
+
/// Device memory manager that provides temporary memory allocations
|
|
19
|
+
/// out of a region of memory
|
|
20
|
+
class StackDeviceMemory : public DeviceMemory {
|
|
21
|
+
public:
|
|
22
|
+
/// Allocate a new region of memory that we manage
|
|
23
|
+
explicit StackDeviceMemory(int device, size_t allocPerDevice);
|
|
24
|
+
|
|
25
|
+
/// Manage a region of memory for a particular device, with or
|
|
26
|
+
/// without ownership
|
|
27
|
+
StackDeviceMemory(int device, void* p, size_t size, bool isOwner);
|
|
28
|
+
|
|
29
|
+
~StackDeviceMemory() override;
|
|
30
|
+
|
|
31
|
+
/// Enable or disable the warning about not having enough temporary memory
|
|
32
|
+
/// when cudaMalloc gets called
|
|
33
|
+
void setCudaMallocWarning(bool b);
|
|
34
|
+
|
|
35
|
+
int getDevice() const override;
|
|
36
|
+
|
|
37
|
+
DeviceMemoryReservation getMemory(cudaStream_t stream,
|
|
38
|
+
size_t size) override;
|
|
39
|
+
|
|
40
|
+
size_t getSizeAvailable() const override;
|
|
41
|
+
std::string toString() const override;
|
|
42
|
+
size_t getHighWaterCudaMalloc() const override;
|
|
43
|
+
|
|
44
|
+
protected:
|
|
45
|
+
void returnAllocation(DeviceMemoryReservation& m) override;
|
|
46
|
+
|
|
47
|
+
protected:
|
|
48
|
+
/// Previous allocation ranges and the streams for which
|
|
49
|
+
/// synchronization is required
|
|
50
|
+
struct Range {
|
|
51
|
+
inline Range(char* s, char* e, cudaStream_t str) :
|
|
52
|
+
start_(s), end_(e), stream_(str) {
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
// References a memory range [start, end)
|
|
56
|
+
char* start_;
|
|
57
|
+
char* end_;
|
|
58
|
+
cudaStream_t stream_;
|
|
59
|
+
};
|
|
60
|
+
|
|
61
|
+
struct Stack {
|
|
62
|
+
/// Constructor that allocates memory via cudaMalloc
|
|
63
|
+
Stack(int device, size_t size);
|
|
64
|
+
|
|
65
|
+
/// Constructor that references a pre-allocated region of memory
|
|
66
|
+
Stack(int device, void* p, size_t size, bool isOwner);
|
|
67
|
+
~Stack();
|
|
68
|
+
|
|
69
|
+
/// Returns how much size is available for an allocation without
|
|
70
|
+
/// calling cudaMalloc
|
|
71
|
+
size_t getSizeAvailable() const;
|
|
72
|
+
|
|
73
|
+
/// Obtains an allocation; all allocations are guaranteed to be 16
|
|
74
|
+
/// byte aligned
|
|
75
|
+
char* getAlloc(size_t size, cudaStream_t stream);
|
|
76
|
+
|
|
77
|
+
/// Returns an allocation
|
|
78
|
+
void returnAlloc(char* p, size_t size, cudaStream_t stream);
|
|
79
|
+
|
|
80
|
+
/// Returns the stack state
|
|
81
|
+
std::string toString() const;
|
|
82
|
+
|
|
83
|
+
/// Returns the high-water mark of cudaMalloc activity
|
|
84
|
+
size_t getHighWaterCudaMalloc() const;
|
|
85
|
+
|
|
86
|
+
/// Device this allocation is on
|
|
87
|
+
int device_;
|
|
88
|
+
|
|
89
|
+
/// Do we own our region of memory?
|
|
90
|
+
bool isOwner_;
|
|
91
|
+
|
|
92
|
+
/// Where our allocation begins and ends
|
|
93
|
+
/// [start_, end_) is valid
|
|
94
|
+
char* start_;
|
|
95
|
+
char* end_;
|
|
96
|
+
|
|
97
|
+
/// Total size end_ - start_
|
|
98
|
+
size_t size_;
|
|
99
|
+
|
|
100
|
+
/// Stack head within [start, end)
|
|
101
|
+
char* head_;
|
|
102
|
+
|
|
103
|
+
/// List of previous last users of allocations on our stack, for
|
|
104
|
+
/// possible synchronization purposes
|
|
105
|
+
std::list<Range> lastUsers_;
|
|
106
|
+
|
|
107
|
+
/// How much cudaMalloc memory is currently outstanding?
|
|
108
|
+
size_t mallocCurrent_;
|
|
109
|
+
|
|
110
|
+
/// What's the high water mark in terms of memory used from the
|
|
111
|
+
/// temporary buffer?
|
|
112
|
+
size_t highWaterMemoryUsed_;
|
|
113
|
+
|
|
114
|
+
/// What's the high water mark in terms of memory allocated via
|
|
115
|
+
/// cudaMalloc?
|
|
116
|
+
size_t highWaterMalloc_;
|
|
117
|
+
|
|
118
|
+
/// Whether or not a warning upon cudaMalloc is generated
|
|
119
|
+
bool cudaMallocWarning_;
|
|
120
|
+
};
|
|
121
|
+
|
|
122
|
+
/// Our device
|
|
123
|
+
int device_;
|
|
124
|
+
|
|
125
|
+
/// Memory stack
|
|
126
|
+
Stack stack_;
|
|
127
|
+
};
|
|
128
|
+
|
|
129
|
+
} } // namespace
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
#pragma once
|
|
10
|
+
|
|
11
|
+
#include <cuda.h>
|
|
12
|
+
|
|
13
|
+
namespace faiss { namespace gpu { namespace utils {
|
|
14
|
+
|
|
15
|
+
/// Quotient a / b, evaluated in the common type of `a + b`.
template <typename A, typename B>
constexpr __host__ __device__ auto divDown(A a, B b) -> decltype(a + b) {
  return a / b;
}
|
|
19
|
+
|
|
20
|
+
/// Computes (a + b - 1) / b in the common type of `a + b`; for positive
/// integers this is a / b rounded up.
template <typename A, typename B>
constexpr __host__ __device__ auto divUp(A a, B b) -> decltype(a + b) {
  return ((a + b) - 1) / b;
}
|
|
24
|
+
|
|
25
|
+
/// divDown(a, b) scaled back up by `b`: for positive integers, `a`
/// rounded down to a multiple of `b`.
template <typename A, typename B>
constexpr __host__ __device__ auto roundDown(A a, B b) -> decltype(a + b) {
  return b * divDown(a, b);
}
|
|
29
|
+
|
|
30
|
+
/// divUp(a, b) scaled back up by `b`: for positive integers, `a`
/// rounded up to a multiple of `b`.
template <typename A, typename B>
constexpr __host__ __device__ auto roundUp(A a, B b) -> decltype(a + b) {
  return b * divUp(a, b);
}
|
|
34
|
+
|
|
35
|
+
/// n raised to `power` by repeated multiplication; any `power` that is
/// not greater than zero yields 1.
template <class T>
constexpr __host__ __device__ T pow(T n, T power) {
  return !(power > 0) ? 1 : n * pow(n, power - 1);
}
|
|
39
|
+
|
|
40
|
+
/// 2 raised to the power `n`, computed in type T.
///
/// The base is cast to T so that both arguments of pow(T, T) have the
/// same type. With a plain `2` the call only deduces for T = int; for
/// any other T template deduction fails.
template <class T>
constexpr __host__ __device__ T pow2(T n) {
  return pow((T) 2, n);
}

static_assert(pow2(8) == 256, "pow2");
static_assert(pow2((size_t) 8) == (size_t) 256, "pow2");
|
|
46
|
+
|
|
47
|
+
/// Counts how many times `n` can be halved before it is no larger than
/// 1, starting the count at `p`. For positive integers (with the default
/// p = 0) this is the base-2 logarithm rounded down.
template <typename T>
constexpr __host__ __device__ int log2(T n, int p = 0) {
  return !(n <= 1) ? log2(n / 2, p + 1) : p;
}

static_assert(log2(2) == 1, "log2");
static_assert(log2(3) == 1, "log2");
static_assert(log2(4) == 2, "log2");
|
|
55
|
+
|
|
56
|
+
/// True when `v` is non-zero and has no bit in common with `v - 1`,
/// i.e. exactly one bit is set.
template <typename T>
constexpr __host__ __device__ bool isPowerOf2(T v) {
  return v != 0 && (v & (v - 1)) == 0;
}

static_assert(isPowerOf2(2048), "isPowerOf2");
static_assert(!isPowerOf2(3333), "isPowerOf2");
|
|
63
|
+
|
|
64
|
+
/// Smallest power of two strictly greater than `v`: a power of two is
/// doubled, anything else is bumped to 1 << (log2(v) + 1).
template <typename T>
constexpr __host__ __device__ T nextHighestPowerOf2(T v) {
  return !isPowerOf2(v) ? ((T) 1 << (log2(v) + 1)) : (T) 2 * v;
}

static_assert(nextHighestPowerOf2(1) == 2, "nextHighestPowerOf2");
static_assert(nextHighestPowerOf2(2) == 4, "nextHighestPowerOf2");
static_assert(nextHighestPowerOf2(3) == 4, "nextHighestPowerOf2");
static_assert(nextHighestPowerOf2(4) == 8, "nextHighestPowerOf2");

static_assert(nextHighestPowerOf2(15) == 16, "nextHighestPowerOf2");
static_assert(nextHighestPowerOf2(16) == 32, "nextHighestPowerOf2");
static_assert(nextHighestPowerOf2(17) == 32, "nextHighestPowerOf2");

static_assert(nextHighestPowerOf2(1536000000u) == 2147483648u,
              "nextHighestPowerOf2");
static_assert(nextHighestPowerOf2((size_t) 2147483648ULL) ==
              (size_t) 4294967296ULL, "nextHighestPowerOf2");
|
|
82
|
+
|
|
83
|
+
} } } // namespace
|