cumo 0.1.0
- checksums.yaml +7 -0
- data/.gitignore +27 -0
- data/.travis.yml +5 -0
- data/3rd_party/mkmf-cu/.gitignore +36 -0
- data/3rd_party/mkmf-cu/Gemfile +3 -0
- data/3rd_party/mkmf-cu/LICENSE +21 -0
- data/3rd_party/mkmf-cu/README.md +36 -0
- data/3rd_party/mkmf-cu/Rakefile +11 -0
- data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
- data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
- data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +82 -0
- data/README.md +252 -0
- data/Rakefile +43 -0
- data/bench/broadcast_fp32.rb +138 -0
- data/bench/cumo_bench.rb +193 -0
- data/bench/numo_bench.rb +138 -0
- data/bench/reduction_fp32.rb +117 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/cumo.gemspec +32 -0
- data/ext/cumo/cuda/cublas.c +278 -0
- data/ext/cumo/cuda/driver.c +421 -0
- data/ext/cumo/cuda/memory_pool.cpp +185 -0
- data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
- data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
- data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
- data/ext/cumo/cuda/nvrtc.c +207 -0
- data/ext/cumo/cuda/runtime.c +167 -0
- data/ext/cumo/cumo.c +148 -0
- data/ext/cumo/depend.erb +58 -0
- data/ext/cumo/extconf.rb +179 -0
- data/ext/cumo/include/cumo.h +25 -0
- data/ext/cumo/include/cumo/compat.h +23 -0
- data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
- data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
- data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
- data/ext/cumo/include/cumo/cuda/driver.h +22 -0
- data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
- data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
- data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
- data/ext/cumo/include/cumo/indexer.h +238 -0
- data/ext/cumo/include/cumo/intern.h +142 -0
- data/ext/cumo/include/cumo/intern_fwd.h +38 -0
- data/ext/cumo/include/cumo/intern_kernel.h +6 -0
- data/ext/cumo/include/cumo/narray.h +429 -0
- data/ext/cumo/include/cumo/narray_kernel.h +149 -0
- data/ext/cumo/include/cumo/ndloop.h +95 -0
- data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
- data/ext/cumo/include/cumo/template.h +158 -0
- data/ext/cumo/include/cumo/template_kernel.h +77 -0
- data/ext/cumo/include/cumo/types/bit.h +40 -0
- data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
- data/ext/cumo/include/cumo/types/complex.h +402 -0
- data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
- data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
- data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
- data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
- data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
- data/ext/cumo/include/cumo/types/dfloat.h +47 -0
- data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
- data/ext/cumo/include/cumo/types/float_def.h +34 -0
- data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
- data/ext/cumo/include/cumo/types/float_macro.h +191 -0
- data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
- data/ext/cumo/include/cumo/types/int16.h +24 -0
- data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
- data/ext/cumo/include/cumo/types/int32.h +24 -0
- data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
- data/ext/cumo/include/cumo/types/int64.h +24 -0
- data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
- data/ext/cumo/include/cumo/types/int8.h +24 -0
- data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
- data/ext/cumo/include/cumo/types/int_macro.h +67 -0
- data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
- data/ext/cumo/include/cumo/types/real_accum.h +486 -0
- data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
- data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
- data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
- data/ext/cumo/include/cumo/types/robject.h +27 -0
- data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
- data/ext/cumo/include/cumo/types/scomplex.h +46 -0
- data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
- data/ext/cumo/include/cumo/types/sfloat.h +48 -0
- data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
- data/ext/cumo/include/cumo/types/uint16.h +25 -0
- data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint32.h +25 -0
- data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint64.h +25 -0
- data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint8.h +25 -0
- data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
- data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
- data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
- data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
- data/ext/cumo/narray/SFMT-params.h +97 -0
- data/ext/cumo/narray/SFMT-params19937.h +46 -0
- data/ext/cumo/narray/SFMT.c +620 -0
- data/ext/cumo/narray/SFMT.h +167 -0
- data/ext/cumo/narray/array.c +638 -0
- data/ext/cumo/narray/data.c +961 -0
- data/ext/cumo/narray/gen/cogen.rb +56 -0
- data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
- data/ext/cumo/narray/gen/def/bit.rb +37 -0
- data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
- data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
- data/ext/cumo/narray/gen/def/int16.rb +36 -0
- data/ext/cumo/narray/gen/def/int32.rb +36 -0
- data/ext/cumo/narray/gen/def/int64.rb +36 -0
- data/ext/cumo/narray/gen/def/int8.rb +36 -0
- data/ext/cumo/narray/gen/def/robject.rb +37 -0
- data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
- data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
- data/ext/cumo/narray/gen/def/uint16.rb +36 -0
- data/ext/cumo/narray/gen/def/uint32.rb +36 -0
- data/ext/cumo/narray/gen/def/uint64.rb +36 -0
- data/ext/cumo/narray/gen/def/uint8.rb +36 -0
- data/ext/cumo/narray/gen/erbpp2.rb +346 -0
- data/ext/cumo/narray/gen/narray_def.rb +268 -0
- data/ext/cumo/narray/gen/spec.rb +425 -0
- data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
- data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
- data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
- data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
- data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
- data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
- data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
- data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
- data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
- data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
- data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
- data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
- data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
- data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
- data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
- data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
- data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
- data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
- data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
- data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
- data/ext/cumo/narray/gen/tmpl/class.c +9 -0
- data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
- data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
- data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
- data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
- data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
- data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
- data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
- data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
- data/ext/cumo/narray/gen/tmpl/each.c +47 -0
- data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
- data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
- data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
- data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
- data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
- data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
- data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
- data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
- data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
- data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
- data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
- data/ext/cumo/narray/gen/tmpl/format.c +62 -0
- data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
- data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
- data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
- data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
- data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
- data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
- data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
- data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
- data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
- data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
- data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
- data/ext/cumo/narray/gen/tmpl/median.c +66 -0
- data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
- data/ext/cumo/narray/gen/tmpl/module.c +9 -0
- data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
- data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
- data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
- data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
- data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
- data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
- data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
- data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
- data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
- data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
- data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
- data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
- data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
- data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
- data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
- data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
- data/ext/cumo/narray/gen/tmpl/store.c +41 -0
- data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
- data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
- data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
- data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
- data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
- data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
- data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
- data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
- data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
- data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
- data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
- data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
- data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
- data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
- data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
- data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
- data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
- data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
- data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
- data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
- data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
- data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
- data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
- data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
- data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
- data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
- data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
- data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
- data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
- data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
- data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
- data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
- data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
- data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
- data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
- data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
- data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
- data/ext/cumo/narray/index.c +880 -0
- data/ext/cumo/narray/kwargs.c +153 -0
- data/ext/cumo/narray/math.c +142 -0
- data/ext/cumo/narray/narray.c +1948 -0
- data/ext/cumo/narray/ndloop.c +2105 -0
- data/ext/cumo/narray/rand.c +45 -0
- data/ext/cumo/narray/step.c +474 -0
- data/ext/cumo/narray/struct.c +886 -0
- data/lib/cumo.rb +3 -0
- data/lib/cumo/cuda.rb +11 -0
- data/lib/cumo/cuda/compile_error.rb +36 -0
- data/lib/cumo/cuda/compiler.rb +161 -0
- data/lib/cumo/cuda/device.rb +47 -0
- data/lib/cumo/cuda/link_state.rb +31 -0
- data/lib/cumo/cuda/module.rb +40 -0
- data/lib/cumo/cuda/nvrtc_program.rb +27 -0
- data/lib/cumo/linalg.rb +12 -0
- data/lib/cumo/narray.rb +2 -0
- data/lib/cumo/narray/extra.rb +1278 -0
- data/lib/erbpp.rb +294 -0
- data/lib/erbpp/line_number.rb +137 -0
- data/lib/erbpp/narray_def.rb +381 -0
- data/numo-narray-version +1 -0
- data/run.gdb +7 -0
- metadata +353 -0
data/ext/cumo/cuda/memory_pool_impl.cpp @@ -0,0 +1,308 @@

#include "memory_pool_impl.hpp"

#include <ruby.h>

namespace cumo {
namespace internal {

void CheckStatus(cudaError_t status) {
    if (status != 0) {
        throw CUDARuntimeError(status);
    }
}

Memory::Memory(size_t size) : size_(size) {
    if (size_ > 0) {
        CheckStatus(cudaGetDevice(&device_id_));
        CheckStatus(cudaMallocManaged(&ptr_, size_, cudaMemAttachGlobal));
        // std::cout << "cudaMalloc " << ptr_ << std::endl;
    }
}

Memory::~Memory() {
    if (size_ > 0) {
        // std::cout << "cudaFree " << ptr_ << std::endl;
        cudaError_t status = cudaFree(ptr_);
        // The CUDA driver may shut down before the memory pool frees its memory.
        // It is okay to simply ignore this case because the driver frees the memory automatically.
        if (status != cudaErrorCudartUnloading) {
            CheckStatus(status);
        }
    }
}

std::shared_ptr<Chunk> Split(std::shared_ptr<Chunk>& self, size_t size) {
    assert(self->size_ >= size);
    if (self->size_ == size) {
        return nullptr;
    }

    auto remaining = std::make_shared<Chunk>(self->mem_, self->offset_ + size, self->size_ - size, self->stream_ptr_);
    self->size_ = size;

    if (self->next_) {
        remaining->set_next(std::move(self->next_));
        remaining->next()->set_prev(remaining);
    }
    self->next_ = remaining;
    remaining->set_prev(self);

    return remaining;
}


void Merge(std::shared_ptr<Chunk>& self, std::shared_ptr<Chunk> remaining) {
    assert(remaining != nullptr);
    assert(self->stream_ptr_ == remaining->stream_ptr());
    self->size_ += remaining->size();
    self->next_ = remaining->next();
    if (remaining->next() != nullptr) {
        self->next_->set_prev(self);
    }
}

void SingleDeviceMemoryPool::AppendToFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr) {
    assert(chunk != nullptr && !chunk->in_use());
    int bin_index = GetBinIndex(size);

    std::lock_guard<std::recursive_mutex> lock{mutex_};

    Arena& arena = GetArena(stream_ptr);
    ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
    int arena_index = std::lower_bound(arena_index_map.begin(), arena_index_map.end(), bin_index) - arena_index_map.begin();
    int length = static_cast<int>(arena_index_map.size());
    if (arena_index >= length || arena_index_map.at(arena_index) != bin_index) {
        arena_index_map.insert(arena_index_map.begin() + arena_index, bin_index);
        arena.insert(arena.begin() + arena_index, FreeList{});
    }
    FreeList& free_list = arena[arena_index];
    free_list.emplace_back(chunk);
}

bool SingleDeviceMemoryPool::RemoveFromFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr) {
    assert(chunk != nullptr && !chunk->in_use());
    int bin_index = GetBinIndex(size);

    std::lock_guard<std::recursive_mutex> lock{mutex_};

    Arena& arena = GetArena(stream_ptr);
    ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
    if (arena_index_map.size() == 0) {
        return false;
    }
    int arena_index = std::lower_bound(arena_index_map.begin(), arena_index_map.end(), bin_index) - arena_index_map.begin();
    if (static_cast<size_t>(arena_index) == arena_index_map.size()) {
        // Bin does not exist for the given chunk size.
        return false;
    }
    if (arena_index_map.at(arena_index) != bin_index) {
        return false;
    }
    assert(arena.size() > static_cast<size_t>(arena_index));
    FreeList& free_list = arena[arena_index];
    return EraseFromFreeList(free_list, chunk);
}

intptr_t SingleDeviceMemoryPool::Malloc(size_t size, cudaStream_t stream_ptr) {
    size = GetRoundedSize(size);
    std::shared_ptr<Chunk> chunk = nullptr;

    {
        std::lock_guard<std::recursive_mutex> lock{mutex_};

        // Find the best fit, or the smallest larger allocation.
        Arena& arena = GetArena(stream_ptr);
        int arena_index = GetArenaIndex(size);
        int arena_length = static_cast<int>(arena.size());
        for (int i = arena_index; i < arena_length; ++i) {
            FreeList& free_list = arena[i];
            if (free_list.empty()) {
                continue;
            }
            chunk = PopFromFreeList(free_list);
            // TODO(sonots): compact_index
            break;
        }
    }

    if (chunk != nullptr) {
        std::shared_ptr<Chunk> remaining = Split(chunk, size);
        if (remaining != nullptr) {
            AppendToFreeList(remaining->size(), remaining, stream_ptr);
        }
    } else {
        // cudaMalloc if a cached block is not found.
        std::shared_ptr<Memory> mem = nullptr;
        try {
            mem = std::make_shared<Memory>(size);
        } catch (const CUDARuntimeError& e) {
            if (e.status() != cudaErrorMemoryAllocation) {
                throw;
            }
            // Retry after freeing all cached (non-split) blocks.
            FreeAllBlocks();
            try {
                mem = std::make_shared<Memory>(size);
            } catch (const CUDARuntimeError& e) {
                if (e.status() != cudaErrorMemoryAllocation) {
                    throw;
                }
#ifdef NO_RUBY // cpp test does not bind with libruby
                size_t total = size + GetTotalBytes();
                throw OutOfMemoryError(size, total);
#else
                // As a last resort, run Ruby's GC and retry once more.
                rb_funcall(rb_define_module("GC"), rb_intern("start"), 0);
                try {
                    mem = std::make_shared<Memory>(size);
                } catch (const CUDARuntimeError& e) {
                    if (e.status() != cudaErrorMemoryAllocation) {
                        throw;
                    }
                    size_t total = size + GetTotalBytes();
                    throw OutOfMemoryError(size, total);
                }
#endif
            }
        }
        chunk = std::make_shared<Chunk>(mem, 0, size, stream_ptr);
    }

    assert(chunk != nullptr);
    assert(chunk->stream_ptr() == stream_ptr);
    {
        std::lock_guard<std::recursive_mutex> lock{mutex_};

        chunk->set_in_use(true);
        in_use_.emplace(chunk->ptr(), chunk);
    }
    return chunk->ptr();
}

void SingleDeviceMemoryPool::Free(intptr_t ptr, cudaStream_t stream_ptr) {
    std::shared_ptr<Chunk> chunk = nullptr;

    {
        std::lock_guard<std::recursive_mutex> lock{mutex_};

        chunk = in_use_[ptr];
        // assert(chunk != nullptr);
        if (!chunk) return;
        chunk->set_in_use(false);
        in_use_.erase(ptr);
    }

    if (chunk->next() != nullptr && !chunk->next()->in_use()) {
        if (RemoveFromFreeList(chunk->next()->size(), chunk->next(), stream_ptr)) {
            Merge(chunk, chunk->next());
        }
    }
    if (chunk->prev() != nullptr && !chunk->prev()->in_use()) {
        if (RemoveFromFreeList(chunk->prev()->size(), chunk->prev(), stream_ptr)) {
            chunk = chunk->prev();
            Merge(chunk, chunk->next());
        }
    }
    AppendToFreeList(chunk->size(), chunk, stream_ptr);
}

void SingleDeviceMemoryPool::CompactIndex(cudaStream_t stream_ptr, bool free) {
    // The caller must hold the lock around this function.
    if (!HasArena(stream_ptr)) return;

    Arena new_arena;
    ArenaIndexMap new_arena_index_map;
    Arena& arena = GetArena(stream_ptr);
    ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
    size_t arena_length = arena.size();
    for (size_t arena_index = 0; arena_index < arena_length; ++arena_index) {
        FreeList& free_list = arena[arena_index];
        if (free_list.empty()) {
            continue;
        }
        if (free) {
            FreeList keep_list;
            for (auto chunk : free_list) {
                if (chunk->prev() != nullptr || chunk->next() != nullptr) {
                    keep_list.emplace_back(chunk);
                }
            }
            if (keep_list.size() == 0) {
                continue;
            }
            new_arena_index_map.emplace_back(arena_index_map[arena_index]);
            new_arena.emplace_back(keep_list);
        } else {
            new_arena_index_map.emplace_back(arena_index_map[arena_index]);
            new_arena.emplace_back(free_list);
        }
    }
    if (new_arena.empty()) {
        index_.erase(stream_ptr);
        free_.erase(stream_ptr);
    } else {
        arena_index_map.swap(new_arena_index_map);
        arena.swap(new_arena);
    }
}

// Free all **non-split** chunks in all arenas.
void SingleDeviceMemoryPool::FreeAllBlocks() {
    std::lock_guard<std::recursive_mutex> lock{mutex_};

    std::vector<cudaStream_t> keys(free_.size());
    transform(free_.begin(), free_.end(), keys.begin(), [](auto pair) { return pair.first; });
    for (cudaStream_t stream_ptr : keys) {
        CompactIndex(stream_ptr, true);
    }
}

// Free all **non-split** chunks in the specified arena.
void SingleDeviceMemoryPool::FreeAllBlocks(cudaStream_t stream_ptr) {
    std::lock_guard<std::recursive_mutex> lock{mutex_};

    CompactIndex(stream_ptr, true);
}

size_t SingleDeviceMemoryPool::GetNumFreeBlocks() {
    size_t n = 0;

    std::lock_guard<std::recursive_mutex> lock{mutex_};

    for (auto kv : free_) {
        Arena& arena = kv.second;
        for (auto free_list : arena) {
            n += free_list.size();
        }
    }
    return n;
}

size_t SingleDeviceMemoryPool::GetUsedBytes() {
    size_t size = 0;

    std::lock_guard<std::recursive_mutex> lock{mutex_};

    for (auto kv : in_use_) {
        std::shared_ptr<Chunk>& chunk = kv.second;
        if (chunk) size += chunk->size();
    }
    return size;
}

size_t SingleDeviceMemoryPool::GetFreeBytes() {
    size_t size = 0;

    std::lock_guard<std::recursive_mutex> lock{mutex_};

    for (auto kv : free_) {
        Arena& arena = kv.second;
        for (auto free_list : arena) {
            for (auto chunk : free_list) {
                if (chunk) size += chunk->size();
            }
        }
    }
    return size;
}

} // namespace internal
} // namespace cumo
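Malloc above rounds every request up to a multiple of 512 bytes and then scans the free lists from the bin of that rounded size upward, splitting a larger cached chunk when needed; Free coalesces a released chunk with free neighbours before returning it to a free list. The constants and helpers behind the binning live in memory_pool_impl.hpp below (kRoundSize, GetRoundedSize, GetBinIndex). The following standalone sketch only mirrors that arithmetic for a few request sizes; it is an illustration, not code shipped in the gem.

// Standalone sketch of the rounding/binning arithmetic used by Malloc
// (mirrors kRoundSize, GetRoundedSize and GetBinIndex from memory_pool_impl.hpp).
#include <cstddef>
#include <cstdio>

constexpr std::size_t kRoundSize = 512;  // bytes; cudaMalloc alignment assumed by the pool

std::size_t RoundedSize(std::size_t size) {
    return ((size + kRoundSize - 1) / kRoundSize) * kRoundSize;
}

std::size_t BinIndex(std::size_t size) {
    return (size - 1) / kRoundSize;
}

int main() {
    const std::size_t requests[] = {1, 512, 513, 4096};
    for (std::size_t size : requests) {
        // e.g. a 513-byte request is served from a 1024-byte chunk kept in bin 1
        std::printf("request=%zu rounded=%zu bin=%zu\n",
                    size, RoundedSize(size), BinIndex(RoundedSize(size)));
    }
    return 0;
}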
data/ext/cumo/cuda/memory_pool_impl.hpp @@ -0,0 +1,370 @@

#ifndef CUMO_CUDA_MEMORY_POOL_IMPL_H
#define CUMO_CUDA_MEMORY_POOL_IMPL_H

#include <algorithm>
#include <cassert>
#include <memory>
#include <mutex>
#include <stdexcept>
#include <unordered_map>
#include <vector>

#include <cuda_runtime.h>

// CUDA memory pool implementation, closely modeled on CuPy's.

namespace cumo {
namespace internal {

// cudaMalloc() is aligned to at least 512 bytes.
// cf. https://gist.github.com/sonots/41daaa6432b1c8b27ef782cd14064269
constexpr int kRoundSize = 512; // bytes

class CUDARuntimeError : public std::runtime_error {
private:
    cudaError_t status_;

public:
    CUDARuntimeError(cudaError_t status) :
        runtime_error(cudaGetErrorString(status)), status_(status) {}
    cudaError_t status() const { return status_; }
};


class OutOfMemoryError : public std::runtime_error {
public:
    OutOfMemoryError(size_t size, size_t total) :
        runtime_error("out of memory to allocate " + std::to_string(size) + " bytes (total " + std::to_string(total) + " bytes)") {}
};

void CheckStatus(cudaError_t status);

// Memory allocation on a CUDA device.
//
// This class provides an RAII interface to a CUDA memory allocation.
class Memory {
private:
    // Pointer to the place within the buffer.
    void* ptr_ = nullptr;
    // Size of the memory allocation in bytes.
    size_t size_ = 0;
    // GPU device id whose memory the pointer refers to.
    int device_id_ = -1;

public:
    Memory(size_t size);

    ~Memory();

    intptr_t ptr() const { return reinterpret_cast<intptr_t>(ptr_); }

    size_t size() const { return size_; }

    int device_id() const { return device_id_; }
};

// A chunk points to a device memory.
//
// A chunk may be a memory block split from a larger allocation.
// The prev/next pointers construct a doubly-linked list of memory addresses
// sorted by base address, which must be contiguous.
class Chunk {
private:
    // The device memory buffer.
    std::shared_ptr<Memory> mem_;
    // Memory address.
    intptr_t ptr_ = 0;
    // An offset in bytes from the head of the buffer.
    size_t offset_ = 0;
    // Chunk size in bytes.
    size_t size_ = 0;
    // GPU device id whose memory the pointer refers to.
    int device_id_;
    // Previous chunk if split from a larger allocation.
    std::shared_ptr<Chunk> prev_;
    // Next chunk if split from a larger allocation.
    std::shared_ptr<Chunk> next_;
    // Raw stream handle of the CUDA stream.
    cudaStream_t stream_ptr_;
    // Whether the chunk is in use.
    bool in_use_ = false;

public:
    Chunk() {}

    // mem: The device memory buffer.
    // offset: An offset in bytes from the head of the buffer.
    // size: Chunk size in bytes.
    // stream_ptr: Raw stream handle of the CUDA stream.
    Chunk(const std::shared_ptr<Memory>& mem, size_t offset, size_t size, cudaStream_t stream_ptr = 0) :
        mem_(mem), ptr_(mem->ptr() + offset), offset_(offset), size_(size), device_id_(mem->device_id()), stream_ptr_(stream_ptr) {
        assert(mem->ptr() > 0 || offset == 0);
    }

    Chunk(const Chunk&) = default;

    ~Chunk() {
        // std::cout << "Chunk dtor " << (void*)ptr_ << " " << this << std::endl;
    }

    intptr_t ptr() const { return ptr_; }

    size_t offset() const { return offset_; }

    size_t size() const { return size_; }

    int device_id() const { return device_id_; }

    const std::shared_ptr<Chunk>& prev() const { return prev_; }

    std::shared_ptr<Chunk>& prev() { return prev_; }

    const std::shared_ptr<Chunk>& next() const { return next_; }

    std::shared_ptr<Chunk>& next() { return next_; }

    cudaStream_t stream_ptr() const { return stream_ptr_; }

    void set_prev(const std::shared_ptr<Chunk>& prev) { prev_ = prev; }

    void set_next(const std::shared_ptr<Chunk>& next) { next_ = next; }

    bool in_use() const { return in_use_; }

    void set_in_use(bool in_use) { in_use_ = in_use; }

    // Split a contiguous block off a larger allocation.
    friend std::shared_ptr<Chunk> Split(std::shared_ptr<Chunk>& self, size_t size);

    // Merge a previously split block (chunk).
    friend void Merge(std::shared_ptr<Chunk>& self, std::shared_ptr<Chunk> remaining);
};

using FreeList = std::vector<std::shared_ptr<Chunk>>;  // list of free chunks
using Arena = std::vector<FreeList>;                   // free lists per arena index
using ArenaIndexMap = std::vector<int>;                // arena index <=> bin size index

// Memory pool implementation for a single device.
// - The allocator attempts to find the smallest cached block that will fit
//   the requested size. If the block is larger than the requested size,
//   it may be split. If no block is found, the allocator will delegate to
//   cudaMalloc.
// - If the cudaMalloc fails, the allocator will free all cached blocks that
//   are not split and retry the allocation.
class SingleDeviceMemoryPool {
private:
    int device_id_;
    std::unordered_map<intptr_t, std::shared_ptr<Chunk>> in_use_; // ptr => Chunk
    std::unordered_map<cudaStream_t, Arena> free_;
    std::unordered_map<cudaStream_t, ArenaIndexMap> index_;
    std::recursive_mutex mutex_;

public:
    SingleDeviceMemoryPool() {
        CheckStatus(cudaGetDevice(&device_id_));
    }

    intptr_t Malloc(size_t size, cudaStream_t stream_ptr = 0);

    void Free(intptr_t ptr, cudaStream_t stream_ptr = 0);

    // Free all **non-split** chunks in all arenas.
    void FreeAllBlocks();

    // Free all **non-split** chunks in the specified arena.
    void FreeAllBlocks(cudaStream_t stream_ptr);

    size_t GetNumFreeBlocks();

    size_t GetUsedBytes();

    size_t GetFreeBytes();

    size_t GetTotalBytes() {
        return GetUsedBytes() + GetFreeBytes();
    }

// private:

    // Rounds up the memory size to fit the memory alignment of cudaMalloc.
    size_t GetRoundedSize(size_t size) {
        return ((size + kRoundSize - 1) / kRoundSize) * kRoundSize;
    }

    // Get the bin index for a given memory size.
    int GetBinIndex(size_t size) {
        return (size - 1) / kRoundSize;
    }

    int GetArenaIndex(size_t size, cudaStream_t stream_ptr = 0) {
        int bin_index = GetBinIndex(size);
        ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
        return std::lower_bound(arena_index_map.begin(), arena_index_map.end(), bin_index) - arena_index_map.begin();
    }

    bool HasArena(cudaStream_t stream_ptr) {
        auto it = free_.find(stream_ptr);
        return it != free_.end();
    }

    // Returns the arena (list of bins) of a given stream.
    //
    // All free chunks in the stream belong to one of the bins in the arena.
    //
    // The caller is responsible for acquiring the lock.
    Arena& GetArena(cudaStream_t stream_ptr) {
        return free_[stream_ptr]; // find or create
    }

    // Returns the sparse arena index map of a given stream.
    //
    // Each element of the returned vector is an index value of the arena
    // for the stream. The k-th element of the arena index map is the bin index
    // of the arena. For example, when the arena index map is `[1, 3]`, it means
    // that the arena has 2 bins, and `arena[0]` is for bin index 1 and
    // `arena[1]` is for bin index 3.
    //
    // The caller is responsible for acquiring the lock.
    ArenaIndexMap& GetArenaIndexMap(cudaStream_t stream_ptr) {
        return index_[stream_ptr]; // find or create
    }

    std::shared_ptr<Chunk> PopFromFreeList(FreeList& free_list) {
        auto data = free_list.back();
        free_list.pop_back();
        return data;
    }

    // std::vector erase-remove idiom
    // http://minus9d.hatenablog.com/entry/20120605/1338896754
    bool EraseFromFreeList(FreeList& free_list, const std::shared_ptr<Chunk>& chunk) {
        assert(!chunk->in_use());
        auto iter = std::find(free_list.begin(), free_list.end(), chunk);
        if (iter == free_list.end()) {
            return false;
        }
        free_list.erase(iter);
        return true;
    }

    void AppendToFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr = 0);

    // Removes the chunk from the free list.
    //
    // @return `true` if the chunk was successfully removed from the free list,
    // `false` otherwise (e.g., the chunk could not be found in the free list
    // because it is allocated).
    bool RemoveFromFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr = 0);

    void CompactIndex(cudaStream_t stream_ptr, bool free);
};

// Memory pool for all GPU devices on the host.
//
// A memory pool preserves any allocations even if they are freed by the user.
// Freed memory buffers are held by the memory pool as *free blocks*, and they
// are reused for further memory allocations of the same sizes. The allocated
// blocks are managed per device, so one instance of this class can be
// used for multiple devices.
// .. note::
//    When an allocation is served by reusing a pre-allocated block, it
//    does not call ``cudaMalloc`` and therefore no CPU-GPU synchronization
//    occurs. This makes interleaving memory allocations with kernel
//    invocations very fast.
// .. note::
//    The memory pool holds allocated blocks without freeing them as much as
//    possible. The program therefore holds most of the device memory, which
//    may cause other CUDA programs running in parallel to run out of memory.
class MemoryPool {
private:
    int device_id() {
        int device_id = -1;
        CheckStatus(cudaGetDevice(&device_id));
        return device_id;
    }

    std::unordered_map<int, SingleDeviceMemoryPool> pools_;

public:
    MemoryPool() {}

    ~MemoryPool() { pools_.clear(); }

    // Allocates the memory, from the pool if possible.
    //
    // Args:
    //     size (int): Size of the memory buffer to allocate in bytes.
    //     stream_ptr (cudaStream_t): Get the memory from the arena of the given stream.
    // Returns:
    //     intptr_t: Pointer address to the allocated buffer.
    intptr_t Malloc(size_t size, cudaStream_t stream_ptr = 0) {
        auto& mp = pools_[device_id()];
        return mp.Malloc(size, stream_ptr);
    }

    // Frees the memory, returning it to the pool.
    //
    // Args:
    //     ptr (intptr_t): Pointer to the memory buffer.
    //     stream_ptr (cudaStream_t): Return the memory to the arena of the given stream.
    void Free(intptr_t ptr, cudaStream_t stream_ptr = 0) {
        auto& mp = pools_[device_id()];
        mp.Free(ptr, stream_ptr);
    }

    // Free all **non-split** chunks in all arenas.
    void FreeAllBlocks() {
        auto& mp = pools_[device_id()];
        return mp.FreeAllBlocks();
    }

    // Free all **non-split** chunks in the specified arena.
    //
    // Args:
    //     stream_ptr (cudaStream_t): Release free blocks in the arena of the given stream.
    void FreeAllBlocks(cudaStream_t stream_ptr) {
        auto& mp = pools_[device_id()];
        return mp.FreeAllBlocks(stream_ptr);
    }

    // Count the total number of free blocks.
    //
    // Returns:
    //     size_t: The total number of free blocks.
    size_t GetNumFreeBlocks() {
        auto& mp = pools_[device_id()];
        return mp.GetNumFreeBlocks();
    }

    // Get the total number of bytes used.
    //
    // Returns:
    //     size_t: The total number of bytes used.
    size_t GetUsedBytes() {
        auto& mp = pools_[device_id()];
        return mp.GetUsedBytes();
    }

    // Get the total number of bytes acquired but not used in the pool.
    //
    // Returns:
    //     size_t: The total number of bytes acquired but not used in the pool.
    size_t GetFreeBytes() {
        auto& mp = pools_[device_id()];
        return mp.GetFreeBytes();
    }

    // Get the total number of bytes acquired in the pool.
    //
    // Returns:
    //     size_t: The total number of bytes acquired in the pool.
    size_t GetTotalBytes() {
        auto& mp = pools_[device_id()];
        return mp.GetTotalBytes();
    }
};

} // namespace internal
} // namespace cumo

#endif /* ifndef CUMO_CUDA_MEMORY_POOL_IMPL_H */
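For reference, a minimal usage sketch of the public MemoryPool interface declared above. It assumes a CUDA-capable device and a build that compiles this sketch together with memory_pool_impl.cpp (plus the CUDA and Ruby headers that file includes); it is an illustration, not code shipped in the gem.

// Minimal usage sketch of cumo::internal::MemoryPool (illustration only).
#include <cstdint>
#include <cstdio>
#include "memory_pool_impl.hpp"

int main() {
    cumo::internal::MemoryPool pool;

    // The first allocation falls through to cudaMallocManaged; after Free,
    // the chunk stays cached, so a later request of the same rounded size
    // can be served from the free list without touching the driver.
    intptr_t p1 = pool.Malloc(1000);  // rounded up to 1024 bytes internally
    pool.Free(p1);
    intptr_t p2 = pool.Malloc(600);   // also rounds to 1024; may reuse the cached chunk

    std::printf("used=%zu free=%zu total=%zu bytes\n",
                pool.GetUsedBytes(), pool.GetFreeBytes(), pool.GetTotalBytes());

    pool.Free(p2);
    pool.FreeAllBlocks();  // release cached non-split chunks back to the driver
    return 0;
}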