cumo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +27 -0
- data/.travis.yml +5 -0
- data/3rd_party/mkmf-cu/.gitignore +36 -0
- data/3rd_party/mkmf-cu/Gemfile +3 -0
- data/3rd_party/mkmf-cu/LICENSE +21 -0
- data/3rd_party/mkmf-cu/README.md +36 -0
- data/3rd_party/mkmf-cu/Rakefile +11 -0
- data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
- data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
- data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +82 -0
- data/README.md +252 -0
- data/Rakefile +43 -0
- data/bench/broadcast_fp32.rb +138 -0
- data/bench/cumo_bench.rb +193 -0
- data/bench/numo_bench.rb +138 -0
- data/bench/reduction_fp32.rb +117 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/cumo.gemspec +32 -0
- data/ext/cumo/cuda/cublas.c +278 -0
- data/ext/cumo/cuda/driver.c +421 -0
- data/ext/cumo/cuda/memory_pool.cpp +185 -0
- data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
- data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
- data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
- data/ext/cumo/cuda/nvrtc.c +207 -0
- data/ext/cumo/cuda/runtime.c +167 -0
- data/ext/cumo/cumo.c +148 -0
- data/ext/cumo/depend.erb +58 -0
- data/ext/cumo/extconf.rb +179 -0
- data/ext/cumo/include/cumo.h +25 -0
- data/ext/cumo/include/cumo/compat.h +23 -0
- data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
- data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
- data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
- data/ext/cumo/include/cumo/cuda/driver.h +22 -0
- data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
- data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
- data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
- data/ext/cumo/include/cumo/indexer.h +238 -0
- data/ext/cumo/include/cumo/intern.h +142 -0
- data/ext/cumo/include/cumo/intern_fwd.h +38 -0
- data/ext/cumo/include/cumo/intern_kernel.h +6 -0
- data/ext/cumo/include/cumo/narray.h +429 -0
- data/ext/cumo/include/cumo/narray_kernel.h +149 -0
- data/ext/cumo/include/cumo/ndloop.h +95 -0
- data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
- data/ext/cumo/include/cumo/template.h +158 -0
- data/ext/cumo/include/cumo/template_kernel.h +77 -0
- data/ext/cumo/include/cumo/types/bit.h +40 -0
- data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
- data/ext/cumo/include/cumo/types/complex.h +402 -0
- data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
- data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
- data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
- data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
- data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
- data/ext/cumo/include/cumo/types/dfloat.h +47 -0
- data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
- data/ext/cumo/include/cumo/types/float_def.h +34 -0
- data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
- data/ext/cumo/include/cumo/types/float_macro.h +191 -0
- data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
- data/ext/cumo/include/cumo/types/int16.h +24 -0
- data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
- data/ext/cumo/include/cumo/types/int32.h +24 -0
- data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
- data/ext/cumo/include/cumo/types/int64.h +24 -0
- data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
- data/ext/cumo/include/cumo/types/int8.h +24 -0
- data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
- data/ext/cumo/include/cumo/types/int_macro.h +67 -0
- data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
- data/ext/cumo/include/cumo/types/real_accum.h +486 -0
- data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
- data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
- data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
- data/ext/cumo/include/cumo/types/robject.h +27 -0
- data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
- data/ext/cumo/include/cumo/types/scomplex.h +46 -0
- data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
- data/ext/cumo/include/cumo/types/sfloat.h +48 -0
- data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
- data/ext/cumo/include/cumo/types/uint16.h +25 -0
- data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint32.h +25 -0
- data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint64.h +25 -0
- data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint8.h +25 -0
- data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
- data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
- data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
- data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
- data/ext/cumo/narray/SFMT-params.h +97 -0
- data/ext/cumo/narray/SFMT-params19937.h +46 -0
- data/ext/cumo/narray/SFMT.c +620 -0
- data/ext/cumo/narray/SFMT.h +167 -0
- data/ext/cumo/narray/array.c +638 -0
- data/ext/cumo/narray/data.c +961 -0
- data/ext/cumo/narray/gen/cogen.rb +56 -0
- data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
- data/ext/cumo/narray/gen/def/bit.rb +37 -0
- data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
- data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
- data/ext/cumo/narray/gen/def/int16.rb +36 -0
- data/ext/cumo/narray/gen/def/int32.rb +36 -0
- data/ext/cumo/narray/gen/def/int64.rb +36 -0
- data/ext/cumo/narray/gen/def/int8.rb +36 -0
- data/ext/cumo/narray/gen/def/robject.rb +37 -0
- data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
- data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
- data/ext/cumo/narray/gen/def/uint16.rb +36 -0
- data/ext/cumo/narray/gen/def/uint32.rb +36 -0
- data/ext/cumo/narray/gen/def/uint64.rb +36 -0
- data/ext/cumo/narray/gen/def/uint8.rb +36 -0
- data/ext/cumo/narray/gen/erbpp2.rb +346 -0
- data/ext/cumo/narray/gen/narray_def.rb +268 -0
- data/ext/cumo/narray/gen/spec.rb +425 -0
- data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
- data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
- data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
- data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
- data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
- data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
- data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
- data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
- data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
- data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
- data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
- data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
- data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
- data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
- data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
- data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
- data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
- data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
- data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
- data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
- data/ext/cumo/narray/gen/tmpl/class.c +9 -0
- data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
- data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
- data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
- data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
- data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
- data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
- data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
- data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
- data/ext/cumo/narray/gen/tmpl/each.c +47 -0
- data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
- data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
- data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
- data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
- data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
- data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
- data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
- data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
- data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
- data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
- data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
- data/ext/cumo/narray/gen/tmpl/format.c +62 -0
- data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
- data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
- data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
- data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
- data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
- data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
- data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
- data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
- data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
- data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
- data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
- data/ext/cumo/narray/gen/tmpl/median.c +66 -0
- data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
- data/ext/cumo/narray/gen/tmpl/module.c +9 -0
- data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
- data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
- data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
- data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
- data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
- data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
- data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
- data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
- data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
- data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
- data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
- data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
- data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
- data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
- data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
- data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
- data/ext/cumo/narray/gen/tmpl/store.c +41 -0
- data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
- data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
- data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
- data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
- data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
- data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
- data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
- data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
- data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
- data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
- data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
- data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
- data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
- data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
- data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
- data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
- data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
- data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
- data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
- data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
- data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
- data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
- data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
- data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
- data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
- data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
- data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
- data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
- data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
- data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
- data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
- data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
- data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
- data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
- data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
- data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
- data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
- data/ext/cumo/narray/index.c +880 -0
- data/ext/cumo/narray/kwargs.c +153 -0
- data/ext/cumo/narray/math.c +142 -0
- data/ext/cumo/narray/narray.c +1948 -0
- data/ext/cumo/narray/ndloop.c +2105 -0
- data/ext/cumo/narray/rand.c +45 -0
- data/ext/cumo/narray/step.c +474 -0
- data/ext/cumo/narray/struct.c +886 -0
- data/lib/cumo.rb +3 -0
- data/lib/cumo/cuda.rb +11 -0
- data/lib/cumo/cuda/compile_error.rb +36 -0
- data/lib/cumo/cuda/compiler.rb +161 -0
- data/lib/cumo/cuda/device.rb +47 -0
- data/lib/cumo/cuda/link_state.rb +31 -0
- data/lib/cumo/cuda/module.rb +40 -0
- data/lib/cumo/cuda/nvrtc_program.rb +27 -0
- data/lib/cumo/linalg.rb +12 -0
- data/lib/cumo/narray.rb +2 -0
- data/lib/cumo/narray/extra.rb +1278 -0
- data/lib/erbpp.rb +294 -0
- data/lib/erbpp/line_number.rb +137 -0
- data/lib/erbpp/narray_def.rb +381 -0
- data/numo-narray-version +1 -0
- data/run.gdb +7 -0
- metadata +353 -0
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
#include "memory_pool_impl.hpp"
|
|
2
|
+
|
|
3
|
+
#include <ruby.h>
|
|
4
|
+
|
|
5
|
+
namespace cumo {
|
|
6
|
+
namespace internal {
|
|
7
|
+
|
|
8
|
+
// Throws CUDARuntimeError unless `status` indicates success (cudaSuccess == 0).
void CheckStatus(cudaError_t status) {
    if (status == 0) return;
    throw CUDARuntimeError(status);
}
|
|
13
|
+
|
|
14
|
+
// Allocates `size` bytes of CUDA managed (unified) memory on the current
// device. A zero-byte request performs no allocation and leaves ptr_ null.
Memory::Memory(size_t size) : size_(size) {
    if (size_ == 0) return;
    CheckStatus(cudaGetDevice(&device_id_));
    CheckStatus(cudaMallocManaged(&ptr_, size_, cudaMemAttachGlobal));
    // std::cout << "cudaMallocManaged " << ptr_ << std::endl;
}
|
|
21
|
+
|
|
22
|
+
// Releases the device memory owned by this object, if any.
Memory::~Memory() {
    if (size_ == 0) return;
    // std::cout << "cudaFree " << ptr_ << std::endl;
    cudaError_t status = cudaFree(ptr_);
    // The CUDA driver may already be shutting down when pooled memory is
    // finalized; the driver reclaims memory itself in that case, so the
    // unloading status is deliberately ignored.
    if (status != cudaErrorCudartUnloading) {
        CheckStatus(status);
    }
}
|
|
33
|
+
|
|
34
|
+
std::shared_ptr<Chunk> Split(std::shared_ptr<Chunk>& self, size_t size) {
|
|
35
|
+
assert(self->size_ >= size);
|
|
36
|
+
if (self->size_ == size) {
|
|
37
|
+
return nullptr;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
auto remaining = std::make_shared<Chunk>(self->mem_, self->offset_ + size, self->size_ - size, self->stream_ptr_);
|
|
41
|
+
self->size_ = size;
|
|
42
|
+
|
|
43
|
+
if (self->next_) {
|
|
44
|
+
remaining->set_next(std::move(self->next_));
|
|
45
|
+
remaining->next()->set_prev(remaining);
|
|
46
|
+
}
|
|
47
|
+
self->next_ = remaining;
|
|
48
|
+
remaining->set_prev(self);
|
|
49
|
+
|
|
50
|
+
return remaining;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
// Absorbs `remaining` (the chunk immediately following `self` in memory)
// back into `self`, extending self's size and splicing `remaining` out of
// the doubly-linked list. Both chunks must belong to the same stream.
void Merge(std::shared_ptr<Chunk>& self, std::shared_ptr<Chunk> remaining) {
    assert(remaining != nullptr);
    assert(self->stream_ptr_ == remaining->stream_ptr());

    self->size_ += remaining->size();
    self->next_ = remaining->next();
    if (self->next_ != nullptr) {
        self->next_->set_prev(self);
    }
}
|
|
63
|
+
|
|
64
|
+
void SingleDeviceMemoryPool::AppendToFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr) {
|
|
65
|
+
assert(chunk != nullptr && !chunk->in_use());
|
|
66
|
+
int bin_index = GetBinIndex(size);
|
|
67
|
+
|
|
68
|
+
std::lock_guard<std::recursive_mutex> lock{mutex_};
|
|
69
|
+
|
|
70
|
+
Arena& arena = GetArena(stream_ptr);
|
|
71
|
+
ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
|
|
72
|
+
int arena_index = std::lower_bound(arena_index_map.begin(), arena_index_map.end(), bin_index) - arena_index_map.begin();
|
|
73
|
+
int length = static_cast<int>(arena_index_map.size());
|
|
74
|
+
if (arena_index >= length || arena_index_map.at(arena_index) != bin_index) {
|
|
75
|
+
arena_index_map.insert(arena_index_map.begin() + arena_index, bin_index);
|
|
76
|
+
arena.insert(arena.begin() + arena_index, FreeList{});
|
|
77
|
+
}
|
|
78
|
+
FreeList& free_list = arena[arena_index];
|
|
79
|
+
free_list.emplace_back(chunk);
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
bool SingleDeviceMemoryPool::RemoveFromFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr) {
|
|
83
|
+
assert(chunk != nullptr && !chunk->in_use());
|
|
84
|
+
int bin_index = GetBinIndex(size);
|
|
85
|
+
|
|
86
|
+
std::lock_guard<std::recursive_mutex> lock{mutex_};
|
|
87
|
+
|
|
88
|
+
Arena& arena = GetArena(stream_ptr);
|
|
89
|
+
ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
|
|
90
|
+
if (arena_index_map.size() == 0) {
|
|
91
|
+
return false;
|
|
92
|
+
}
|
|
93
|
+
int arena_index = std::lower_bound(arena_index_map.begin(), arena_index_map.end(), bin_index) - arena_index_map.begin();
|
|
94
|
+
if (static_cast<size_t>(arena_index) == arena_index_map.size()) {
|
|
95
|
+
// Bin does not exist for the given chunk size.
|
|
96
|
+
return false;
|
|
97
|
+
}
|
|
98
|
+
if (arena_index_map.at(arena_index) != bin_index) {
|
|
99
|
+
return false;
|
|
100
|
+
}
|
|
101
|
+
assert(arena.size() > static_cast<size_t>(arena_index));
|
|
102
|
+
FreeList& free_list = arena[arena_index];
|
|
103
|
+
return EraseFromFreeList(free_list, chunk);
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
intptr_t SingleDeviceMemoryPool::Malloc(size_t size, cudaStream_t stream_ptr) {
|
|
107
|
+
size = GetRoundedSize(size);
|
|
108
|
+
std::shared_ptr<Chunk> chunk = nullptr;
|
|
109
|
+
|
|
110
|
+
{
|
|
111
|
+
std::lock_guard<std::recursive_mutex> lock{mutex_};
|
|
112
|
+
|
|
113
|
+
// find best-fit, or a smallest larger allocation
|
|
114
|
+
Arena& arena = GetArena(stream_ptr);
|
|
115
|
+
int arena_index = GetArenaIndex(size);
|
|
116
|
+
int arena_length = static_cast<int>(arena.size());
|
|
117
|
+
for (int i = arena_index; i < arena_length; ++i) {
|
|
118
|
+
FreeList& free_list = arena[i];
|
|
119
|
+
if (free_list.empty()) {
|
|
120
|
+
continue;
|
|
121
|
+
}
|
|
122
|
+
chunk = PopFromFreeList(free_list);
|
|
123
|
+
// TODO(sonots): compact_index
|
|
124
|
+
break;
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
if (chunk != nullptr) {
|
|
129
|
+
std::shared_ptr<Chunk> remaining = Split(chunk, size);
|
|
130
|
+
if (remaining != nullptr) {
|
|
131
|
+
AppendToFreeList(remaining->size(), remaining, stream_ptr);
|
|
132
|
+
}
|
|
133
|
+
} else {
|
|
134
|
+
// cudaMalloc if a cache is not found
|
|
135
|
+
std::shared_ptr<Memory> mem = nullptr;
|
|
136
|
+
try {
|
|
137
|
+
mem = std::make_shared<Memory>(size);
|
|
138
|
+
} catch (const CUDARuntimeError& e) {
|
|
139
|
+
if (e.status() != cudaErrorMemoryAllocation) {
|
|
140
|
+
throw;
|
|
141
|
+
}
|
|
142
|
+
FreeAllBlocks();
|
|
143
|
+
try {
|
|
144
|
+
mem = std::make_shared<Memory>(size);
|
|
145
|
+
} catch (const CUDARuntimeError& e) {
|
|
146
|
+
if (e.status() != cudaErrorMemoryAllocation) {
|
|
147
|
+
throw;
|
|
148
|
+
}
|
|
149
|
+
#ifdef NO_RUBY // cpp test does not bind with libruby
|
|
150
|
+
size_t total = size + GetTotalBytes();
|
|
151
|
+
throw OutOfMemoryError(size, total);
|
|
152
|
+
#else
|
|
153
|
+
rb_funcall(rb_define_module("GC"), rb_intern("start"), 0);
|
|
154
|
+
try {
|
|
155
|
+
mem = std::make_shared<Memory>(size);
|
|
156
|
+
} catch (const CUDARuntimeError& e) {
|
|
157
|
+
if (e.status() != cudaErrorMemoryAllocation) {
|
|
158
|
+
throw;
|
|
159
|
+
}
|
|
160
|
+
size_t total = size + GetTotalBytes();
|
|
161
|
+
throw OutOfMemoryError(size, total);
|
|
162
|
+
}
|
|
163
|
+
#endif
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
chunk = std::make_shared<Chunk>(mem, 0, size, stream_ptr);
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
assert(chunk != nullptr);
|
|
170
|
+
assert(chunk->stream_ptr() == stream_ptr);
|
|
171
|
+
{
|
|
172
|
+
std::lock_guard<std::recursive_mutex> lock{mutex_};
|
|
173
|
+
|
|
174
|
+
chunk->set_in_use(true);
|
|
175
|
+
in_use_.emplace(chunk->ptr(), chunk);
|
|
176
|
+
}
|
|
177
|
+
return chunk->ptr();
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
void SingleDeviceMemoryPool::Free(intptr_t ptr, cudaStream_t stream_ptr) {
|
|
181
|
+
std::shared_ptr<Chunk> chunk = nullptr;
|
|
182
|
+
|
|
183
|
+
{
|
|
184
|
+
std::lock_guard<std::recursive_mutex> lock{mutex_};
|
|
185
|
+
|
|
186
|
+
chunk = in_use_[ptr];
|
|
187
|
+
// assert(chunk != nullptr);
|
|
188
|
+
if (!chunk) return;
|
|
189
|
+
chunk->set_in_use(false);
|
|
190
|
+
in_use_.erase(ptr);
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
if (chunk->next() != nullptr && !chunk->next()->in_use()) {
|
|
194
|
+
if (RemoveFromFreeList(chunk->next()->size(), chunk->next(), stream_ptr)) {
|
|
195
|
+
Merge(chunk, chunk->next());
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
if (chunk->prev() != nullptr && !chunk->prev()->in_use()) {
|
|
199
|
+
if (RemoveFromFreeList(chunk->prev()->size(), chunk->prev(), stream_ptr)) {
|
|
200
|
+
chunk = chunk->prev();
|
|
201
|
+
Merge(chunk, chunk->next());
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
AppendToFreeList(chunk->size(), chunk, stream_ptr);
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
// Rebuilds the arena of `stream_ptr`, dropping empty bins. When `free` is
// true, free chunks that are not part of a split allocation are dropped as
// well, so their backing Memory is released once the last shared_ptr dies.
// Erases the stream's arena entirely when nothing remains.
// Caller is responsible for holding mutex_ (lock is needed outside).
void SingleDeviceMemoryPool::CompactIndex(cudaStream_t stream_ptr, bool free) {
    if (!HasArena(stream_ptr)) return;

    Arena new_arena;
    ArenaIndexMap new_index_map;
    Arena& arena = GetArena(stream_ptr);
    ArenaIndexMap& index_map = GetArenaIndexMap(stream_ptr);
    size_t arena_length = arena.size();
    for (size_t arena_index = 0; arena_index < arena_length; ++arena_index) {
        FreeList& free_list = arena[arena_index];
        if (free_list.empty()) {
            continue;
        }
        if (free) {
            // Keep only chunks that are part of a split allocation; whole
            // (non-split) chunks are dropped so the memory goes back to CUDA.
            FreeList keep_list;
            for (auto chunk : free_list) {
                if (chunk->prev() != nullptr || chunk->next() != nullptr) {
                    keep_list.emplace_back(chunk);
                }
            }
            if (keep_list.empty()) {
                continue;
            }
            new_index_map.emplace_back(index_map[arena_index]);
            new_arena.emplace_back(keep_list);
        } else {
            new_index_map.emplace_back(index_map[arena_index]);
            new_arena.emplace_back(free_list);
        }
    }
    if (new_arena.empty()) {
        // Nothing survives: remove the stream's arena and its index map.
        index_.erase(stream_ptr);
        free_.erase(stream_ptr);
    } else {
        index_map.swap(new_index_map);
        arena.swap(new_arena);
    }
}
|
|
246
|
+
|
|
247
|
+
// Free all **non-split** chunks in all arenas
|
|
248
|
+
void SingleDeviceMemoryPool::FreeAllBlocks() {
|
|
249
|
+
std::lock_guard<std::recursive_mutex> lock{mutex_};
|
|
250
|
+
|
|
251
|
+
std::vector<cudaStream_t> keys(free_.size());
|
|
252
|
+
transform(free_.begin(), free_.end(), keys.begin(), [](auto pair) { return pair.first; });
|
|
253
|
+
for (cudaStream_t stream_ptr : keys) {
|
|
254
|
+
CompactIndex(stream_ptr, true);
|
|
255
|
+
}
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
// Free all **non-split** chunks in specified arena
|
|
259
|
+
void SingleDeviceMemoryPool::FreeAllBlocks(cudaStream_t stream_ptr) {
|
|
260
|
+
std::lock_guard<std::recursive_mutex> lock{mutex_};
|
|
261
|
+
|
|
262
|
+
CompactIndex(stream_ptr, true);
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
size_t SingleDeviceMemoryPool::GetNumFreeBlocks() {
|
|
266
|
+
size_t n = 0;
|
|
267
|
+
|
|
268
|
+
std::lock_guard<std::recursive_mutex> lock{mutex_};
|
|
269
|
+
|
|
270
|
+
for (auto kv : free_) {
|
|
271
|
+
Arena& arena = kv.second;
|
|
272
|
+
for (auto free_list : arena) {
|
|
273
|
+
n += free_list.size();
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
return n;
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
size_t SingleDeviceMemoryPool::GetUsedBytes() {
|
|
280
|
+
size_t size = 0;
|
|
281
|
+
|
|
282
|
+
std::lock_guard<std::recursive_mutex> lock{mutex_};
|
|
283
|
+
|
|
284
|
+
for (auto kv : in_use_) {
|
|
285
|
+
std::shared_ptr<Chunk>& chunk = kv.second;
|
|
286
|
+
if (chunk) size += chunk->size();
|
|
287
|
+
}
|
|
288
|
+
return size;
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
size_t SingleDeviceMemoryPool::GetFreeBytes() {
|
|
292
|
+
size_t size = 0;
|
|
293
|
+
|
|
294
|
+
std::lock_guard<std::recursive_mutex> lock{mutex_};
|
|
295
|
+
|
|
296
|
+
for (auto kv : free_) {
|
|
297
|
+
Arena& arena = kv.second;
|
|
298
|
+
for (auto free_list : arena) {
|
|
299
|
+
for (auto chunk : free_list) {
|
|
300
|
+
if (chunk) size += chunk->size();
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
return size;
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
} // namespace internal
|
|
308
|
+
} // namespace cumo
|
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
#ifndef CUMO_CUDA_MEMORY_POOL_IMPL_H
|
|
2
|
+
#define CUMO_CUDA_MEMORY_POOL_IMPL_H
|
|
3
|
+
|
|
4
|
+
#include <algorithm>
|
|
5
|
+
#include <cassert>
|
|
6
|
+
#include <memory>
|
|
7
|
+
#include <mutex>
|
|
8
|
+
#include <stdexcept>
|
|
9
|
+
#include <unordered_map>
|
|
10
|
+
#include <vector>
|
|
11
|
+
|
|
12
|
+
#include <cuda_runtime.h>
|
|
13
|
+
|
|
14
|
+
// CUDA memory pool implementation highly referring CuPy
|
|
15
|
+
|
|
16
|
+
namespace cumo {
|
|
17
|
+
namespace internal {
|
|
18
|
+
|
|
19
|
+
// cudaMalloc() is aligned to at least 512 bytes
|
|
20
|
+
// cf. https://gist.github.com/sonots/41daaa6432b1c8b27ef782cd14064269
|
|
21
|
+
constexpr int kRoundSize = 512; // bytes
|
|
22
|
+
|
|
23
|
+
class CUDARuntimeError : public std::runtime_error {
|
|
24
|
+
private:
|
|
25
|
+
cudaError_t status_;
|
|
26
|
+
|
|
27
|
+
public:
|
|
28
|
+
CUDARuntimeError(cudaError_t status) :
|
|
29
|
+
runtime_error(cudaGetErrorString(status)), status_(status) {}
|
|
30
|
+
cudaError_t status() const { return status_; }
|
|
31
|
+
};
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
// Exception thrown when a device allocation fails even after the pool has
// released cached blocks; reports the requested and total byte counts.
class OutOfMemoryError : public std::runtime_error {
public:
    OutOfMemoryError(size_t size, size_t total) :
        runtime_error("out of memory to allocate " + std::to_string(size) + " bytes (total " + std::to_string(total) + " bytes)") {}
};
|
|
39
|
+
|
|
40
|
+
void CheckStatus(cudaError_t status);
|
|
41
|
+
|
|
42
|
+
// Memory allocation on a CUDA device.
//
// RAII owner of a single device allocation: the constructor allocates
// (managed memory, see memory_pool_impl.cpp) and the destructor frees.
class Memory {
private:
    // Base address of the allocation (nullptr until/unless allocated).
    void* ptr_ = nullptr;
    // Allocation size in bytes (0 means no allocation was made).
    size_t size_ = 0;
    // GPU device id whose memory the pointer refers to (-1 until allocated).
    int device_id_ = -1;

public:
    // Allocates `size` bytes; a zero-byte request allocates nothing.
    Memory(size_t size);

    ~Memory();

    // Base address as an integer.
    intptr_t ptr() const { return reinterpret_cast<intptr_t>(ptr_); }

    // Allocation size in bytes.
    size_t size() const { return size_; }

    // Device id of the allocation.
    int device_id() const { return device_id_; }
};
|
|
65
|
+
|
|
66
|
+
// A chunk points to a device memory.
|
|
67
|
+
//
|
|
68
|
+
// A chunk might be a splitted memory block from a larger allocation.
|
|
69
|
+
// The prev/next pointers contruct a doubly-linked list of memory addresses
|
|
70
|
+
// sorted by base address that must be contiguous.
|
|
71
|
+
class Chunk {
|
|
72
|
+
private:
|
|
73
|
+
// The device memory buffer.
|
|
74
|
+
std::shared_ptr<Memory> mem_;
|
|
75
|
+
// Memory address.
|
|
76
|
+
intptr_t ptr_ = 0;
|
|
77
|
+
// An offset bytes from the head of the buffer.
|
|
78
|
+
size_t offset_ = 0;
|
|
79
|
+
// Chunk size in bytes.
|
|
80
|
+
size_t size_ = 0;
|
|
81
|
+
// GPU device id whose memory the pointer refers to.
|
|
82
|
+
int device_id_;
|
|
83
|
+
// prev memory pointer if split from a larger allocation
|
|
84
|
+
std::shared_ptr<Chunk> prev_;
|
|
85
|
+
// next memory pointer if split from a larger allocation
|
|
86
|
+
std::shared_ptr<Chunk> next_;
|
|
87
|
+
// Raw stream handle of cuda stream
|
|
88
|
+
cudaStream_t stream_ptr_;
|
|
89
|
+
// chunk is in use
|
|
90
|
+
bool in_use_ = false;
|
|
91
|
+
|
|
92
|
+
public:
|
|
93
|
+
Chunk() {}
|
|
94
|
+
|
|
95
|
+
// mem: The device memory buffer.
|
|
96
|
+
// offset: An offset bytes from the head of the buffer.
|
|
97
|
+
// size: Chunk size in bytes.
|
|
98
|
+
// stream_ptr: Raw stream handle of cuda stream
|
|
99
|
+
Chunk(const std::shared_ptr<Memory>& mem, size_t offset, size_t size, cudaStream_t stream_ptr = 0) :
|
|
100
|
+
mem_(mem), ptr_(mem->ptr() + offset), offset_(offset), size_(size), device_id_(mem->device_id()), stream_ptr_(stream_ptr) {
|
|
101
|
+
assert(mem->ptr() > 0 || offset == 0);
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
Chunk(const Chunk&) = default;
|
|
105
|
+
|
|
106
|
+
~Chunk() {
|
|
107
|
+
// std::cout << "Chunk dtor " << (void*)ptr_ << " " << this << std::endl;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
intptr_t ptr() const { return ptr_; }
|
|
111
|
+
|
|
112
|
+
size_t offset() const { return offset_; }
|
|
113
|
+
|
|
114
|
+
size_t size() const { return size_; }
|
|
115
|
+
|
|
116
|
+
int device_id() const { return device_id_; }
|
|
117
|
+
|
|
118
|
+
const std::shared_ptr<Chunk>& prev() const { return prev_; }
|
|
119
|
+
|
|
120
|
+
std::shared_ptr<Chunk>& prev() { return prev_; }
|
|
121
|
+
|
|
122
|
+
const std::shared_ptr<Chunk>& next() const { return next_; }
|
|
123
|
+
|
|
124
|
+
std::shared_ptr<Chunk>& next() { return next_; }
|
|
125
|
+
|
|
126
|
+
cudaStream_t stream_ptr() const { return stream_ptr_; }
|
|
127
|
+
|
|
128
|
+
void set_prev(const std::shared_ptr<Chunk>& prev) { prev_ = prev; }
|
|
129
|
+
|
|
130
|
+
void set_next(const std::shared_ptr<Chunk>& next) { next_ = next; }
|
|
131
|
+
|
|
132
|
+
bool in_use() const { return in_use_; }
|
|
133
|
+
|
|
134
|
+
void set_in_use(bool in_use) { in_use_ = in_use; }
|
|
135
|
+
|
|
136
|
+
// Split contiguous block of a larger allocation
|
|
137
|
+
friend std::shared_ptr<Chunk> Split(std::shared_ptr<Chunk>& self, size_t size);
|
|
138
|
+
|
|
139
|
+
// Merge previously splitted block (chunk)
|
|
140
|
+
friend void Merge(std::shared_ptr<Chunk>& self, std::shared_ptr<Chunk> remaining);
|
|
141
|
+
};
|
|
142
|
+
|
|
143
|
+
using FreeList = std::vector<std::shared_ptr<Chunk>>; // list of free chunk
|
|
144
|
+
using Arena = std::vector<FreeList>; // free_list w.r.t arena index
|
|
145
|
+
using ArenaIndexMap = std::vector<int>; // arena index <=> bin size index
|
|
146
|
+
|
|
147
|
+
// Memory pool implementation for single device.
|
|
148
|
+
// - The allocator attempts to find the smallest cached block that will fit
|
|
149
|
+
// the requested size. If the block is larger than the requested size,
|
|
150
|
+
// it may be split. If no block is found, the allocator will delegate to
|
|
151
|
+
// cudaMalloc.
|
|
152
|
+
// - If the cudaMalloc fails, the allocator will free all cached blocks that
|
|
153
|
+
// are not split and retry the allocation.
|
|
154
|
+
class SingleDeviceMemoryPool {
|
|
155
|
+
private:
|
|
156
|
+
int device_id_;
|
|
157
|
+
std::unordered_map<intptr_t, std::shared_ptr<Chunk>> in_use_; // ptr => Chunk
|
|
158
|
+
std::unordered_map<cudaStream_t, Arena> free_;
|
|
159
|
+
std::unordered_map<cudaStream_t, ArenaIndexMap> index_;
|
|
160
|
+
std::recursive_mutex mutex_;
|
|
161
|
+
|
|
162
|
+
public:
|
|
163
|
+
SingleDeviceMemoryPool() {
|
|
164
|
+
CheckStatus(cudaGetDevice(&device_id_));
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
intptr_t Malloc(size_t size, cudaStream_t stream_ptr = 0);
|
|
168
|
+
|
|
169
|
+
void Free(intptr_t ptr, cudaStream_t stream_ptr = 0);
|
|
170
|
+
|
|
171
|
+
// Free all **non-split** chunks in all arenas
|
|
172
|
+
void FreeAllBlocks();
|
|
173
|
+
|
|
174
|
+
// Free all **non-split** chunks in specified arena
|
|
175
|
+
void FreeAllBlocks(cudaStream_t stream_ptr);
|
|
176
|
+
|
|
177
|
+
size_t GetNumFreeBlocks();
|
|
178
|
+
|
|
179
|
+
size_t GetUsedBytes();
|
|
180
|
+
|
|
181
|
+
size_t GetFreeBytes();
|
|
182
|
+
|
|
183
|
+
size_t GetTotalBytes() {
|
|
184
|
+
return GetUsedBytes() + GetFreeBytes();
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
// private:
|
|
188
|
+
|
|
189
|
+
// Rounds up the memory size to fit memory alignment of cudaMalloc.
|
|
190
|
+
size_t GetRoundedSize(size_t size) {
|
|
191
|
+
return ((size + kRoundSize - 1) / kRoundSize) * kRoundSize;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
// Get bin index regarding the memory size
|
|
195
|
+
int GetBinIndex(size_t size) {
|
|
196
|
+
return (size - 1) / kRoundSize;
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
int GetArenaIndex(size_t size, cudaStream_t stream_ptr = 0) {
|
|
200
|
+
int bin_index = GetBinIndex(size);
|
|
201
|
+
ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
|
|
202
|
+
return std::lower_bound(arena_index_map.begin(), arena_index_map.end(), bin_index) - arena_index_map.begin();
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
bool HasArena(cudaStream_t stream_ptr) {
|
|
206
|
+
auto it = free_.find(stream_ptr);
|
|
207
|
+
return it != free_.end();
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
// Returns appropriate arena (list of bins) of a given stream.
|
|
211
|
+
//
|
|
212
|
+
// All free chunks in the stream belong to one of the bin in the arena.
|
|
213
|
+
//
|
|
214
|
+
// Caller is responsible to acquire lock.
|
|
215
|
+
Arena& GetArena(cudaStream_t stream_ptr) {
|
|
216
|
+
return free_[stream_ptr]; // find or create
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
// Returns appropriate arena sparse index of a given stream.
|
|
220
|
+
//
|
|
221
|
+
// Each element of the returned vector is an index value of the arena
|
|
222
|
+
// for the stream. The k-th element of the arena index is the bin index
|
|
223
|
+
// of the arena. For example, when the arena index is `[1, 3]`, it means
|
|
224
|
+
// that the arena has 2 bins, and `arena[0]` is for bin index 1 and
|
|
225
|
+
// `arena[1]` is for bin index 3.
|
|
226
|
+
//
|
|
227
|
+
// Caller is responsible to acquire lock.
|
|
228
|
+
ArenaIndexMap& GetArenaIndexMap(cudaStream_t stream_ptr) {
|
|
229
|
+
return index_[stream_ptr]; // find or create
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
std::shared_ptr<Chunk> PopFromFreeList(FreeList& free_list) {
|
|
233
|
+
auto data = free_list.back();
|
|
234
|
+
free_list.pop_back();
|
|
235
|
+
return data;
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
// std::vector erase-remove idiom
|
|
239
|
+
// http://minus9d.hatenablog.com/entry/20120605/1338896754
|
|
240
|
+
bool EraseFromFreeList(FreeList& free_list, const std::shared_ptr<Chunk>& chunk) {
|
|
241
|
+
assert(!chunk->in_use());
|
|
242
|
+
auto iter = std::find(free_list.begin(), free_list.end(), chunk);
|
|
243
|
+
if (iter == free_list.end()) {
|
|
244
|
+
return false;
|
|
245
|
+
}
|
|
246
|
+
free_list.erase(iter);
|
|
247
|
+
return true;
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
void AppendToFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr = 0);
|
|
251
|
+
|
|
252
|
+
// Removes the chunk from the free list.
|
|
253
|
+
//
|
|
254
|
+
// @return true if the chunk can successfully be removed from
|
|
255
|
+
// the free list. false` otherwise (e.g., the chunk could not
|
|
256
|
+
// be found in the free list as the chunk is allocated.)
|
|
257
|
+
bool RemoveFromFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr = 0);
|
|
258
|
+
|
|
259
|
+
void CompactIndex(cudaStream_t stream_ptr, bool free);
|
|
260
|
+
};
|
|
261
|
+
|
|
262
|
+
// Memory pool for all GPU devices on the host.
|
|
263
|
+
//
|
|
264
|
+
// A memory pool preserves any allocations even if they are freed by the user.
|
|
265
|
+
// Freed memory buffers are held by the memory pool as *free blocks*, and they
|
|
266
|
+
// are reused for further memory allocations of the same sizes. The allocated
|
|
267
|
+
// blocks are managed for each device, so one instance of this class can be
|
|
268
|
+
// used for multiple devices.
|
|
269
|
+
// .. note::
|
|
270
|
+
// When the allocation is skipped by reusing the pre-allocated block, it
|
|
271
|
+
// does not call ``cudaMalloc`` and therefore CPU-GPU synchronization does
|
|
272
|
+
// not occur. It makes interleaves of memory allocations and kernel
|
|
273
|
+
// invocations very fast.
|
|
274
|
+
// .. note::
|
|
275
|
+
// The memory pool holds allocated blocks without freeing as much as
|
|
276
|
+
// possible. It makes the program hold most of the device memory, which may
|
|
277
|
+
// make other CUDA programs running in parallel out-of-memory situation.
|
|
278
|
+
class MemoryPool {
|
|
279
|
+
private:
|
|
280
|
+
int device_id() {
|
|
281
|
+
int device_id = -1;
|
|
282
|
+
CheckStatus(cudaGetDevice(&device_id));
|
|
283
|
+
return device_id;
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
std::unordered_map<int, SingleDeviceMemoryPool> pools_;
|
|
287
|
+
|
|
288
|
+
public:
|
|
289
|
+
MemoryPool() {}
|
|
290
|
+
|
|
291
|
+
~MemoryPool() { pools_.clear(); }
|
|
292
|
+
|
|
293
|
+
// Allocates the memory, from the pool if possible.
|
|
294
|
+
//
|
|
295
|
+
// Args:
|
|
296
|
+
// size (int): Size of the memory buffer to allocate in bytes.
|
|
297
|
+
// stream_ptr (cudaStream_t): Get the memory from the arena of given stream
|
|
298
|
+
// Returns:
|
|
299
|
+
// intptr_t: Pointer address to the allocated buffer.
|
|
300
|
+
intptr_t Malloc(size_t size, cudaStream_t stream_ptr = 0) {
|
|
301
|
+
auto& mp = pools_[device_id()];
|
|
302
|
+
return mp.Malloc(size, stream_ptr);
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
// Frees the memory, to the pool
|
|
306
|
+
//
|
|
307
|
+
// Args:
|
|
308
|
+
// ptr (intptr_t): Pointer of the memory buffer
|
|
309
|
+
// stream_ptr (cudaStream_t): Return the memory to the arena of given stream
|
|
310
|
+
void Free(intptr_t ptr, cudaStream_t stream_ptr = 0) {
|
|
311
|
+
auto& mp = pools_[device_id()];
|
|
312
|
+
mp.Free(ptr, stream_ptr);
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
// Free all **non-split** chunks in all arenas
|
|
316
|
+
void FreeAllBlocks() {
|
|
317
|
+
auto& mp = pools_[device_id()];
|
|
318
|
+
return mp.FreeAllBlocks();
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
// Free all **non-split** chunks in specified arena
|
|
322
|
+
//
|
|
323
|
+
// Args:
|
|
324
|
+
// stream_ptr (cudaStream_t): Release free blocks in the arena of given stream
|
|
325
|
+
void FreeAllBlocks(cudaStream_t stream_ptr) {
|
|
326
|
+
auto& mp = pools_[device_id()];
|
|
327
|
+
return mp.FreeAllBlocks(stream_ptr);
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
// Count the total number of free blocks.
|
|
331
|
+
//
|
|
332
|
+
// Returns:
|
|
333
|
+
// size_t: The total number of free blocks.
|
|
334
|
+
size_t GetNumFreeBlocks() {
|
|
335
|
+
auto& mp = pools_[device_id()];
|
|
336
|
+
return mp.GetNumFreeBlocks();
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
// Get the total number of bytes used.
|
|
340
|
+
//
|
|
341
|
+
// Returns:
|
|
342
|
+
// size_t: The total number of bytes used.
|
|
343
|
+
size_t GetUsedBytes() {
|
|
344
|
+
auto& mp = pools_[device_id()];
|
|
345
|
+
return mp.GetUsedBytes();
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
// Get the total number of bytes acquired but not used in the pool.
|
|
349
|
+
//
|
|
350
|
+
// Returns:
|
|
351
|
+
// size_t: The total number of bytes acquired but not used in the pool.
|
|
352
|
+
size_t GetFreeBytes() {
|
|
353
|
+
auto& mp = pools_[device_id()];
|
|
354
|
+
return mp.GetFreeBytes();
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
// Get the total number of bytes acquired in the pool.
|
|
358
|
+
//
|
|
359
|
+
// Returns:
|
|
360
|
+
// size_t: The total number of bytes acquired in the pool.
|
|
361
|
+
size_t GetTotalBytes() {
|
|
362
|
+
auto& mp = pools_[device_id()];
|
|
363
|
+
return mp.GetTotalBytes();
|
|
364
|
+
}
|
|
365
|
+
};
|
|
366
|
+
|
|
367
|
+
} // namespace internal
|
|
368
|
+
} // namespace cumo
|
|
369
|
+
|
|
370
|
+
#endif /* ifndef CUMO_CUDA_MEMORY_POOL_IMPL_H */
|