cumo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266)
  1. checksums.yaml +7 -0
  2. data/.gitignore +27 -0
  3. data/.travis.yml +5 -0
  4. data/3rd_party/mkmf-cu/.gitignore +36 -0
  5. data/3rd_party/mkmf-cu/Gemfile +3 -0
  6. data/3rd_party/mkmf-cu/LICENSE +21 -0
  7. data/3rd_party/mkmf-cu/README.md +36 -0
  8. data/3rd_party/mkmf-cu/Rakefile +11 -0
  9. data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
  10. data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
  11. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
  12. data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
  13. data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
  14. data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
  15. data/CODE_OF_CONDUCT.md +46 -0
  16. data/Gemfile +8 -0
  17. data/LICENSE.txt +82 -0
  18. data/README.md +252 -0
  19. data/Rakefile +43 -0
  20. data/bench/broadcast_fp32.rb +138 -0
  21. data/bench/cumo_bench.rb +193 -0
  22. data/bench/numo_bench.rb +138 -0
  23. data/bench/reduction_fp32.rb +117 -0
  24. data/bin/console +14 -0
  25. data/bin/setup +8 -0
  26. data/cumo.gemspec +32 -0
  27. data/ext/cumo/cuda/cublas.c +278 -0
  28. data/ext/cumo/cuda/driver.c +421 -0
  29. data/ext/cumo/cuda/memory_pool.cpp +185 -0
  30. data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
  31. data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
  32. data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
  33. data/ext/cumo/cuda/nvrtc.c +207 -0
  34. data/ext/cumo/cuda/runtime.c +167 -0
  35. data/ext/cumo/cumo.c +148 -0
  36. data/ext/cumo/depend.erb +58 -0
  37. data/ext/cumo/extconf.rb +179 -0
  38. data/ext/cumo/include/cumo.h +25 -0
  39. data/ext/cumo/include/cumo/compat.h +23 -0
  40. data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
  41. data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
  42. data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
  43. data/ext/cumo/include/cumo/cuda/driver.h +22 -0
  44. data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
  45. data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
  46. data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
  47. data/ext/cumo/include/cumo/indexer.h +238 -0
  48. data/ext/cumo/include/cumo/intern.h +142 -0
  49. data/ext/cumo/include/cumo/intern_fwd.h +38 -0
  50. data/ext/cumo/include/cumo/intern_kernel.h +6 -0
  51. data/ext/cumo/include/cumo/narray.h +429 -0
  52. data/ext/cumo/include/cumo/narray_kernel.h +149 -0
  53. data/ext/cumo/include/cumo/ndloop.h +95 -0
  54. data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
  55. data/ext/cumo/include/cumo/template.h +158 -0
  56. data/ext/cumo/include/cumo/template_kernel.h +77 -0
  57. data/ext/cumo/include/cumo/types/bit.h +40 -0
  58. data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
  59. data/ext/cumo/include/cumo/types/complex.h +402 -0
  60. data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
  61. data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
  62. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
  63. data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
  64. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
  65. data/ext/cumo/include/cumo/types/dfloat.h +47 -0
  66. data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
  67. data/ext/cumo/include/cumo/types/float_def.h +34 -0
  68. data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
  69. data/ext/cumo/include/cumo/types/float_macro.h +191 -0
  70. data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
  71. data/ext/cumo/include/cumo/types/int16.h +24 -0
  72. data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
  73. data/ext/cumo/include/cumo/types/int32.h +24 -0
  74. data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
  75. data/ext/cumo/include/cumo/types/int64.h +24 -0
  76. data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
  77. data/ext/cumo/include/cumo/types/int8.h +24 -0
  78. data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
  79. data/ext/cumo/include/cumo/types/int_macro.h +67 -0
  80. data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
  81. data/ext/cumo/include/cumo/types/real_accum.h +486 -0
  82. data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
  83. data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
  84. data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
  85. data/ext/cumo/include/cumo/types/robject.h +27 -0
  86. data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
  87. data/ext/cumo/include/cumo/types/scomplex.h +46 -0
  88. data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
  89. data/ext/cumo/include/cumo/types/sfloat.h +48 -0
  90. data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
  91. data/ext/cumo/include/cumo/types/uint16.h +25 -0
  92. data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
  93. data/ext/cumo/include/cumo/types/uint32.h +25 -0
  94. data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
  95. data/ext/cumo/include/cumo/types/uint64.h +25 -0
  96. data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
  97. data/ext/cumo/include/cumo/types/uint8.h +25 -0
  98. data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
  99. data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
  100. data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
  101. data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
  102. data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
  103. data/ext/cumo/narray/SFMT-params.h +97 -0
  104. data/ext/cumo/narray/SFMT-params19937.h +46 -0
  105. data/ext/cumo/narray/SFMT.c +620 -0
  106. data/ext/cumo/narray/SFMT.h +167 -0
  107. data/ext/cumo/narray/array.c +638 -0
  108. data/ext/cumo/narray/data.c +961 -0
  109. data/ext/cumo/narray/gen/cogen.rb +56 -0
  110. data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
  111. data/ext/cumo/narray/gen/def/bit.rb +37 -0
  112. data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
  113. data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
  114. data/ext/cumo/narray/gen/def/int16.rb +36 -0
  115. data/ext/cumo/narray/gen/def/int32.rb +36 -0
  116. data/ext/cumo/narray/gen/def/int64.rb +36 -0
  117. data/ext/cumo/narray/gen/def/int8.rb +36 -0
  118. data/ext/cumo/narray/gen/def/robject.rb +37 -0
  119. data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
  120. data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
  121. data/ext/cumo/narray/gen/def/uint16.rb +36 -0
  122. data/ext/cumo/narray/gen/def/uint32.rb +36 -0
  123. data/ext/cumo/narray/gen/def/uint64.rb +36 -0
  124. data/ext/cumo/narray/gen/def/uint8.rb +36 -0
  125. data/ext/cumo/narray/gen/erbpp2.rb +346 -0
  126. data/ext/cumo/narray/gen/narray_def.rb +268 -0
  127. data/ext/cumo/narray/gen/spec.rb +425 -0
  128. data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
  129. data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
  130. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
  131. data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
  132. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
  133. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
  134. data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
  135. data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
  136. data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
  137. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
  138. data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
  139. data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
  140. data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
  141. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
  142. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
  143. data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
  144. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
  145. data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
  146. data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
  147. data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
  148. data/ext/cumo/narray/gen/tmpl/class.c +9 -0
  149. data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
  150. data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
  151. data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
  152. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
  153. data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
  154. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
  155. data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
  156. data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
  157. data/ext/cumo/narray/gen/tmpl/each.c +47 -0
  158. data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
  159. data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
  160. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
  161. data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
  162. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
  163. data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
  164. data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
  165. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
  166. data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
  167. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
  168. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
  169. data/ext/cumo/narray/gen/tmpl/format.c +62 -0
  170. data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
  171. data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
  172. data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
  173. data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
  174. data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
  175. data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
  176. data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
  177. data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
  178. data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
  179. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
  180. data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
  181. data/ext/cumo/narray/gen/tmpl/median.c +66 -0
  182. data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
  183. data/ext/cumo/narray/gen/tmpl/module.c +9 -0
  184. data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
  185. data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
  186. data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
  187. data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
  188. data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
  189. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
  190. data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
  191. data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
  192. data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
  193. data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
  194. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
  195. data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
  196. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
  197. data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
  198. data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
  199. data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
  200. data/ext/cumo/narray/gen/tmpl/store.c +41 -0
  201. data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
  202. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
  203. data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
  204. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
  205. data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
  206. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
  207. data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
  208. data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
  209. data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
  210. data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
  211. data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
  212. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
  213. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
  214. data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
  215. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
  216. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
  217. data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
  218. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
  219. data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
  220. data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
  221. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
  222. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
  223. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
  224. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
  225. data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
  226. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
  227. data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
  228. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
  229. data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
  230. data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
  231. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
  232. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
  233. data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
  234. data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
  235. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
  236. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
  237. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
  238. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
  239. data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
  240. data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
  241. data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
  242. data/ext/cumo/narray/index.c +880 -0
  243. data/ext/cumo/narray/kwargs.c +153 -0
  244. data/ext/cumo/narray/math.c +142 -0
  245. data/ext/cumo/narray/narray.c +1948 -0
  246. data/ext/cumo/narray/ndloop.c +2105 -0
  247. data/ext/cumo/narray/rand.c +45 -0
  248. data/ext/cumo/narray/step.c +474 -0
  249. data/ext/cumo/narray/struct.c +886 -0
  250. data/lib/cumo.rb +3 -0
  251. data/lib/cumo/cuda.rb +11 -0
  252. data/lib/cumo/cuda/compile_error.rb +36 -0
  253. data/lib/cumo/cuda/compiler.rb +161 -0
  254. data/lib/cumo/cuda/device.rb +47 -0
  255. data/lib/cumo/cuda/link_state.rb +31 -0
  256. data/lib/cumo/cuda/module.rb +40 -0
  257. data/lib/cumo/cuda/nvrtc_program.rb +27 -0
  258. data/lib/cumo/linalg.rb +12 -0
  259. data/lib/cumo/narray.rb +2 -0
  260. data/lib/cumo/narray/extra.rb +1278 -0
  261. data/lib/erbpp.rb +294 -0
  262. data/lib/erbpp/line_number.rb +137 -0
  263. data/lib/erbpp/narray_def.rb +381 -0
  264. data/numo-narray-version +1 -0
  265. data/run.gdb +7 -0
  266. metadata +353 -0
data/ext/cumo/cuda/memory_pool_impl.cpp
@@ -0,0 +1,308 @@
+ #include "memory_pool_impl.hpp"
+
+ #include <ruby.h>
+
+ namespace cumo {
+ namespace internal {
+
+ void CheckStatus(cudaError_t status) {
+     if (status != 0) {
+         throw CUDARuntimeError(status);
+     }
+ }
+
+ Memory::Memory(size_t size) : size_(size) {
+     if (size_ > 0) {
+         CheckStatus(cudaGetDevice(&device_id_));
+         CheckStatus(cudaMallocManaged(&ptr_, size_, cudaMemAttachGlobal));
+         // std::cout << "cudaMalloc " << ptr_ << std::endl;
+     }
+ }
+
+ Memory::~Memory() {
+     if (size_ > 0) {
+         // std::cout << "cudaFree " << ptr_ << std::endl;
+         cudaError_t status = cudaFree(ptr_);
+         // The CUDA driver may shut down before the memory pool frees its memory.
+         // It is okay to ignore this case; the driver then frees the memory automatically.
+         if (status != cudaErrorCudartUnloading) {
+             CheckStatus(status);
+         }
+     }
+ }
+
+ std::shared_ptr<Chunk> Split(std::shared_ptr<Chunk>& self, size_t size) {
+     assert(self->size_ >= size);
+     if (self->size_ == size) {
+         return nullptr;
+     }
+
+     auto remaining = std::make_shared<Chunk>(self->mem_, self->offset_ + size, self->size_ - size, self->stream_ptr_);
+     self->size_ = size;
+
+     if (self->next_) {
+         remaining->set_next(std::move(self->next_));
+         remaining->next()->set_prev(remaining);
+     }
+     self->next_ = remaining;
+     remaining->set_prev(self);
+
+     return remaining;
+ }
+
+
+ void Merge(std::shared_ptr<Chunk>& self, std::shared_ptr<Chunk> remaining) {
+     assert(remaining != nullptr);
+     assert(self->stream_ptr_ == remaining->stream_ptr());
+     self->size_ += remaining->size();
+     self->next_ = remaining->next();
+     if (remaining->next() != nullptr) {
+         self->next_->set_prev(self);
+     }
+ }
+
+ void SingleDeviceMemoryPool::AppendToFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr) {
+     assert(chunk != nullptr && !chunk->in_use());
+     int bin_index = GetBinIndex(size);
+
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     Arena& arena = GetArena(stream_ptr);
+     ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
+     int arena_index = std::lower_bound(arena_index_map.begin(), arena_index_map.end(), bin_index) - arena_index_map.begin();
+     int length = static_cast<int>(arena_index_map.size());
+     if (arena_index >= length || arena_index_map.at(arena_index) != bin_index) {
+         arena_index_map.insert(arena_index_map.begin() + arena_index, bin_index);
+         arena.insert(arena.begin() + arena_index, FreeList{});
+     }
+     FreeList& free_list = arena[arena_index];
+     free_list.emplace_back(chunk);
+ }
+
+ bool SingleDeviceMemoryPool::RemoveFromFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr) {
+     assert(chunk != nullptr && !chunk->in_use());
+     int bin_index = GetBinIndex(size);
+
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     Arena& arena = GetArena(stream_ptr);
+     ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
+     if (arena_index_map.size() == 0) {
+         return false;
+     }
+     int arena_index = std::lower_bound(arena_index_map.begin(), arena_index_map.end(), bin_index) - arena_index_map.begin();
+     if (static_cast<size_t>(arena_index) == arena_index_map.size()) {
+         // No bin exists for the given chunk size.
+         return false;
+     }
+     if (arena_index_map.at(arena_index) != bin_index) {
+         return false;
+     }
+     assert(arena.size() > static_cast<size_t>(arena_index));
+     FreeList& free_list = arena[arena_index];
+     return EraseFromFreeList(free_list, chunk);
+ }
+
+ intptr_t SingleDeviceMemoryPool::Malloc(size_t size, cudaStream_t stream_ptr) {
+     size = GetRoundedSize(size);
+     std::shared_ptr<Chunk> chunk = nullptr;
+
+     {
+         std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+         // Find the best fit, or the smallest larger allocation.
+         Arena& arena = GetArena(stream_ptr);
+         int arena_index = GetArenaIndex(size);
+         int arena_length = static_cast<int>(arena.size());
+         for (int i = arena_index; i < arena_length; ++i) {
+             FreeList& free_list = arena[i];
+             if (free_list.empty()) {
+                 continue;
+             }
+             chunk = PopFromFreeList(free_list);
+             // TODO(sonots): compact_index
+             break;
+         }
+     }
+
+     if (chunk != nullptr) {
+         std::shared_ptr<Chunk> remaining = Split(chunk, size);
+         if (remaining != nullptr) {
+             AppendToFreeList(remaining->size(), remaining, stream_ptr);
+         }
+     } else {
+         // Fall back to cudaMalloc if no cached chunk is found.
+         std::shared_ptr<Memory> mem = nullptr;
+         try {
+             mem = std::make_shared<Memory>(size);
+         } catch (const CUDARuntimeError& e) {
+             if (e.status() != cudaErrorMemoryAllocation) {
+                 throw;
+             }
+             FreeAllBlocks();
+             try {
+                 mem = std::make_shared<Memory>(size);
+             } catch (const CUDARuntimeError& e) {
+                 if (e.status() != cudaErrorMemoryAllocation) {
+                     throw;
+                 }
+ #ifdef NO_RUBY // the C++ test does not link against libruby
+                 size_t total = size + GetTotalBytes();
+                 throw OutOfMemoryError(size, total);
+ #else
+                 rb_funcall(rb_define_module("GC"), rb_intern("start"), 0);
+                 try {
+                     mem = std::make_shared<Memory>(size);
+                 } catch (const CUDARuntimeError& e) {
+                     if (e.status() != cudaErrorMemoryAllocation) {
+                         throw;
+                     }
+                     size_t total = size + GetTotalBytes();
+                     throw OutOfMemoryError(size, total);
+                 }
+ #endif
+             }
+         }
+         chunk = std::make_shared<Chunk>(mem, 0, size, stream_ptr);
+     }
+
+     assert(chunk != nullptr);
+     assert(chunk->stream_ptr() == stream_ptr);
+     {
+         std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+         chunk->set_in_use(true);
+         in_use_.emplace(chunk->ptr(), chunk);
+     }
+     return chunk->ptr();
+ }
+
+ void SingleDeviceMemoryPool::Free(intptr_t ptr, cudaStream_t stream_ptr) {
+     std::shared_ptr<Chunk> chunk = nullptr;
+
+     {
+         std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+         chunk = in_use_[ptr];
+         // assert(chunk != nullptr);
+         if (!chunk) return;
+         chunk->set_in_use(false);
+         in_use_.erase(ptr);
+     }
+
+     if (chunk->next() != nullptr && !chunk->next()->in_use()) {
+         if (RemoveFromFreeList(chunk->next()->size(), chunk->next(), stream_ptr)) {
+             Merge(chunk, chunk->next());
+         }
+     }
+     if (chunk->prev() != nullptr && !chunk->prev()->in_use()) {
+         if (RemoveFromFreeList(chunk->prev()->size(), chunk->prev(), stream_ptr)) {
+             chunk = chunk->prev();
+             Merge(chunk, chunk->next());
+         }
+     }
+     AppendToFreeList(chunk->size(), chunk, stream_ptr);
+ }
+
+ void SingleDeviceMemoryPool::CompactIndex(cudaStream_t stream_ptr, bool free) {
+     // The caller must hold the lock.
+     if (!HasArena(stream_ptr)) return;
+
+     Arena new_arena;
+     ArenaIndexMap new_arena_index_map;
+     Arena& arena = GetArena(stream_ptr);
+     ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
+     size_t arena_length = arena.size();
+     for (size_t arena_index = 0; arena_index < arena_length; ++arena_index) {
+         FreeList& free_list = arena[arena_index];
+         if (free_list.empty()) {
+             continue;
+         }
+         if (free) {
+             FreeList keep_list;
+             for (auto& chunk : free_list) {
+                 if (chunk->prev() != nullptr || chunk->next() != nullptr) {
+                     keep_list.emplace_back(chunk);
+                 }
+             }
+             if (keep_list.size() == 0) {
+                 continue;
+             }
+             new_arena_index_map.emplace_back(arena_index_map[arena_index]);
+             new_arena.emplace_back(keep_list);
+         } else {
+             new_arena_index_map.emplace_back(arena_index_map[arena_index]);
+             new_arena.emplace_back(free_list);
+         }
+     }
+     if (new_arena.empty()) {
+         index_.erase(stream_ptr);
+         free_.erase(stream_ptr);
+     } else {
+         arena_index_map.swap(new_arena_index_map);
+         arena.swap(new_arena);
+     }
+ }
+
+ // Frees all **non-split** chunks in all arenas.
+ void SingleDeviceMemoryPool::FreeAllBlocks() {
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     std::vector<cudaStream_t> keys(free_.size());
+     std::transform(free_.begin(), free_.end(), keys.begin(), [](auto& pair) { return pair.first; });
+     for (cudaStream_t stream_ptr : keys) {
+         CompactIndex(stream_ptr, true);
+     }
+ }
+
+ // Frees all **non-split** chunks in the specified arena.
+ void SingleDeviceMemoryPool::FreeAllBlocks(cudaStream_t stream_ptr) {
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     CompactIndex(stream_ptr, true);
+ }
+
+ size_t SingleDeviceMemoryPool::GetNumFreeBlocks() {
+     size_t n = 0;
+
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     for (auto& kv : free_) {
+         Arena& arena = kv.second;
+         for (auto& free_list : arena) {
+             n += free_list.size();
+         }
+     }
+     return n;
+ }
+
+ size_t SingleDeviceMemoryPool::GetUsedBytes() {
+     size_t size = 0;
+
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     for (auto& kv : in_use_) {
+         std::shared_ptr<Chunk>& chunk = kv.second;
+         if (chunk) size += chunk->size();
+     }
+     return size;
+ }
+
+ size_t SingleDeviceMemoryPool::GetFreeBytes() {
+     size_t size = 0;
+
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     for (auto& kv : free_) {
+         Arena& arena = kv.second;
+         for (auto& free_list : arena) {
+             for (auto& chunk : free_list) {
+                 if (chunk) size += chunk->size();
+             }
+         }
+     }
+     return size;
+ }
+
+ } // namespace internal
+ } // namespace cumo
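
The allocation path above can be traced with a short usage sketch. This is a hypothetical example, not part of the package: it assumes a CUDA-capable machine and a binary built against the files in this diff (as the bundled memory_pool_impl_test.cpp is). Malloc rounds the request up to a multiple of kRoundSize (512), reuses the smallest cached chunk that fits (splitting it when it is larger), and Free merges neighboring free chunks back together.

// Hypothetical sketch; not part of the cumo package.
#include <cassert>
#include <cstdint>
#include "memory_pool_impl.hpp"

int main() {
    cumo::internal::SingleDeviceMemoryPool pool;

    // 1000 bytes round up to 1024 (two 512-byte units), served by cudaMallocManaged.
    intptr_t p = pool.Malloc(1000);
    assert(pool.GetUsedBytes() == 1024);

    // Free() returns the chunk to the free list; no memory goes back to CUDA.
    pool.Free(p);
    assert(pool.GetFreeBytes() == 1024 && pool.GetNumFreeBlocks() == 1);

    // 500 bytes round up to 512; the cached 1024-byte chunk is the smallest
    // fit, so Split() leaves a 512-byte remainder on the free list.
    intptr_t q = pool.Malloc(500);
    assert(pool.GetUsedBytes() == 512 && pool.GetFreeBytes() == 512);

    // Free() merges the two contiguous 512-byte chunks back into one block.
    pool.Free(q);
    assert(pool.GetNumFreeBlocks() == 1 && pool.GetFreeBytes() == 1024);
    return 0;
}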
data/ext/cumo/cuda/memory_pool_impl.hpp
@@ -0,0 +1,370 @@
+ #ifndef CUMO_CUDA_MEMORY_POOL_IMPL_H
+ #define CUMO_CUDA_MEMORY_POOL_IMPL_H
+
+ #include <algorithm>
+ #include <cassert>
+ #include <memory>
+ #include <mutex>
+ #include <stdexcept>
+ #include <unordered_map>
+ #include <vector>
+
+ #include <cuda_runtime.h>
+
+ // CUDA memory pool implementation, closely modeled on CuPy's.
+
+ namespace cumo {
+ namespace internal {
+
+ // cudaMalloc() returns memory aligned to at least 512 bytes.
+ // cf. https://gist.github.com/sonots/41daaa6432b1c8b27ef782cd14064269
+ constexpr int kRoundSize = 512; // bytes
+
+ class CUDARuntimeError : public std::runtime_error {
+ private:
+     cudaError_t status_;
+
+ public:
+     CUDARuntimeError(cudaError_t status) :
+         runtime_error(cudaGetErrorString(status)), status_(status) {}
+     cudaError_t status() const { return status_; }
+ };
+
+
+ class OutOfMemoryError : public std::runtime_error {
+ public:
+     OutOfMemoryError(size_t size, size_t total) :
+         runtime_error("out of memory to allocate " + std::to_string(size) + " bytes (total " + std::to_string(total) + " bytes)") {}
+ };
+
+ void CheckStatus(cudaError_t status);
+
+ // Memory allocation on a CUDA device.
+ //
+ // This class provides an RAII interface for CUDA memory allocation.
+ class Memory {
+ private:
+     // Pointer to the allocated device buffer.
+     void* ptr_ = nullptr;
+     // Size of the memory allocation in bytes.
+     size_t size_ = 0;
+     // GPU device id whose memory the pointer refers to.
+     int device_id_ = -1;
+
+ public:
+     Memory(size_t size);
+
+     ~Memory();
+
+     intptr_t ptr() const { return reinterpret_cast<intptr_t>(ptr_); }
+
+     size_t size() const { return size_; }
+
+     int device_id() const { return device_id_; }
+ };
+
+ // A chunk points to a device memory region.
+ //
+ // A chunk may be a memory block split off from a larger allocation.
+ // The prev/next pointers form a doubly-linked list of chunks, sorted by
+ // base address; neighboring chunks are contiguous in memory.
+ class Chunk {
+ private:
+     // The device memory buffer.
+     std::shared_ptr<Memory> mem_;
+     // Memory address.
+     intptr_t ptr_ = 0;
+     // Offset in bytes from the head of the buffer.
+     size_t offset_ = 0;
+     // Chunk size in bytes.
+     size_t size_ = 0;
+     // GPU device id whose memory the pointer refers to.
+     int device_id_;
+     // Previous chunk if this one was split off a larger allocation.
+     std::shared_ptr<Chunk> prev_;
+     // Next chunk if this one was split off a larger allocation.
+     std::shared_ptr<Chunk> next_;
+     // Raw handle of the CUDA stream.
+     cudaStream_t stream_ptr_;
+     // Whether the chunk is in use.
+     bool in_use_ = false;
+
+ public:
+     Chunk() {}
+
+     // mem: The device memory buffer.
+     // offset: Offset in bytes from the head of the buffer.
+     // size: Chunk size in bytes.
+     // stream_ptr: Raw handle of the CUDA stream.
+     Chunk(const std::shared_ptr<Memory>& mem, size_t offset, size_t size, cudaStream_t stream_ptr = 0) :
+         mem_(mem), ptr_(mem->ptr() + offset), offset_(offset), size_(size), device_id_(mem->device_id()), stream_ptr_(stream_ptr) {
+         assert(mem->ptr() > 0 || offset == 0);
+     }
+
+     Chunk(const Chunk&) = default;
+
+     ~Chunk() {
+         // std::cout << "Chunk dtor " << (void*)ptr_ << " " << this << std::endl;
+     }
+
+     intptr_t ptr() const { return ptr_; }
+
+     size_t offset() const { return offset_; }
+
+     size_t size() const { return size_; }
+
+     int device_id() const { return device_id_; }
+
+     const std::shared_ptr<Chunk>& prev() const { return prev_; }
+
+     std::shared_ptr<Chunk>& prev() { return prev_; }
+
+     const std::shared_ptr<Chunk>& next() const { return next_; }
+
+     std::shared_ptr<Chunk>& next() { return next_; }
+
+     cudaStream_t stream_ptr() const { return stream_ptr_; }
+
+     void set_prev(const std::shared_ptr<Chunk>& prev) { prev_ = prev; }
+
+     void set_next(const std::shared_ptr<Chunk>& next) { next_ = next; }
+
+     bool in_use() const { return in_use_; }
+
+     void set_in_use(bool in_use) { in_use_ = in_use; }
+
+     // Splits a contiguous block off a larger allocation.
+     friend std::shared_ptr<Chunk> Split(std::shared_ptr<Chunk>& self, size_t size);
+
+     // Merges a previously split block (chunk) back.
+     friend void Merge(std::shared_ptr<Chunk>& self, std::shared_ptr<Chunk> remaining);
+ };
+
+ using FreeList = std::vector<std::shared_ptr<Chunk>>; // list of free chunks
+ using Arena = std::vector<FreeList>;                  // free lists indexed by arena index
+ using ArenaIndexMap = std::vector<int>;               // maps arena index to bin index
+
+ // Memory pool implementation for a single device.
+ // - The allocator attempts to find the smallest cached block that fits
+ //   the requested size. If the block is larger than the requested size,
+ //   it may be split. If no block is found, the allocator delegates to
+ //   cudaMalloc.
+ // - If cudaMalloc fails, the allocator frees all cached blocks that
+ //   are not split and retries the allocation.
+ class SingleDeviceMemoryPool {
+ private:
+     int device_id_;
+     std::unordered_map<intptr_t, std::shared_ptr<Chunk>> in_use_; // ptr => Chunk
+     std::unordered_map<cudaStream_t, Arena> free_;
+     std::unordered_map<cudaStream_t, ArenaIndexMap> index_;
+     std::recursive_mutex mutex_;
+
+ public:
+     SingleDeviceMemoryPool() {
+         CheckStatus(cudaGetDevice(&device_id_));
+     }
+
+     intptr_t Malloc(size_t size, cudaStream_t stream_ptr = 0);
+
+     void Free(intptr_t ptr, cudaStream_t stream_ptr = 0);
+
+     // Frees all **non-split** chunks in all arenas.
+     void FreeAllBlocks();
+
+     // Frees all **non-split** chunks in the specified arena.
+     void FreeAllBlocks(cudaStream_t stream_ptr);
+
+     size_t GetNumFreeBlocks();
+
+     size_t GetUsedBytes();
+
+     size_t GetFreeBytes();
+
+     size_t GetTotalBytes() {
+         return GetUsedBytes() + GetFreeBytes();
+     }
+
+ // private:
+
+     // Rounds up the memory size to fit the memory alignment of cudaMalloc.
+     size_t GetRoundedSize(size_t size) {
+         return ((size + kRoundSize - 1) / kRoundSize) * kRoundSize;
+     }
+
+     // Returns the bin index corresponding to the memory size.
+     int GetBinIndex(size_t size) {
+         return (size - 1) / kRoundSize;
+     }
+
+     int GetArenaIndex(size_t size, cudaStream_t stream_ptr = 0) {
+         int bin_index = GetBinIndex(size);
+         ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
+         return std::lower_bound(arena_index_map.begin(), arena_index_map.end(), bin_index) - arena_index_map.begin();
+     }
+
+     bool HasArena(cudaStream_t stream_ptr) {
+         auto it = free_.find(stream_ptr);
+         return it != free_.end();
+     }
+
+     // Returns the arena (list of bins) of the given stream.
+     //
+     // All free chunks in the stream belong to one of the bins in the arena.
+     //
+     // The caller is responsible for acquiring the lock.
+     Arena& GetArena(cudaStream_t stream_ptr) {
+         return free_[stream_ptr]; // find or create
+     }
+
+     // Returns the sparse arena index map of the given stream.
+     //
+     // The k-th element of the returned vector is the bin index of the k-th
+     // bin in the arena. For example, when the arena index map is `[1, 3]`,
+     // the arena has 2 bins: `arena[0]` is for bin index 1 and `arena[1]`
+     // is for bin index 3.
+     //
+     // The caller is responsible for acquiring the lock.
+     ArenaIndexMap& GetArenaIndexMap(cudaStream_t stream_ptr) {
+         return index_[stream_ptr]; // find or create
+     }
+
+     std::shared_ptr<Chunk> PopFromFreeList(FreeList& free_list) {
+         auto data = free_list.back();
+         free_list.pop_back();
+         return data;
+     }
+
+     // std::vector erase-remove idiom
+     // http://minus9d.hatenablog.com/entry/20120605/1338896754
+     bool EraseFromFreeList(FreeList& free_list, const std::shared_ptr<Chunk>& chunk) {
+         assert(!chunk->in_use());
+         auto iter = std::find(free_list.begin(), free_list.end(), chunk);
+         if (iter == free_list.end()) {
+             return false;
+         }
+         free_list.erase(iter);
+         return true;
+     }
+
+     void AppendToFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr = 0);
+
+     // Removes the chunk from the free list.
+     //
+     // @return `true` if the chunk was successfully removed from the free
+     //     list, `false` otherwise (e.g., the chunk is not in the free list
+     //     because it is allocated).
+     bool RemoveFromFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr = 0);
+
+     void CompactIndex(cudaStream_t stream_ptr, bool free);
+ };
+
+ // Memory pool for all GPU devices on the host.
+ //
+ // A memory pool preserves allocations even after the user frees them.
+ // Freed memory buffers are held by the memory pool as *free blocks*, and
+ // they are reused for further memory allocations of the same sizes. The
+ // allocated blocks are managed per device, so one instance of this class
+ // can be used for multiple devices.
+ // .. note::
+ //    When an allocation is served by reusing a pre-allocated block, it does
+ //    not call ``cudaMalloc``, so no CPU-GPU synchronization occurs. This
+ //    makes interleaved memory allocations and kernel invocations very fast.
+ // .. note::
+ //    The memory pool holds on to allocated blocks without freeing them as
+ //    much as possible. This makes the program hold most of the device
+ //    memory, which may drive other CUDA programs running in parallel into
+ //    an out-of-memory situation.
+ class MemoryPool {
+ private:
+     int device_id() {
+         int device_id = -1;
+         CheckStatus(cudaGetDevice(&device_id));
+         return device_id;
+     }
+
+     std::unordered_map<int, SingleDeviceMemoryPool> pools_;
+
+ public:
+     MemoryPool() {}
+
+     ~MemoryPool() { pools_.clear(); }
+
+     // Allocates the memory, from the pool if possible.
+     //
+     // Args:
+     //     size (size_t): Size of the memory buffer to allocate in bytes.
+     //     stream_ptr (cudaStream_t): Get the memory from the arena of the given stream.
+     // Returns:
+     //     intptr_t: Pointer address to the allocated buffer.
+     intptr_t Malloc(size_t size, cudaStream_t stream_ptr = 0) {
+         auto& mp = pools_[device_id()];
+         return mp.Malloc(size, stream_ptr);
+     }
+
+     // Frees the memory, back to the pool.
+     //
+     // Args:
+     //     ptr (intptr_t): Pointer to the memory buffer.
+     //     stream_ptr (cudaStream_t): Return the memory to the arena of the given stream.
+     void Free(intptr_t ptr, cudaStream_t stream_ptr = 0) {
+         auto& mp = pools_[device_id()];
+         mp.Free(ptr, stream_ptr);
+     }
+
+     // Frees all **non-split** chunks in all arenas.
+     void FreeAllBlocks() {
+         auto& mp = pools_[device_id()];
+         return mp.FreeAllBlocks();
+     }
+
+     // Frees all **non-split** chunks in the specified arena.
+     //
+     // Args:
+     //     stream_ptr (cudaStream_t): Release free blocks in the arena of the given stream.
+     void FreeAllBlocks(cudaStream_t stream_ptr) {
+         auto& mp = pools_[device_id()];
+         return mp.FreeAllBlocks(stream_ptr);
+     }
+
+     // Counts the total number of free blocks.
+     //
+     // Returns:
+     //     size_t: The total number of free blocks.
+     size_t GetNumFreeBlocks() {
+         auto& mp = pools_[device_id()];
+         return mp.GetNumFreeBlocks();
+     }
+
+     // Gets the total number of bytes used.
+     //
+     // Returns:
+     //     size_t: The total number of bytes used.
+     size_t GetUsedBytes() {
+         auto& mp = pools_[device_id()];
+         return mp.GetUsedBytes();
+     }
+
+     // Gets the total number of bytes acquired but not used by the pool.
+     //
+     // Returns:
+     //     size_t: The total number of bytes acquired but not used by the pool.
+     size_t GetFreeBytes() {
+         auto& mp = pools_[device_id()];
+         return mp.GetFreeBytes();
+     }
+
+     // Gets the total number of bytes acquired by the pool.
+     //
+     // Returns:
+     //     size_t: The total number of bytes acquired by the pool.
+     size_t GetTotalBytes() {
+         auto& mp = pools_[device_id()];
+         return mp.GetTotalBytes();
+     }
+ };
+
+ } // namespace internal
+ } // namespace cumo
+
+ #endif /* ifndef CUMO_CUDA_MEMORY_POOL_IMPL_H */
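
The size arithmetic in this header is compact enough to check by hand: GetRoundedSize rounds a request up to the next multiple of kRoundSize (512), and GetBinIndex maps each size to a 512-byte-wide bin, so requests of 1-512 bytes share bin 0, 513-1024 bytes share bin 1, and so on. A standalone restatement of those two one-liners (hypothetical, for illustration only; the function names here are not part of the package):

// Hypothetical sketch mirroring GetRoundedSize/GetBinIndex above.
#include <cstddef>

constexpr int kRoundSize = 512; // bytes, as in memory_pool_impl.hpp

constexpr size_t RoundedSize(size_t size) {
    return ((size + kRoundSize - 1) / kRoundSize) * kRoundSize;
}

constexpr int BinIndex(size_t size) {
    return static_cast<int>((size - 1) / kRoundSize);
}

// 1..512 bytes round to 512 and land in bin 0; 513..1024 round to 1024, bin 1.
static_assert(RoundedSize(1) == 512 && BinIndex(512) == 0, "bin 0");
static_assert(RoundedSize(512) == 512, "exact multiples are unchanged");
static_assert(RoundedSize(513) == 1024 && BinIndex(1024) == 1, "bin 1");

int main() { return 0; }

Because bins are created lazily (AppendToFreeList inserts a bin only when a free chunk first lands in it), the ArenaIndexMap stays sparse: only bin indices that have ever held a free chunk appear in it.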