cumo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266)
  1. checksums.yaml +7 -0
  2. data/.gitignore +27 -0
  3. data/.travis.yml +5 -0
  4. data/3rd_party/mkmf-cu/.gitignore +36 -0
  5. data/3rd_party/mkmf-cu/Gemfile +3 -0
  6. data/3rd_party/mkmf-cu/LICENSE +21 -0
  7. data/3rd_party/mkmf-cu/README.md +36 -0
  8. data/3rd_party/mkmf-cu/Rakefile +11 -0
  9. data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
  10. data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
  11. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
  12. data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
  13. data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
  14. data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
  15. data/CODE_OF_CONDUCT.md +46 -0
  16. data/Gemfile +8 -0
  17. data/LICENSE.txt +82 -0
  18. data/README.md +252 -0
  19. data/Rakefile +43 -0
  20. data/bench/broadcast_fp32.rb +138 -0
  21. data/bench/cumo_bench.rb +193 -0
  22. data/bench/numo_bench.rb +138 -0
  23. data/bench/reduction_fp32.rb +117 -0
  24. data/bin/console +14 -0
  25. data/bin/setup +8 -0
  26. data/cumo.gemspec +32 -0
  27. data/ext/cumo/cuda/cublas.c +278 -0
  28. data/ext/cumo/cuda/driver.c +421 -0
  29. data/ext/cumo/cuda/memory_pool.cpp +185 -0
  30. data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
  31. data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
  32. data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
  33. data/ext/cumo/cuda/nvrtc.c +207 -0
  34. data/ext/cumo/cuda/runtime.c +167 -0
  35. data/ext/cumo/cumo.c +148 -0
  36. data/ext/cumo/depend.erb +58 -0
  37. data/ext/cumo/extconf.rb +179 -0
  38. data/ext/cumo/include/cumo.h +25 -0
  39. data/ext/cumo/include/cumo/compat.h +23 -0
  40. data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
  41. data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
  42. data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
  43. data/ext/cumo/include/cumo/cuda/driver.h +22 -0
  44. data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
  45. data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
  46. data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
  47. data/ext/cumo/include/cumo/indexer.h +238 -0
  48. data/ext/cumo/include/cumo/intern.h +142 -0
  49. data/ext/cumo/include/cumo/intern_fwd.h +38 -0
  50. data/ext/cumo/include/cumo/intern_kernel.h +6 -0
  51. data/ext/cumo/include/cumo/narray.h +429 -0
  52. data/ext/cumo/include/cumo/narray_kernel.h +149 -0
  53. data/ext/cumo/include/cumo/ndloop.h +95 -0
  54. data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
  55. data/ext/cumo/include/cumo/template.h +158 -0
  56. data/ext/cumo/include/cumo/template_kernel.h +77 -0
  57. data/ext/cumo/include/cumo/types/bit.h +40 -0
  58. data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
  59. data/ext/cumo/include/cumo/types/complex.h +402 -0
  60. data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
  61. data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
  62. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
  63. data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
  64. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
  65. data/ext/cumo/include/cumo/types/dfloat.h +47 -0
  66. data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
  67. data/ext/cumo/include/cumo/types/float_def.h +34 -0
  68. data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
  69. data/ext/cumo/include/cumo/types/float_macro.h +191 -0
  70. data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
  71. data/ext/cumo/include/cumo/types/int16.h +24 -0
  72. data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
  73. data/ext/cumo/include/cumo/types/int32.h +24 -0
  74. data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
  75. data/ext/cumo/include/cumo/types/int64.h +24 -0
  76. data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
  77. data/ext/cumo/include/cumo/types/int8.h +24 -0
  78. data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
  79. data/ext/cumo/include/cumo/types/int_macro.h +67 -0
  80. data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
  81. data/ext/cumo/include/cumo/types/real_accum.h +486 -0
  82. data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
  83. data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
  84. data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
  85. data/ext/cumo/include/cumo/types/robject.h +27 -0
  86. data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
  87. data/ext/cumo/include/cumo/types/scomplex.h +46 -0
  88. data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
  89. data/ext/cumo/include/cumo/types/sfloat.h +48 -0
  90. data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
  91. data/ext/cumo/include/cumo/types/uint16.h +25 -0
  92. data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
  93. data/ext/cumo/include/cumo/types/uint32.h +25 -0
  94. data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
  95. data/ext/cumo/include/cumo/types/uint64.h +25 -0
  96. data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
  97. data/ext/cumo/include/cumo/types/uint8.h +25 -0
  98. data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
  99. data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
  100. data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
  101. data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
  102. data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
  103. data/ext/cumo/narray/SFMT-params.h +97 -0
  104. data/ext/cumo/narray/SFMT-params19937.h +46 -0
  105. data/ext/cumo/narray/SFMT.c +620 -0
  106. data/ext/cumo/narray/SFMT.h +167 -0
  107. data/ext/cumo/narray/array.c +638 -0
  108. data/ext/cumo/narray/data.c +961 -0
  109. data/ext/cumo/narray/gen/cogen.rb +56 -0
  110. data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
  111. data/ext/cumo/narray/gen/def/bit.rb +37 -0
  112. data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
  113. data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
  114. data/ext/cumo/narray/gen/def/int16.rb +36 -0
  115. data/ext/cumo/narray/gen/def/int32.rb +36 -0
  116. data/ext/cumo/narray/gen/def/int64.rb +36 -0
  117. data/ext/cumo/narray/gen/def/int8.rb +36 -0
  118. data/ext/cumo/narray/gen/def/robject.rb +37 -0
  119. data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
  120. data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
  121. data/ext/cumo/narray/gen/def/uint16.rb +36 -0
  122. data/ext/cumo/narray/gen/def/uint32.rb +36 -0
  123. data/ext/cumo/narray/gen/def/uint64.rb +36 -0
  124. data/ext/cumo/narray/gen/def/uint8.rb +36 -0
  125. data/ext/cumo/narray/gen/erbpp2.rb +346 -0
  126. data/ext/cumo/narray/gen/narray_def.rb +268 -0
  127. data/ext/cumo/narray/gen/spec.rb +425 -0
  128. data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
  129. data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
  130. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
  131. data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
  132. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
  133. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
  134. data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
  135. data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
  136. data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
  137. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
  138. data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
  139. data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
  140. data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
  141. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
  142. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
  143. data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
  144. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
  145. data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
  146. data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
  147. data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
  148. data/ext/cumo/narray/gen/tmpl/class.c +9 -0
  149. data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
  150. data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
  151. data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
  152. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
  153. data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
  154. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
  155. data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
  156. data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
  157. data/ext/cumo/narray/gen/tmpl/each.c +47 -0
  158. data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
  159. data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
  160. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
  161. data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
  162. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
  163. data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
  164. data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
  165. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
  166. data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
  167. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
  168. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
  169. data/ext/cumo/narray/gen/tmpl/format.c +62 -0
  170. data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
  171. data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
  172. data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
  173. data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
  174. data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
  175. data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
  176. data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
  177. data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
  178. data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
  179. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
  180. data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
  181. data/ext/cumo/narray/gen/tmpl/median.c +66 -0
  182. data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
  183. data/ext/cumo/narray/gen/tmpl/module.c +9 -0
  184. data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
  185. data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
  186. data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
  187. data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
  188. data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
  189. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
  190. data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
  191. data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
  192. data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
  193. data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
  194. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
  195. data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
  196. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
  197. data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
  198. data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
  199. data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
  200. data/ext/cumo/narray/gen/tmpl/store.c +41 -0
  201. data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
  202. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
  203. data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
  204. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
  205. data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
  206. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
  207. data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
  208. data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
  209. data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
  210. data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
  211. data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
  212. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
  213. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
  214. data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
  215. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
  216. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
  217. data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
  218. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
  219. data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
  220. data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
  221. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
  222. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
  223. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
  224. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
  225. data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
  226. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
  227. data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
  228. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
  229. data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
  230. data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
  231. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
  232. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
  233. data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
  234. data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
  235. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
  236. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
  237. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
  238. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
  239. data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
  240. data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
  241. data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
  242. data/ext/cumo/narray/index.c +880 -0
  243. data/ext/cumo/narray/kwargs.c +153 -0
  244. data/ext/cumo/narray/math.c +142 -0
  245. data/ext/cumo/narray/narray.c +1948 -0
  246. data/ext/cumo/narray/ndloop.c +2105 -0
  247. data/ext/cumo/narray/rand.c +45 -0
  248. data/ext/cumo/narray/step.c +474 -0
  249. data/ext/cumo/narray/struct.c +886 -0
  250. data/lib/cumo.rb +3 -0
  251. data/lib/cumo/cuda.rb +11 -0
  252. data/lib/cumo/cuda/compile_error.rb +36 -0
  253. data/lib/cumo/cuda/compiler.rb +161 -0
  254. data/lib/cumo/cuda/device.rb +47 -0
  255. data/lib/cumo/cuda/link_state.rb +31 -0
  256. data/lib/cumo/cuda/module.rb +40 -0
  257. data/lib/cumo/cuda/nvrtc_program.rb +27 -0
  258. data/lib/cumo/linalg.rb +12 -0
  259. data/lib/cumo/narray.rb +2 -0
  260. data/lib/cumo/narray/extra.rb +1278 -0
  261. data/lib/erbpp.rb +294 -0
  262. data/lib/erbpp/line_number.rb +137 -0
  263. data/lib/erbpp/narray_def.rb +381 -0
  264. data/numo-narray-version +1 -0
  265. data/run.gdb +7 -0
  266. metadata +353 -0
data/ext/cumo/cuda/memory_pool_impl.cpp
@@ -0,0 +1,308 @@
+ #include "memory_pool_impl.hpp"
+
+ #include <ruby.h>
+
+ namespace cumo {
+ namespace internal {
+
+ void CheckStatus(cudaError_t status) {
+     if (status != 0) {
+         throw CUDARuntimeError(status);
+     }
+ }
+
+ Memory::Memory(size_t size) : size_(size) {
+     if (size_ > 0) {
+         CheckStatus(cudaGetDevice(&device_id_));
+         CheckStatus(cudaMallocManaged(&ptr_, size_, cudaMemAttachGlobal));
+         // std::cout << "cudaMalloc " << ptr_ << std::endl;
+     }
+ }
+
+ Memory::~Memory() {
+     if (size_ > 0) {
+         // std::cout << "cudaFree " << ptr_ << std::endl;
+         cudaError_t status = cudaFree(ptr_);
+         // The CUDA driver may shut down before the memory pool frees its memory.
+         // It is okay to ignore this case; the driver then frees the memory automatically.
+         if (status != cudaErrorCudartUnloading) {
+             CheckStatus(status);
+         }
+     }
+ }
+
+ std::shared_ptr<Chunk> Split(std::shared_ptr<Chunk>& self, size_t size) {
+     assert(self->size_ >= size);
+     if (self->size_ == size) {
+         return nullptr;
+     }
+
+     auto remaining = std::make_shared<Chunk>(self->mem_, self->offset_ + size, self->size_ - size, self->stream_ptr_);
+     self->size_ = size;
+
+     if (self->next_) {
+         remaining->set_next(std::move(self->next_));
+         remaining->next()->set_prev(remaining);
+     }
+     self->next_ = remaining;
+     remaining->set_prev(self);
+
+     return remaining;
+ }
+
+
+ void Merge(std::shared_ptr<Chunk>& self, std::shared_ptr<Chunk> remaining) {
+     assert(remaining != nullptr);
+     assert(self->stream_ptr_ == remaining->stream_ptr());
+     self->size_ += remaining->size();
+     self->next_ = remaining->next();
+     if (remaining->next() != nullptr) {
+         self->next_->set_prev(self);
+     }
+ }
+
+ void SingleDeviceMemoryPool::AppendToFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr) {
+     assert(chunk != nullptr && !chunk->in_use());
+     int bin_index = GetBinIndex(size);
+
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     Arena& arena = GetArena(stream_ptr);
+     ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
+     int arena_index = std::lower_bound(arena_index_map.begin(), arena_index_map.end(), bin_index) - arena_index_map.begin();
+     int length = static_cast<int>(arena_index_map.size());
+     if (arena_index >= length || arena_index_map.at(arena_index) != bin_index) {
+         arena_index_map.insert(arena_index_map.begin() + arena_index, bin_index);
+         arena.insert(arena.begin() + arena_index, FreeList{});
+     }
+     FreeList& free_list = arena[arena_index];
+     free_list.emplace_back(chunk);
+ }
+
+ bool SingleDeviceMemoryPool::RemoveFromFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr) {
+     assert(chunk != nullptr && !chunk->in_use());
+     int bin_index = GetBinIndex(size);
+
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     Arena& arena = GetArena(stream_ptr);
+     ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
+     if (arena_index_map.size() == 0) {
+         return false;
+     }
+     int arena_index = std::lower_bound(arena_index_map.begin(), arena_index_map.end(), bin_index) - arena_index_map.begin();
+     if (static_cast<size_t>(arena_index) == arena_index_map.size()) {
+         // No bin exists for the given chunk size.
+         return false;
+     }
+     if (arena_index_map.at(arena_index) != bin_index) {
+         return false;
+     }
+     assert(arena.size() > static_cast<size_t>(arena_index));
+     FreeList& free_list = arena[arena_index];
+     return EraseFromFreeList(free_list, chunk);
+ }
+
+ intptr_t SingleDeviceMemoryPool::Malloc(size_t size, cudaStream_t stream_ptr) {
+     size = GetRoundedSize(size);
+     std::shared_ptr<Chunk> chunk = nullptr;
+
+     {
+         std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+         // Find the best fit, or the smallest larger allocation.
+         Arena& arena = GetArena(stream_ptr);
+         int arena_index = GetArenaIndex(size);
+         int arena_length = static_cast<int>(arena.size());
+         for (int i = arena_index; i < arena_length; ++i) {
+             FreeList& free_list = arena[i];
+             if (free_list.empty()) {
+                 continue;
+             }
+             chunk = PopFromFreeList(free_list);
+             // TODO(sonots): compact_index
+             break;
+         }
+     }
+
+     if (chunk != nullptr) {
+         std::shared_ptr<Chunk> remaining = Split(chunk, size);
+         if (remaining != nullptr) {
+             AppendToFreeList(remaining->size(), remaining, stream_ptr);
+         }
+     } else {
+         // Fall back to cudaMalloc if no cached chunk is found.
+         std::shared_ptr<Memory> mem = nullptr;
+         try {
+             mem = std::make_shared<Memory>(size);
+         } catch (const CUDARuntimeError& e) {
+             if (e.status() != cudaErrorMemoryAllocation) {
+                 throw;
+             }
+             FreeAllBlocks();
+             try {
+                 mem = std::make_shared<Memory>(size);
+             } catch (const CUDARuntimeError& e) {
+                 if (e.status() != cudaErrorMemoryAllocation) {
+                     throw;
+                 }
+ #ifdef NO_RUBY // the C++ test does not link against libruby
+                 size_t total = size + GetTotalBytes();
+                 throw OutOfMemoryError(size, total);
+ #else
+                 rb_funcall(rb_define_module("GC"), rb_intern("start"), 0);
+                 try {
+                     mem = std::make_shared<Memory>(size);
+                 } catch (const CUDARuntimeError& e) {
+                     if (e.status() != cudaErrorMemoryAllocation) {
+                         throw;
+                     }
+                     size_t total = size + GetTotalBytes();
+                     throw OutOfMemoryError(size, total);
+                 }
+ #endif
+             }
+         }
+         chunk = std::make_shared<Chunk>(mem, 0, size, stream_ptr);
+     }
+
+     assert(chunk != nullptr);
+     assert(chunk->stream_ptr() == stream_ptr);
+     {
+         std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+         chunk->set_in_use(true);
+         in_use_.emplace(chunk->ptr(), chunk);
+     }
+     return chunk->ptr();
+ }
+
+ void SingleDeviceMemoryPool::Free(intptr_t ptr, cudaStream_t stream_ptr) {
+     std::shared_ptr<Chunk> chunk = nullptr;
+
+     {
+         std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+         chunk = in_use_[ptr];
+         // assert(chunk != nullptr);
+         if (!chunk) return;
+         chunk->set_in_use(false);
+         in_use_.erase(ptr);
+     }
+
+     if (chunk->next() != nullptr && !chunk->next()->in_use()) {
+         if (RemoveFromFreeList(chunk->next()->size(), chunk->next(), stream_ptr)) {
+             Merge(chunk, chunk->next());
+         }
+     }
+     if (chunk->prev() != nullptr && !chunk->prev()->in_use()) {
+         if (RemoveFromFreeList(chunk->prev()->size(), chunk->prev(), stream_ptr)) {
+             chunk = chunk->prev();
+             Merge(chunk, chunk->next());
+         }
+     }
+     AppendToFreeList(chunk->size(), chunk, stream_ptr);
+ }
+
+ void SingleDeviceMemoryPool::CompactIndex(cudaStream_t stream_ptr, bool free) {
+     // The caller must hold the lock.
+     if (!HasArena(stream_ptr)) return;
+
+     Arena new_arena;
+     ArenaIndexMap new_arena_index_map;
+     Arena& arena = GetArena(stream_ptr);
+     ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
+     size_t arena_length = arena.size();
+     for (size_t arena_index = 0; arena_index < arena_length; ++arena_index) {
+         FreeList& free_list = arena[arena_index];
+         if (free_list.empty()) {
+             continue;
+         }
+         if (free) {
+             FreeList keep_list;
+             for (auto& chunk : free_list) {
+                 if (chunk->prev() != nullptr || chunk->next() != nullptr) {
+                     keep_list.emplace_back(chunk);
+                 }
+             }
+             if (keep_list.size() == 0) {
+                 continue;
+             }
+             new_arena_index_map.emplace_back(arena_index_map[arena_index]);
+             new_arena.emplace_back(keep_list);
+         } else {
+             new_arena_index_map.emplace_back(arena_index_map[arena_index]);
+             new_arena.emplace_back(free_list);
+         }
+     }
+     if (new_arena.empty()) {
+         index_.erase(stream_ptr);
+         free_.erase(stream_ptr);
+     } else {
+         arena_index_map.swap(new_arena_index_map);
+         arena.swap(new_arena);
+     }
+ }
+
+ // Frees all **non-split** chunks in all arenas.
+ void SingleDeviceMemoryPool::FreeAllBlocks() {
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     std::vector<cudaStream_t> keys(free_.size());
+     std::transform(free_.begin(), free_.end(), keys.begin(), [](auto& pair) { return pair.first; });
+     for (cudaStream_t stream_ptr : keys) {
+         CompactIndex(stream_ptr, true);
+     }
+ }
+
+ // Frees all **non-split** chunks in the specified arena.
+ void SingleDeviceMemoryPool::FreeAllBlocks(cudaStream_t stream_ptr) {
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     CompactIndex(stream_ptr, true);
+ }
+
+ size_t SingleDeviceMemoryPool::GetNumFreeBlocks() {
+     size_t n = 0;
+
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     for (auto& kv : free_) {
+         Arena& arena = kv.second;
+         for (auto& free_list : arena) {
+             n += free_list.size();
+         }
+     }
+     return n;
+ }
+
+ size_t SingleDeviceMemoryPool::GetUsedBytes() {
+     size_t size = 0;
+
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     for (auto& kv : in_use_) {
+         std::shared_ptr<Chunk>& chunk = kv.second;
+         if (chunk) size += chunk->size();
+     }
+     return size;
+ }
+
+ size_t SingleDeviceMemoryPool::GetFreeBytes() {
+     size_t size = 0;
+
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     for (auto& kv : free_) {
+         Arena& arena = kv.second;
+         for (auto& free_list : arena) {
+             for (auto& chunk : free_list) {
+                 if (chunk) size += chunk->size();
+             }
+         }
+     }
+     return size;
+ }
+
+ } // namespace internal
+ } // namespace cumo
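
The allocation path above can be traced with a short usage sketch. This is a hypothetical example, not part of the package: it assumes a CUDA-capable machine and a binary built against the files in this diff (as the bundled memory_pool_impl_test.cpp is). Malloc rounds the request up to a multiple of kRoundSize (512), reuses the smallest cached chunk that fits (splitting it when it is larger), and Free merges neighboring free chunks back together.

// Hypothetical sketch; not part of the cumo package.
#include <cassert>
#include <cstdint>
#include "memory_pool_impl.hpp"

int main() {
    cumo::internal::SingleDeviceMemoryPool pool;

    // 1000 bytes round up to 1024 (two 512-byte units), served by cudaMallocManaged.
    intptr_t p = pool.Malloc(1000);
    assert(pool.GetUsedBytes() == 1024);

    // Free() returns the chunk to the free list; no memory goes back to CUDA.
    pool.Free(p);
    assert(pool.GetFreeBytes() == 1024 && pool.GetNumFreeBlocks() == 1);

    // 500 bytes round up to 512; the cached 1024-byte chunk is the smallest
    // fit, so Split() leaves a 512-byte remainder on the free list.
    intptr_t q = pool.Malloc(500);
    assert(pool.GetUsedBytes() == 512 && pool.GetFreeBytes() == 512);

    // Free() merges the two contiguous 512-byte chunks back into one block.
    pool.Free(q);
    assert(pool.GetNumFreeBlocks() == 1 && pool.GetFreeBytes() == 1024);
    return 0;
}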
data/ext/cumo/cuda/memory_pool_impl.hpp
@@ -0,0 +1,370 @@
+ #ifndef CUMO_CUDA_MEMORY_POOL_IMPL_H
+ #define CUMO_CUDA_MEMORY_POOL_IMPL_H
+
+ #include <algorithm>
+ #include <cassert>
+ #include <memory>
+ #include <mutex>
+ #include <stdexcept>
+ #include <unordered_map>
+ #include <vector>
+
+ #include <cuda_runtime.h>
+
+ // CUDA memory pool implementation, closely modeled on CuPy's.
+
+ namespace cumo {
+ namespace internal {
+
+ // cudaMalloc() returns memory aligned to at least 512 bytes.
+ // cf. https://gist.github.com/sonots/41daaa6432b1c8b27ef782cd14064269
+ constexpr int kRoundSize = 512; // bytes
+
+ class CUDARuntimeError : public std::runtime_error {
+ private:
+     cudaError_t status_;
+
+ public:
+     CUDARuntimeError(cudaError_t status) :
+         runtime_error(cudaGetErrorString(status)), status_(status) {}
+     cudaError_t status() const { return status_; }
+ };
+
+
+ class OutOfMemoryError : public std::runtime_error {
+ public:
+     OutOfMemoryError(size_t size, size_t total) :
+         runtime_error("out of memory to allocate " + std::to_string(size) + " bytes (total " + std::to_string(total) + " bytes)") {}
+ };
+
+ void CheckStatus(cudaError_t status);
+
+ // Memory allocation on a CUDA device.
+ //
+ // This class provides an RAII interface for CUDA memory allocation.
+ class Memory {
+ private:
+     // Pointer to the allocated device buffer.
+     void* ptr_ = nullptr;
+     // Size of the memory allocation in bytes.
+     size_t size_ = 0;
+     // GPU device id whose memory the pointer refers to.
+     int device_id_ = -1;
+
+ public:
+     Memory(size_t size);
+
+     ~Memory();
+
+     intptr_t ptr() const { return reinterpret_cast<intptr_t>(ptr_); }
+
+     size_t size() const { return size_; }
+
+     int device_id() const { return device_id_; }
+ };
+
+ // A chunk points to a device memory region.
+ //
+ // A chunk may be a memory block split off from a larger allocation.
+ // The prev/next pointers form a doubly-linked list of chunks, sorted by
+ // base address; neighboring chunks are contiguous in memory.
+ class Chunk {
+ private:
+     // The device memory buffer.
+     std::shared_ptr<Memory> mem_;
+     // Memory address.
+     intptr_t ptr_ = 0;
+     // Offset in bytes from the head of the buffer.
+     size_t offset_ = 0;
+     // Chunk size in bytes.
+     size_t size_ = 0;
+     // GPU device id whose memory the pointer refers to.
+     int device_id_;
+     // Previous chunk if this one was split off a larger allocation.
+     std::shared_ptr<Chunk> prev_;
+     // Next chunk if this one was split off a larger allocation.
+     std::shared_ptr<Chunk> next_;
+     // Raw handle of the CUDA stream.
+     cudaStream_t stream_ptr_;
+     // Whether the chunk is in use.
+     bool in_use_ = false;
+
+ public:
+     Chunk() {}
+
+     // mem: The device memory buffer.
+     // offset: Offset in bytes from the head of the buffer.
+     // size: Chunk size in bytes.
+     // stream_ptr: Raw handle of the CUDA stream.
+     Chunk(const std::shared_ptr<Memory>& mem, size_t offset, size_t size, cudaStream_t stream_ptr = 0) :
+         mem_(mem), ptr_(mem->ptr() + offset), offset_(offset), size_(size), device_id_(mem->device_id()), stream_ptr_(stream_ptr) {
+         assert(mem->ptr() > 0 || offset == 0);
+     }
+
+     Chunk(const Chunk&) = default;
+
+     ~Chunk() {
+         // std::cout << "Chunk dtor " << (void*)ptr_ << " " << this << std::endl;
+     }
+
+     intptr_t ptr() const { return ptr_; }
+
+     size_t offset() const { return offset_; }
+
+     size_t size() const { return size_; }
+
+     int device_id() const { return device_id_; }
+
+     const std::shared_ptr<Chunk>& prev() const { return prev_; }
+
+     std::shared_ptr<Chunk>& prev() { return prev_; }
+
+     const std::shared_ptr<Chunk>& next() const { return next_; }
+
+     std::shared_ptr<Chunk>& next() { return next_; }
+
+     cudaStream_t stream_ptr() const { return stream_ptr_; }
+
+     void set_prev(const std::shared_ptr<Chunk>& prev) { prev_ = prev; }
+
+     void set_next(const std::shared_ptr<Chunk>& next) { next_ = next; }
+
+     bool in_use() const { return in_use_; }
+
+     void set_in_use(bool in_use) { in_use_ = in_use; }
+
+     // Splits a contiguous block off a larger allocation.
+     friend std::shared_ptr<Chunk> Split(std::shared_ptr<Chunk>& self, size_t size);
+
+     // Merges a previously split block (chunk) back.
+     friend void Merge(std::shared_ptr<Chunk>& self, std::shared_ptr<Chunk> remaining);
+ };
+
+ using FreeList = std::vector<std::shared_ptr<Chunk>>; // list of free chunks
+ using Arena = std::vector<FreeList>;                  // free lists indexed by arena index
+ using ArenaIndexMap = std::vector<int>;               // maps arena index to bin index
+
+ // Memory pool implementation for a single device.
+ // - The allocator attempts to find the smallest cached block that fits
+ //   the requested size. If the block is larger than the requested size,
+ //   it may be split. If no block is found, the allocator delegates to
+ //   cudaMalloc.
+ // - If cudaMalloc fails, the allocator frees all cached blocks that
+ //   are not split and retries the allocation.
+ class SingleDeviceMemoryPool {
+ private:
+     int device_id_;
+     std::unordered_map<intptr_t, std::shared_ptr<Chunk>> in_use_; // ptr => Chunk
+     std::unordered_map<cudaStream_t, Arena> free_;
+     std::unordered_map<cudaStream_t, ArenaIndexMap> index_;
+     std::recursive_mutex mutex_;
+
+ public:
+     SingleDeviceMemoryPool() {
+         CheckStatus(cudaGetDevice(&device_id_));
+     }
+
+     intptr_t Malloc(size_t size, cudaStream_t stream_ptr = 0);
+
+     void Free(intptr_t ptr, cudaStream_t stream_ptr = 0);
+
+     // Frees all **non-split** chunks in all arenas.
+     void FreeAllBlocks();
+
+     // Frees all **non-split** chunks in the specified arena.
+     void FreeAllBlocks(cudaStream_t stream_ptr);
+
+     size_t GetNumFreeBlocks();
+
+     size_t GetUsedBytes();
+
+     size_t GetFreeBytes();
+
+     size_t GetTotalBytes() {
+         return GetUsedBytes() + GetFreeBytes();
+     }
+
+ // private:
+
+     // Rounds up the memory size to fit the memory alignment of cudaMalloc.
+     size_t GetRoundedSize(size_t size) {
+         return ((size + kRoundSize - 1) / kRoundSize) * kRoundSize;
+     }
+
+     // Returns the bin index corresponding to the memory size.
+     int GetBinIndex(size_t size) {
+         return (size - 1) / kRoundSize;
+     }
+
+     int GetArenaIndex(size_t size, cudaStream_t stream_ptr = 0) {
+         int bin_index = GetBinIndex(size);
+         ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
+         return std::lower_bound(arena_index_map.begin(), arena_index_map.end(), bin_index) - arena_index_map.begin();
+     }
+
+     bool HasArena(cudaStream_t stream_ptr) {
+         auto it = free_.find(stream_ptr);
+         return it != free_.end();
+     }
+
+     // Returns the arena (list of bins) of the given stream.
+     //
+     // All free chunks in the stream belong to one of the bins in the arena.
+     //
+     // The caller is responsible for acquiring the lock.
+     Arena& GetArena(cudaStream_t stream_ptr) {
+         return free_[stream_ptr]; // find or create
+     }
+
+     // Returns the sparse arena index map of the given stream.
+     //
+     // The k-th element of the returned vector is the bin index of the k-th
+     // bin in the arena. For example, when the arena index map is `[1, 3]`,
+     // the arena has 2 bins: `arena[0]` is for bin index 1 and `arena[1]`
+     // is for bin index 3.
+     //
+     // The caller is responsible for acquiring the lock.
+     ArenaIndexMap& GetArenaIndexMap(cudaStream_t stream_ptr) {
+         return index_[stream_ptr]; // find or create
+     }
+
+     std::shared_ptr<Chunk> PopFromFreeList(FreeList& free_list) {
+         auto data = free_list.back();
+         free_list.pop_back();
+         return data;
+     }
+
+     // std::vector erase-remove idiom
+     // http://minus9d.hatenablog.com/entry/20120605/1338896754
+     bool EraseFromFreeList(FreeList& free_list, const std::shared_ptr<Chunk>& chunk) {
+         assert(!chunk->in_use());
+         auto iter = std::find(free_list.begin(), free_list.end(), chunk);
+         if (iter == free_list.end()) {
+             return false;
+         }
+         free_list.erase(iter);
+         return true;
+     }
+
+     void AppendToFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr = 0);
+
+     // Removes the chunk from the free list.
+     //
+     // @return `true` if the chunk was successfully removed from the free
+     //     list, `false` otherwise (e.g., the chunk is not in the free list
+     //     because it is allocated).
+     bool RemoveFromFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr = 0);
+
+     void CompactIndex(cudaStream_t stream_ptr, bool free);
+ };
+
+ // Memory pool for all GPU devices on the host.
+ //
+ // A memory pool preserves allocations even after the user frees them.
+ // Freed memory buffers are held by the memory pool as *free blocks*, and
+ // they are reused for further memory allocations of the same sizes. The
+ // allocated blocks are managed per device, so one instance of this class
+ // can be used for multiple devices.
+ // .. note::
+ //    When an allocation is served by reusing a pre-allocated block, it does
+ //    not call ``cudaMalloc``, so no CPU-GPU synchronization occurs. This
+ //    makes interleaved memory allocations and kernel invocations very fast.
+ // .. note::
+ //    The memory pool holds on to allocated blocks without freeing them as
+ //    much as possible. This makes the program hold most of the device
+ //    memory, which may drive other CUDA programs running in parallel into
+ //    an out-of-memory situation.
+ class MemoryPool {
+ private:
+     int device_id() {
+         int device_id = -1;
+         CheckStatus(cudaGetDevice(&device_id));
+         return device_id;
+     }
+
+     std::unordered_map<int, SingleDeviceMemoryPool> pools_;
+
+ public:
+     MemoryPool() {}
+
+     ~MemoryPool() { pools_.clear(); }
+
+     // Allocates the memory, from the pool if possible.
+     //
+     // Args:
+     //     size (size_t): Size of the memory buffer to allocate in bytes.
+     //     stream_ptr (cudaStream_t): Get the memory from the arena of the given stream.
+     // Returns:
+     //     intptr_t: Pointer address to the allocated buffer.
+     intptr_t Malloc(size_t size, cudaStream_t stream_ptr = 0) {
+         auto& mp = pools_[device_id()];
+         return mp.Malloc(size, stream_ptr);
+     }
+
+     // Frees the memory, back to the pool.
+     //
+     // Args:
+     //     ptr (intptr_t): Pointer to the memory buffer.
+     //     stream_ptr (cudaStream_t): Return the memory to the arena of the given stream.
+     void Free(intptr_t ptr, cudaStream_t stream_ptr = 0) {
+         auto& mp = pools_[device_id()];
+         mp.Free(ptr, stream_ptr);
+     }
+
+     // Frees all **non-split** chunks in all arenas.
+     void FreeAllBlocks() {
+         auto& mp = pools_[device_id()];
+         return mp.FreeAllBlocks();
+     }
+
+     // Frees all **non-split** chunks in the specified arena.
+     //
+     // Args:
+     //     stream_ptr (cudaStream_t): Release free blocks in the arena of the given stream.
+     void FreeAllBlocks(cudaStream_t stream_ptr) {
+         auto& mp = pools_[device_id()];
+         return mp.FreeAllBlocks(stream_ptr);
+     }
+
+     // Counts the total number of free blocks.
+     //
+     // Returns:
+     //     size_t: The total number of free blocks.
+     size_t GetNumFreeBlocks() {
+         auto& mp = pools_[device_id()];
+         return mp.GetNumFreeBlocks();
+     }
+
+     // Gets the total number of bytes used.
+     //
+     // Returns:
+     //     size_t: The total number of bytes used.
+     size_t GetUsedBytes() {
+         auto& mp = pools_[device_id()];
+         return mp.GetUsedBytes();
+     }
+
+     // Gets the total number of bytes acquired but not used by the pool.
+     //
+     // Returns:
+     //     size_t: The total number of bytes acquired but not used by the pool.
+     size_t GetFreeBytes() {
+         auto& mp = pools_[device_id()];
+         return mp.GetFreeBytes();
+     }
+
+     // Gets the total number of bytes acquired by the pool.
+     //
+     // Returns:
+     //     size_t: The total number of bytes acquired by the pool.
+     size_t GetTotalBytes() {
+         auto& mp = pools_[device_id()];
+         return mp.GetTotalBytes();
+     }
+ };
+
+ } // namespace internal
+ } // namespace cumo
+
+ #endif /* ifndef CUMO_CUDA_MEMORY_POOL_IMPL_H */
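
The size arithmetic in this header is compact enough to check by hand: GetRoundedSize rounds a request up to the next multiple of kRoundSize (512), and GetBinIndex maps each size to a 512-byte-wide bin, so requests of 1-512 bytes share bin 0, 513-1024 bytes share bin 1, and so on. A standalone restatement of those two one-liners (hypothetical, for illustration only; the function names here are not part of the package):

// Hypothetical sketch mirroring GetRoundedSize/GetBinIndex above.
#include <cstddef>

constexpr int kRoundSize = 512; // bytes, as in memory_pool_impl.hpp

constexpr size_t RoundedSize(size_t size) {
    return ((size + kRoundSize - 1) / kRoundSize) * kRoundSize;
}

constexpr int BinIndex(size_t size) {
    return static_cast<int>((size - 1) / kRoundSize);
}

// 1..512 bytes round to 512 and land in bin 0; 513..1024 round to 1024, bin 1.
static_assert(RoundedSize(1) == 512 && BinIndex(512) == 0, "bin 0");
static_assert(RoundedSize(512) == 512, "exact multiples are unchanged");
static_assert(RoundedSize(513) == 1024 && BinIndex(1024) == 1, "bin 1");

int main() { return 0; }

Because bins are created lazily (AppendToFreeList inserts a bin only when a free chunk first lands in it), the ArenaIndexMap stays sparse: only bin indices that have ever held a free chunk appear in it.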