cumo 0.1.0

Files changed (266)
  1. checksums.yaml +7 -0
  2. data/.gitignore +27 -0
  3. data/.travis.yml +5 -0
  4. data/3rd_party/mkmf-cu/.gitignore +36 -0
  5. data/3rd_party/mkmf-cu/Gemfile +3 -0
  6. data/3rd_party/mkmf-cu/LICENSE +21 -0
  7. data/3rd_party/mkmf-cu/README.md +36 -0
  8. data/3rd_party/mkmf-cu/Rakefile +11 -0
  9. data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
  10. data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
  11. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
  12. data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
  13. data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
  14. data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
  15. data/CODE_OF_CONDUCT.md +46 -0
  16. data/Gemfile +8 -0
  17. data/LICENSE.txt +82 -0
  18. data/README.md +252 -0
  19. data/Rakefile +43 -0
  20. data/bench/broadcast_fp32.rb +138 -0
  21. data/bench/cumo_bench.rb +193 -0
  22. data/bench/numo_bench.rb +138 -0
  23. data/bench/reduction_fp32.rb +117 -0
  24. data/bin/console +14 -0
  25. data/bin/setup +8 -0
  26. data/cumo.gemspec +32 -0
  27. data/ext/cumo/cuda/cublas.c +278 -0
  28. data/ext/cumo/cuda/driver.c +421 -0
  29. data/ext/cumo/cuda/memory_pool.cpp +185 -0
  30. data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
  31. data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
  32. data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
  33. data/ext/cumo/cuda/nvrtc.c +207 -0
  34. data/ext/cumo/cuda/runtime.c +167 -0
  35. data/ext/cumo/cumo.c +148 -0
  36. data/ext/cumo/depend.erb +58 -0
  37. data/ext/cumo/extconf.rb +179 -0
  38. data/ext/cumo/include/cumo.h +25 -0
  39. data/ext/cumo/include/cumo/compat.h +23 -0
  40. data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
  41. data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
  42. data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
  43. data/ext/cumo/include/cumo/cuda/driver.h +22 -0
  44. data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
  45. data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
  46. data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
  47. data/ext/cumo/include/cumo/indexer.h +238 -0
  48. data/ext/cumo/include/cumo/intern.h +142 -0
  49. data/ext/cumo/include/cumo/intern_fwd.h +38 -0
  50. data/ext/cumo/include/cumo/intern_kernel.h +6 -0
  51. data/ext/cumo/include/cumo/narray.h +429 -0
  52. data/ext/cumo/include/cumo/narray_kernel.h +149 -0
  53. data/ext/cumo/include/cumo/ndloop.h +95 -0
  54. data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
  55. data/ext/cumo/include/cumo/template.h +158 -0
  56. data/ext/cumo/include/cumo/template_kernel.h +77 -0
  57. data/ext/cumo/include/cumo/types/bit.h +40 -0
  58. data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
  59. data/ext/cumo/include/cumo/types/complex.h +402 -0
  60. data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
  61. data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
  62. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
  63. data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
  64. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
  65. data/ext/cumo/include/cumo/types/dfloat.h +47 -0
  66. data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
  67. data/ext/cumo/include/cumo/types/float_def.h +34 -0
  68. data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
  69. data/ext/cumo/include/cumo/types/float_macro.h +191 -0
  70. data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
  71. data/ext/cumo/include/cumo/types/int16.h +24 -0
  72. data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
  73. data/ext/cumo/include/cumo/types/int32.h +24 -0
  74. data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
  75. data/ext/cumo/include/cumo/types/int64.h +24 -0
  76. data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
  77. data/ext/cumo/include/cumo/types/int8.h +24 -0
  78. data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
  79. data/ext/cumo/include/cumo/types/int_macro.h +67 -0
  80. data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
  81. data/ext/cumo/include/cumo/types/real_accum.h +486 -0
  82. data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
  83. data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
  84. data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
  85. data/ext/cumo/include/cumo/types/robject.h +27 -0
  86. data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
  87. data/ext/cumo/include/cumo/types/scomplex.h +46 -0
  88. data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
  89. data/ext/cumo/include/cumo/types/sfloat.h +48 -0
  90. data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
  91. data/ext/cumo/include/cumo/types/uint16.h +25 -0
  92. data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
  93. data/ext/cumo/include/cumo/types/uint32.h +25 -0
  94. data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
  95. data/ext/cumo/include/cumo/types/uint64.h +25 -0
  96. data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
  97. data/ext/cumo/include/cumo/types/uint8.h +25 -0
  98. data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
  99. data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
  100. data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
  101. data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
  102. data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
  103. data/ext/cumo/narray/SFMT-params.h +97 -0
  104. data/ext/cumo/narray/SFMT-params19937.h +46 -0
  105. data/ext/cumo/narray/SFMT.c +620 -0
  106. data/ext/cumo/narray/SFMT.h +167 -0
  107. data/ext/cumo/narray/array.c +638 -0
  108. data/ext/cumo/narray/data.c +961 -0
  109. data/ext/cumo/narray/gen/cogen.rb +56 -0
  110. data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
  111. data/ext/cumo/narray/gen/def/bit.rb +37 -0
  112. data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
  113. data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
  114. data/ext/cumo/narray/gen/def/int16.rb +36 -0
  115. data/ext/cumo/narray/gen/def/int32.rb +36 -0
  116. data/ext/cumo/narray/gen/def/int64.rb +36 -0
  117. data/ext/cumo/narray/gen/def/int8.rb +36 -0
  118. data/ext/cumo/narray/gen/def/robject.rb +37 -0
  119. data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
  120. data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
  121. data/ext/cumo/narray/gen/def/uint16.rb +36 -0
  122. data/ext/cumo/narray/gen/def/uint32.rb +36 -0
  123. data/ext/cumo/narray/gen/def/uint64.rb +36 -0
  124. data/ext/cumo/narray/gen/def/uint8.rb +36 -0
  125. data/ext/cumo/narray/gen/erbpp2.rb +346 -0
  126. data/ext/cumo/narray/gen/narray_def.rb +268 -0
  127. data/ext/cumo/narray/gen/spec.rb +425 -0
  128. data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
  129. data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
  130. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
  131. data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
  132. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
  133. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
  134. data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
  135. data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
  136. data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
  137. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
  138. data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
  139. data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
  140. data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
  141. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
  142. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
  143. data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
  144. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
  145. data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
  146. data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
  147. data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
  148. data/ext/cumo/narray/gen/tmpl/class.c +9 -0
  149. data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
  150. data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
  151. data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
  152. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
  153. data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
  154. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
  155. data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
  156. data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
  157. data/ext/cumo/narray/gen/tmpl/each.c +47 -0
  158. data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
  159. data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
  160. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
  161. data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
  162. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
  163. data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
  164. data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
  165. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
  166. data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
  167. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
  168. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
  169. data/ext/cumo/narray/gen/tmpl/format.c +62 -0
  170. data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
  171. data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
  172. data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
  173. data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
  174. data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
  175. data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
  176. data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
  177. data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
  178. data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
  179. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
  180. data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
  181. data/ext/cumo/narray/gen/tmpl/median.c +66 -0
  182. data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
  183. data/ext/cumo/narray/gen/tmpl/module.c +9 -0
  184. data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
  185. data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
  186. data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
  187. data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
  188. data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
  189. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
  190. data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
  191. data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
  192. data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
  193. data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
  194. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
  195. data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
  196. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
  197. data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
  198. data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
  199. data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
  200. data/ext/cumo/narray/gen/tmpl/store.c +41 -0
  201. data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
  202. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
  203. data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
  204. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
  205. data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
  206. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
  207. data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
  208. data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
  209. data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
  210. data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
  211. data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
  212. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
  213. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
  214. data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
  215. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
  216. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
  217. data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
  218. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
  219. data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
  220. data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
  221. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
  222. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
  223. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
  224. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
  225. data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
  226. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
  227. data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
  228. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
  229. data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
  230. data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
  231. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
  232. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
  233. data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
  234. data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
  235. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
  236. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
  237. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
  238. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
  239. data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
  240. data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
  241. data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
  242. data/ext/cumo/narray/index.c +880 -0
  243. data/ext/cumo/narray/kwargs.c +153 -0
  244. data/ext/cumo/narray/math.c +142 -0
  245. data/ext/cumo/narray/narray.c +1948 -0
  246. data/ext/cumo/narray/ndloop.c +2105 -0
  247. data/ext/cumo/narray/rand.c +45 -0
  248. data/ext/cumo/narray/step.c +474 -0
  249. data/ext/cumo/narray/struct.c +886 -0
  250. data/lib/cumo.rb +3 -0
  251. data/lib/cumo/cuda.rb +11 -0
  252. data/lib/cumo/cuda/compile_error.rb +36 -0
  253. data/lib/cumo/cuda/compiler.rb +161 -0
  254. data/lib/cumo/cuda/device.rb +47 -0
  255. data/lib/cumo/cuda/link_state.rb +31 -0
  256. data/lib/cumo/cuda/module.rb +40 -0
  257. data/lib/cumo/cuda/nvrtc_program.rb +27 -0
  258. data/lib/cumo/linalg.rb +12 -0
  259. data/lib/cumo/narray.rb +2 -0
  260. data/lib/cumo/narray/extra.rb +1278 -0
  261. data/lib/erbpp.rb +294 -0
  262. data/lib/erbpp/line_number.rb +137 -0
  263. data/lib/erbpp/narray_def.rb +381 -0
  264. data/numo-narray-version +1 -0
  265. data/run.gdb +7 -0
  266. metadata +353 -0
data/ext/cumo/cuda/memory_pool_impl.cpp
@@ -0,0 +1,308 @@
+ #include "memory_pool_impl.hpp"
+
+ #include <ruby.h>
+
+ namespace cumo {
+ namespace internal {
+
+ void CheckStatus(cudaError_t status) {
+     if (status != 0) {
+         throw CUDARuntimeError(status);
+     }
+ }
+
+ Memory::Memory(size_t size) : size_(size) {
+     if (size_ > 0) {
+         CheckStatus(cudaGetDevice(&device_id_));
+         CheckStatus(cudaMallocManaged(&ptr_, size_, cudaMemAttachGlobal));
+         // std::cout << "cudaMalloc " << ptr_ << std::endl;
+     }
+ }
+
+ Memory::~Memory() {
+     if (size_ > 0) {
+         // std::cout << "cudaFree " << ptr_ << std::endl;
+         cudaError_t status = cudaFree(ptr_);
+         // The CUDA driver may shut down before the memory pool frees its memory.
+         // It is okay to ignore this error because the driver automatically frees
+         // the memory on shutdown.
+         if (status != cudaErrorCudartUnloading) {
+             CheckStatus(status);
+         }
+     }
+ }
+
+ std::shared_ptr<Chunk> Split(std::shared_ptr<Chunk>& self, size_t size) {
+     assert(self->size_ >= size);
+     if (self->size_ == size) {
+         return nullptr;
+     }
+
+     auto remaining = std::make_shared<Chunk>(self->mem_, self->offset_ + size, self->size_ - size, self->stream_ptr_);
+     self->size_ = size;
+
+     if (self->next_) {
+         remaining->set_next(std::move(self->next_));
+         remaining->next()->set_prev(remaining);
+     }
+     self->next_ = remaining;
+     remaining->set_prev(self);
+
+     return remaining;
+ }
+
+
+ void Merge(std::shared_ptr<Chunk>& self, std::shared_ptr<Chunk> remaining) {
+     assert(remaining != nullptr);
+     assert(self->stream_ptr_ == remaining->stream_ptr());
+     self->size_ += remaining->size();
+     self->next_ = remaining->next();
+     if (remaining->next() != nullptr) {
+         self->next_->set_prev(self);
+     }
+ }
+
+ void SingleDeviceMemoryPool::AppendToFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr) {
+     assert(chunk != nullptr && !chunk->in_use());
+     int bin_index = GetBinIndex(size);
+
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     Arena& arena = GetArena(stream_ptr);
+     ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
+     int arena_index = std::lower_bound(arena_index_map.begin(), arena_index_map.end(), bin_index) - arena_index_map.begin();
+     int length = static_cast<int>(arena_index_map.size());
+     if (arena_index >= length || arena_index_map.at(arena_index) != bin_index) {
+         arena_index_map.insert(arena_index_map.begin() + arena_index, bin_index);
+         arena.insert(arena.begin() + arena_index, FreeList{});
+     }
+     FreeList& free_list = arena[arena_index];
+     free_list.emplace_back(chunk);
+ }
+
+ bool SingleDeviceMemoryPool::RemoveFromFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr) {
+     assert(chunk != nullptr && !chunk->in_use());
+     int bin_index = GetBinIndex(size);
+
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     Arena& arena = GetArena(stream_ptr);
+     ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
+     if (arena_index_map.size() == 0) {
+         return false;
+     }
+     int arena_index = std::lower_bound(arena_index_map.begin(), arena_index_map.end(), bin_index) - arena_index_map.begin();
+     if (static_cast<size_t>(arena_index) == arena_index_map.size()) {
+         // Bin does not exist for the given chunk size.
+         return false;
+     }
+     if (arena_index_map.at(arena_index) != bin_index) {
+         return false;
+     }
+     assert(arena.size() > static_cast<size_t>(arena_index));
+     FreeList& free_list = arena[arena_index];
+     return EraseFromFreeList(free_list, chunk);
+ }
+
+ intptr_t SingleDeviceMemoryPool::Malloc(size_t size, cudaStream_t stream_ptr) {
+     size = GetRoundedSize(size);
+     std::shared_ptr<Chunk> chunk = nullptr;
+
+     {
+         std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+         // Find a best-fit chunk, or the smallest larger allocation.
+         Arena& arena = GetArena(stream_ptr);
+         int arena_index = GetArenaIndex(size);
+         int arena_length = static_cast<int>(arena.size());
+         for (int i = arena_index; i < arena_length; ++i) {
+             FreeList& free_list = arena[i];
+             if (free_list.empty()) {
+                 continue;
+             }
+             chunk = PopFromFreeList(free_list);
+             // TODO(sonots): compact_index
+             break;
+         }
+     }
+
+     if (chunk != nullptr) {
+         std::shared_ptr<Chunk> remaining = Split(chunk, size);
+         if (remaining != nullptr) {
+             AppendToFreeList(remaining->size(), remaining, stream_ptr);
+         }
+     } else {
+         // Fall back to cudaMalloc if no cached chunk is found.
+         std::shared_ptr<Memory> mem = nullptr;
+         try {
+             mem = std::make_shared<Memory>(size);
+         } catch (const CUDARuntimeError& e) {
+             if (e.status() != cudaErrorMemoryAllocation) {
+                 throw;
+             }
+             FreeAllBlocks();
+             try {
+                 mem = std::make_shared<Memory>(size);
+             } catch (const CUDARuntimeError& e) {
+                 if (e.status() != cudaErrorMemoryAllocation) {
+                     throw;
+                 }
+ #ifdef NO_RUBY // the C++ test does not link with libruby
+                 size_t total = size + GetTotalBytes();
+                 throw OutOfMemoryError(size, total);
+ #else
+                 rb_funcall(rb_define_module("GC"), rb_intern("start"), 0);
+                 try {
+                     mem = std::make_shared<Memory>(size);
+                 } catch (const CUDARuntimeError& e) {
+                     if (e.status() != cudaErrorMemoryAllocation) {
+                         throw;
+                     }
+                     size_t total = size + GetTotalBytes();
+                     throw OutOfMemoryError(size, total);
+                 }
+ #endif
+             }
+         }
+         chunk = std::make_shared<Chunk>(mem, 0, size, stream_ptr);
+     }
+
+     assert(chunk != nullptr);
+     assert(chunk->stream_ptr() == stream_ptr);
+     {
+         std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+         chunk->set_in_use(true);
+         in_use_.emplace(chunk->ptr(), chunk);
+     }
+     return chunk->ptr();
+ }
+
+ void SingleDeviceMemoryPool::Free(intptr_t ptr, cudaStream_t stream_ptr) {
+     std::shared_ptr<Chunk> chunk = nullptr;
+
+     {
+         std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+         chunk = in_use_[ptr];
+         // assert(chunk != nullptr);
+         if (!chunk) return;
+         chunk->set_in_use(false);
+         in_use_.erase(ptr);
+     }
+
+     if (chunk->next() != nullptr && !chunk->next()->in_use()) {
+         if (RemoveFromFreeList(chunk->next()->size(), chunk->next(), stream_ptr)) {
+             Merge(chunk, chunk->next());
+         }
+     }
+     if (chunk->prev() != nullptr && !chunk->prev()->in_use()) {
+         if (RemoveFromFreeList(chunk->prev()->size(), chunk->prev(), stream_ptr)) {
+             chunk = chunk->prev();
+             Merge(chunk, chunk->next());
+         }
+     }
+     AppendToFreeList(chunk->size(), chunk, stream_ptr);
+ }
+
+ void SingleDeviceMemoryPool::CompactIndex(cudaStream_t stream_ptr, bool free) {
+     // Caller must acquire the lock outside this function.
+     if (!HasArena(stream_ptr)) return;
+
+     Arena new_arena;
+     ArenaIndexMap new_arena_index_map;
+     Arena& arena = GetArena(stream_ptr);
+     ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
+     size_t arena_length = arena.size();
+     for (size_t arena_index = 0; arena_index < arena_length; ++arena_index) {
+         FreeList& free_list = arena[arena_index];
+         if (free_list.empty()) {
+             continue;
+         }
+         if (free) {
+             FreeList keep_list;
+             for (auto chunk : free_list) {
+                 if (chunk->prev() != nullptr || chunk->next() != nullptr) {
+                     keep_list.emplace_back(chunk);
+                 }
+             }
+             if (keep_list.size() == 0) {
+                 continue;
+             }
+             new_arena_index_map.emplace_back(arena_index_map[arena_index]);
+             new_arena.emplace_back(keep_list);
+         } else {
+             new_arena_index_map.emplace_back(arena_index_map[arena_index]);
+             new_arena.emplace_back(free_list);
+         }
+     }
+     if (new_arena.empty()) {
+         index_.erase(stream_ptr);
+         free_.erase(stream_ptr);
+     } else {
+         arena_index_map.swap(new_arena_index_map);
+         arena.swap(new_arena);
+     }
+ }
+
+ // Free all **non-split** chunks in all arenas
+ void SingleDeviceMemoryPool::FreeAllBlocks() {
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     std::vector<cudaStream_t> keys(free_.size());
+     transform(free_.begin(), free_.end(), keys.begin(), [](auto pair) { return pair.first; });
+     for (cudaStream_t stream_ptr : keys) {
+         CompactIndex(stream_ptr, true);
+     }
+ }
+
+ // Free all **non-split** chunks in specified arena
+ void SingleDeviceMemoryPool::FreeAllBlocks(cudaStream_t stream_ptr) {
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     CompactIndex(stream_ptr, true);
+ }
+
+ size_t SingleDeviceMemoryPool::GetNumFreeBlocks() {
+     size_t n = 0;
+
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     for (auto kv : free_) {
+         Arena& arena = kv.second;
+         for (auto free_list : arena) {
+             n += free_list.size();
+         }
+     }
+     return n;
+ }
+
+ size_t SingleDeviceMemoryPool::GetUsedBytes() {
+     size_t size = 0;
+
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     for (auto kv : in_use_) {
+         std::shared_ptr<Chunk>& chunk = kv.second;
+         if (chunk) size += chunk->size();
+     }
+     return size;
+ }
+
+ size_t SingleDeviceMemoryPool::GetFreeBytes() {
+     size_t size = 0;
+
+     std::lock_guard<std::recursive_mutex> lock{mutex_};
+
+     for (auto kv : free_) {
+         Arena& arena = kv.second;
+         for (auto free_list : arena) {
+             for (auto chunk : free_list) {
+                 if (chunk) size += chunk->size();
+             }
+         }
+     }
+     return size;
+ }
+
+ } // namespace internal
+ } // namespace cumo
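
The following is a minimal sketch (not part of the gem) of how the allocator above behaves, assuming a CUDA-capable device and linking against memory_pool_impl.cpp. It walks the rounding, split, and merge paths of SingleDeviceMemoryPool:

#include <cassert>
#include "memory_pool_impl.hpp"

int main() {
    cumo::internal::SingleDeviceMemoryPool pool;

    // 1000 bytes rounds up to 1024 (kRoundSize = 512), backed by cudaMallocManaged.
    intptr_t p1 = pool.Malloc(1000);
    assert(pool.GetUsedBytes() == 1024);

    // Free() caches the chunk in the free list instead of calling cudaFree.
    pool.Free(p1);
    assert(pool.GetFreeBytes() == 1024);

    // A smaller request reuses the cached chunk: the 1024-byte block is
    // split, the first 512 bytes are returned, and the rest stays cached.
    intptr_t p2 = pool.Malloc(512);
    assert(p2 == p1);
    assert(pool.GetUsedBytes() == 512 && pool.GetFreeBytes() == 512);

    // Freeing merges the two adjacent free chunks back into one block.
    pool.Free(p2);
    assert(pool.GetNumFreeBlocks() == 1);

    // FreeAllBlocks() releases cached non-split chunks back to CUDA.
    pool.FreeAllBlocks();
    assert(pool.GetFreeBytes() == 0);
    return 0;
}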
data/ext/cumo/cuda/memory_pool_impl.hpp
@@ -0,0 +1,370 @@
+ #ifndef CUMO_CUDA_MEMORY_POOL_IMPL_H
+ #define CUMO_CUDA_MEMORY_POOL_IMPL_H
+
+ #include <algorithm>
+ #include <cassert>
+ #include <memory>
+ #include <mutex>
+ #include <stdexcept>
+ #include <unordered_map>
+ #include <vector>
+
+ #include <cuda_runtime.h>
+
+ // CUDA memory pool implementation, closely modeled on CuPy's.
+
+ namespace cumo {
+ namespace internal {
+
+ // cudaMalloc() is aligned to at least 512 bytes.
+ // cf. https://gist.github.com/sonots/41daaa6432b1c8b27ef782cd14064269
+ constexpr int kRoundSize = 512; // bytes
+
+ class CUDARuntimeError : public std::runtime_error {
+ private:
+     cudaError_t status_;
+
+ public:
+     CUDARuntimeError(cudaError_t status) :
+         runtime_error(cudaGetErrorString(status)), status_(status) {}
+     cudaError_t status() const { return status_; }
+ };
+
+
+ class OutOfMemoryError : public std::runtime_error {
+ public:
+     OutOfMemoryError(size_t size, size_t total) :
+         runtime_error("out of memory to allocate " + std::to_string(size) + " bytes (total " + std::to_string(total) + " bytes)") {}
+ };
+
+ void CheckStatus(cudaError_t status);
+
+ // Memory allocation on a CUDA device.
+ //
+ // This class provides an RAII interface to CUDA memory allocation.
+ class Memory {
+ private:
+     // Pointer to the place within the buffer.
+     void* ptr_ = nullptr;
+     // Size of the memory allocation in bytes.
+     size_t size_ = 0;
+     // GPU device id whose memory the pointer refers to.
+     int device_id_ = -1;
+
+ public:
+     Memory(size_t size);
+
+     ~Memory();
+
+     intptr_t ptr() const { return reinterpret_cast<intptr_t>(ptr_); }
+
+     size_t size() const { return size_; }
+
+     int device_id() const { return device_id_; }
+ };
+
+ // A chunk points to a device memory.
+ //
+ // A chunk might be a memory block split from a larger allocation.
+ // The prev/next pointers construct a doubly-linked list of memory addresses,
+ // sorted by base address, which must be contiguous.
+ class Chunk {
+ private:
+     // The device memory buffer.
+     std::shared_ptr<Memory> mem_;
+     // Memory address.
+     intptr_t ptr_ = 0;
+     // An offset in bytes from the head of the buffer.
+     size_t offset_ = 0;
+     // Chunk size in bytes.
+     size_t size_ = 0;
+     // GPU device id whose memory the pointer refers to.
+     int device_id_;
+     // prev memory pointer if split from a larger allocation
+     std::shared_ptr<Chunk> prev_;
+     // next memory pointer if split from a larger allocation
+     std::shared_ptr<Chunk> next_;
+     // Raw stream handle of the cuda stream
+     cudaStream_t stream_ptr_;
+     // whether the chunk is in use
+     bool in_use_ = false;
+
+ public:
+     Chunk() {}
+
+     // mem: The device memory buffer.
+     // offset: An offset in bytes from the head of the buffer.
+     // size: Chunk size in bytes.
+     // stream_ptr: Raw stream handle of the cuda stream
+     Chunk(const std::shared_ptr<Memory>& mem, size_t offset, size_t size, cudaStream_t stream_ptr = 0) :
+         mem_(mem), ptr_(mem->ptr() + offset), offset_(offset), size_(size), device_id_(mem->device_id()), stream_ptr_(stream_ptr) {
+         assert(mem->ptr() > 0 || offset == 0);
+     }
+
+     Chunk(const Chunk&) = default;
+
+     ~Chunk() {
+         // std::cout << "Chunk dtor " << (void*)ptr_ << " " << this << std::endl;
+     }
+
+     intptr_t ptr() const { return ptr_; }
+
+     size_t offset() const { return offset_; }
+
+     size_t size() const { return size_; }
+
+     int device_id() const { return device_id_; }
+
+     const std::shared_ptr<Chunk>& prev() const { return prev_; }
+
+     std::shared_ptr<Chunk>& prev() { return prev_; }
+
+     const std::shared_ptr<Chunk>& next() const { return next_; }
+
+     std::shared_ptr<Chunk>& next() { return next_; }
+
+     cudaStream_t stream_ptr() const { return stream_ptr_; }
+
+     void set_prev(const std::shared_ptr<Chunk>& prev) { prev_ = prev; }
+
+     void set_next(const std::shared_ptr<Chunk>& next) { next_ = next; }
+
+     bool in_use() const { return in_use_; }
+
+     void set_in_use(bool in_use) { in_use_ = in_use; }
+
+     // Split a contiguous block off a larger allocation.
+     friend std::shared_ptr<Chunk> Split(std::shared_ptr<Chunk>& self, size_t size);
+
+     // Merge a previously split block (chunk).
+     friend void Merge(std::shared_ptr<Chunk>& self, std::shared_ptr<Chunk> remaining);
+ };
+
+ using FreeList = std::vector<std::shared_ptr<Chunk>>;  // list of free chunks
+ using Arena = std::vector<FreeList>;                   // free lists, indexed by arena index
+ using ArenaIndexMap = std::vector<int>;                // arena index <=> bin size index
+
+ // Memory pool implementation for a single device.
+ // - The allocator attempts to find the smallest cached block that will fit
+ //   the requested size. If the block is larger than the requested size,
+ //   it may be split. If no block is found, the allocator will delegate to
+ //   cudaMalloc.
+ // - If cudaMalloc fails, the allocator will free all cached blocks that
+ //   are not split and retry the allocation.
+ class SingleDeviceMemoryPool {
+ private:
+     int device_id_;
+     std::unordered_map<intptr_t, std::shared_ptr<Chunk>> in_use_; // ptr => Chunk
+     std::unordered_map<cudaStream_t, Arena> free_;
+     std::unordered_map<cudaStream_t, ArenaIndexMap> index_;
+     std::recursive_mutex mutex_;
+
+ public:
+     SingleDeviceMemoryPool() {
+         CheckStatus(cudaGetDevice(&device_id_));
+     }
+
+     intptr_t Malloc(size_t size, cudaStream_t stream_ptr = 0);
+
+     void Free(intptr_t ptr, cudaStream_t stream_ptr = 0);
+
+     // Free all **non-split** chunks in all arenas
+     void FreeAllBlocks();
+
+     // Free all **non-split** chunks in specified arena
+     void FreeAllBlocks(cudaStream_t stream_ptr);
+
+     size_t GetNumFreeBlocks();
+
+     size_t GetUsedBytes();
+
+     size_t GetFreeBytes();
+
+     size_t GetTotalBytes() {
+         return GetUsedBytes() + GetFreeBytes();
+     }
+
+ // private:
+
+     // Rounds up the memory size to fit the memory alignment of cudaMalloc.
+     size_t GetRoundedSize(size_t size) {
+         return ((size + kRoundSize - 1) / kRoundSize) * kRoundSize;
+     }
+
+     // Get the bin index for a given memory size.
+     int GetBinIndex(size_t size) {
+         return (size - 1) / kRoundSize;
+     }
+
+     int GetArenaIndex(size_t size, cudaStream_t stream_ptr = 0) {
+         int bin_index = GetBinIndex(size);
+         ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
+         return std::lower_bound(arena_index_map.begin(), arena_index_map.end(), bin_index) - arena_index_map.begin();
+     }
+
+     bool HasArena(cudaStream_t stream_ptr) {
+         auto it = free_.find(stream_ptr);
+         return it != free_.end();
+     }
+
+     // Returns the appropriate arena (list of bins) of a given stream.
+     //
+     // All free chunks in the stream belong to one of the bins in the arena.
+     //
+     // Caller is responsible for acquiring the lock.
+     Arena& GetArena(cudaStream_t stream_ptr) {
+         return free_[stream_ptr]; // find or create
+     }
+
+     // Returns the appropriate arena sparse index of a given stream.
+     //
+     // Each element of the returned vector is an index value of the arena
+     // for the stream. The k-th element of the arena index map is the bin
+     // index of the arena. For example, when the arena index map is `[1, 3]`,
+     // the arena has 2 bins: `arena[0]` is for bin index 1 and `arena[1]`
+     // is for bin index 3.
+     //
+     // Caller is responsible for acquiring the lock.
+     ArenaIndexMap& GetArenaIndexMap(cudaStream_t stream_ptr) {
+         return index_[stream_ptr]; // find or create
+     }
+
+     std::shared_ptr<Chunk> PopFromFreeList(FreeList& free_list) {
+         auto data = free_list.back();
+         free_list.pop_back();
+         return data;
+     }
+
+     // std::vector erase-remove idiom
+     // http://minus9d.hatenablog.com/entry/20120605/1338896754
+     bool EraseFromFreeList(FreeList& free_list, const std::shared_ptr<Chunk>& chunk) {
+         assert(!chunk->in_use());
+         auto iter = std::find(free_list.begin(), free_list.end(), chunk);
+         if (iter == free_list.end()) {
+             return false;
+         }
+         free_list.erase(iter);
+         return true;
+     }
+
+     void AppendToFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr = 0);
+
+     // Removes the chunk from the free list.
+     //
+     // @return `true` if the chunk was successfully removed from
+     //     the free list; `false` otherwise (e.g., the chunk could not
+     //     be found in the free list because it is allocated).
+     bool RemoveFromFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr = 0);
+
+     void CompactIndex(cudaStream_t stream_ptr, bool free);
+ };
+
+ // Memory pool for all GPU devices on the host.
+ //
+ // A memory pool preserves any allocations even if they are freed by the user.
+ // Freed memory buffers are held by the memory pool as *free blocks*, and they
+ // are reused for further memory allocations of the same sizes. The allocated
+ // blocks are managed for each device, so one instance of this class can be
+ // used for multiple devices.
+ // .. note::
+ //    When the allocation is skipped by reusing a pre-allocated block, it
+ //    does not call ``cudaMalloc`` and therefore CPU-GPU synchronization does
+ //    not occur. This makes interleaved memory allocations and kernel
+ //    invocations very fast.
+ // .. note::
+ //    The memory pool holds allocated blocks without freeing them as much as
+ //    possible. This makes the program hold most of the device memory, which
+ //    may cause other CUDA programs running in parallel to run out of memory.
+ class MemoryPool {
+ private:
+     int device_id() {
+         int device_id = -1;
+         CheckStatus(cudaGetDevice(&device_id));
+         return device_id;
+     }
+
+     std::unordered_map<int, SingleDeviceMemoryPool> pools_;
+
+ public:
+     MemoryPool() {}
+
+     ~MemoryPool() { pools_.clear(); }
+
+     // Allocates the memory, from the pool if possible.
+     //
+     // Args:
+     //     size (int): Size of the memory buffer to allocate in bytes.
+     //     stream_ptr (cudaStream_t): Get the memory from the arena of the given stream.
+     // Returns:
+     //     intptr_t: Pointer address to the allocated buffer.
+     intptr_t Malloc(size_t size, cudaStream_t stream_ptr = 0) {
+         auto& mp = pools_[device_id()];
+         return mp.Malloc(size, stream_ptr);
+     }
+
+     // Frees the memory, back to the pool.
+     //
+     // Args:
+     //     ptr (intptr_t): Pointer of the memory buffer.
+     //     stream_ptr (cudaStream_t): Return the memory to the arena of the given stream.
+     void Free(intptr_t ptr, cudaStream_t stream_ptr = 0) {
+         auto& mp = pools_[device_id()];
+         mp.Free(ptr, stream_ptr);
+     }
+
+     // Free all **non-split** chunks in all arenas.
+     void FreeAllBlocks() {
+         auto& mp = pools_[device_id()];
+         return mp.FreeAllBlocks();
+     }
+
+     // Free all **non-split** chunks in the specified arena.
+     //
+     // Args:
+     //     stream_ptr (cudaStream_t): Release free blocks in the arena of the given stream.
+     void FreeAllBlocks(cudaStream_t stream_ptr) {
+         auto& mp = pools_[device_id()];
+         return mp.FreeAllBlocks(stream_ptr);
+     }
+
+     // Count the total number of free blocks.
+     //
+     // Returns:
+     //     size_t: The total number of free blocks.
+     size_t GetNumFreeBlocks() {
+         auto& mp = pools_[device_id()];
+         return mp.GetNumFreeBlocks();
+     }
+
+     // Get the total number of bytes used.
+     //
+     // Returns:
+     //     size_t: The total number of bytes used.
+     size_t GetUsedBytes() {
+         auto& mp = pools_[device_id()];
+         return mp.GetUsedBytes();
+     }
+
+     // Get the total number of bytes acquired but not used by the pool.
+     //
+     // Returns:
+     //     size_t: The total number of bytes acquired but not used by the pool.
+     size_t GetFreeBytes() {
+         auto& mp = pools_[device_id()];
+         return mp.GetFreeBytes();
+     }
+
+     // Get the total number of bytes acquired by the pool.
+     //
+     // Returns:
+     //     size_t: The total number of bytes acquired by the pool.
+     size_t GetTotalBytes() {
+         auto& mp = pools_[device_id()];
+         return mp.GetTotalBytes();
+     }
+ };
+
+ } // namespace internal
+ } // namespace cumo
+
+ #endif /* ifndef CUMO_CUDA_MEMORY_POOL_IMPL_H */
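
As a usage note, a hypothetical caller of the public MemoryPool interface above (assuming a CUDA device is present; error handling elided) might look like this; Malloc and Free dispatch to the SingleDeviceMemoryPool of the current device:

#include <cstdio>
#include "memory_pool_impl.hpp"

int main() {
    cumo::internal::MemoryPool pool;  // one pool instance serves all devices

    // Requests are rounded up to multiples of kRoundSize (512 bytes),
    // so 100 bytes acquires a 512-byte block.
    intptr_t p = pool.Malloc(100);
    std::printf("used=%zu free=%zu\n", pool.GetUsedBytes(), pool.GetFreeBytes());

    pool.Free(p);  // the block is cached in the current device's pool
    std::printf("cached=%zu bytes in %zu blocks\n",
                pool.GetFreeBytes(), pool.GetNumFreeBlocks());

    pool.FreeAllBlocks();  // actually release cached blocks back to CUDA
    return 0;
}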