cumo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +27 -0
  3. data/.travis.yml +5 -0
  4. data/3rd_party/mkmf-cu/.gitignore +36 -0
  5. data/3rd_party/mkmf-cu/Gemfile +3 -0
  6. data/3rd_party/mkmf-cu/LICENSE +21 -0
  7. data/3rd_party/mkmf-cu/README.md +36 -0
  8. data/3rd_party/mkmf-cu/Rakefile +11 -0
  9. data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
  10. data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
  11. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
  12. data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
  13. data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
  14. data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
  15. data/CODE_OF_CONDUCT.md +46 -0
  16. data/Gemfile +8 -0
  17. data/LICENSE.txt +82 -0
  18. data/README.md +252 -0
  19. data/Rakefile +43 -0
  20. data/bench/broadcast_fp32.rb +138 -0
  21. data/bench/cumo_bench.rb +193 -0
  22. data/bench/numo_bench.rb +138 -0
  23. data/bench/reduction_fp32.rb +117 -0
  24. data/bin/console +14 -0
  25. data/bin/setup +8 -0
  26. data/cumo.gemspec +32 -0
  27. data/ext/cumo/cuda/cublas.c +278 -0
  28. data/ext/cumo/cuda/driver.c +421 -0
  29. data/ext/cumo/cuda/memory_pool.cpp +185 -0
  30. data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
  31. data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
  32. data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
  33. data/ext/cumo/cuda/nvrtc.c +207 -0
  34. data/ext/cumo/cuda/runtime.c +167 -0
  35. data/ext/cumo/cumo.c +148 -0
  36. data/ext/cumo/depend.erb +58 -0
  37. data/ext/cumo/extconf.rb +179 -0
  38. data/ext/cumo/include/cumo.h +25 -0
  39. data/ext/cumo/include/cumo/compat.h +23 -0
  40. data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
  41. data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
  42. data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
  43. data/ext/cumo/include/cumo/cuda/driver.h +22 -0
  44. data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
  45. data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
  46. data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
  47. data/ext/cumo/include/cumo/indexer.h +238 -0
  48. data/ext/cumo/include/cumo/intern.h +142 -0
  49. data/ext/cumo/include/cumo/intern_fwd.h +38 -0
  50. data/ext/cumo/include/cumo/intern_kernel.h +6 -0
  51. data/ext/cumo/include/cumo/narray.h +429 -0
  52. data/ext/cumo/include/cumo/narray_kernel.h +149 -0
  53. data/ext/cumo/include/cumo/ndloop.h +95 -0
  54. data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
  55. data/ext/cumo/include/cumo/template.h +158 -0
  56. data/ext/cumo/include/cumo/template_kernel.h +77 -0
  57. data/ext/cumo/include/cumo/types/bit.h +40 -0
  58. data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
  59. data/ext/cumo/include/cumo/types/complex.h +402 -0
  60. data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
  61. data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
  62. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
  63. data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
  64. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
  65. data/ext/cumo/include/cumo/types/dfloat.h +47 -0
  66. data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
  67. data/ext/cumo/include/cumo/types/float_def.h +34 -0
  68. data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
  69. data/ext/cumo/include/cumo/types/float_macro.h +191 -0
  70. data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
  71. data/ext/cumo/include/cumo/types/int16.h +24 -0
  72. data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
  73. data/ext/cumo/include/cumo/types/int32.h +24 -0
  74. data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
  75. data/ext/cumo/include/cumo/types/int64.h +24 -0
  76. data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
  77. data/ext/cumo/include/cumo/types/int8.h +24 -0
  78. data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
  79. data/ext/cumo/include/cumo/types/int_macro.h +67 -0
  80. data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
  81. data/ext/cumo/include/cumo/types/real_accum.h +486 -0
  82. data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
  83. data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
  84. data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
  85. data/ext/cumo/include/cumo/types/robject.h +27 -0
  86. data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
  87. data/ext/cumo/include/cumo/types/scomplex.h +46 -0
  88. data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
  89. data/ext/cumo/include/cumo/types/sfloat.h +48 -0
  90. data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
  91. data/ext/cumo/include/cumo/types/uint16.h +25 -0
  92. data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
  93. data/ext/cumo/include/cumo/types/uint32.h +25 -0
  94. data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
  95. data/ext/cumo/include/cumo/types/uint64.h +25 -0
  96. data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
  97. data/ext/cumo/include/cumo/types/uint8.h +25 -0
  98. data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
  99. data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
  100. data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
  101. data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
  102. data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
  103. data/ext/cumo/narray/SFMT-params.h +97 -0
  104. data/ext/cumo/narray/SFMT-params19937.h +46 -0
  105. data/ext/cumo/narray/SFMT.c +620 -0
  106. data/ext/cumo/narray/SFMT.h +167 -0
  107. data/ext/cumo/narray/array.c +638 -0
  108. data/ext/cumo/narray/data.c +961 -0
  109. data/ext/cumo/narray/gen/cogen.rb +56 -0
  110. data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
  111. data/ext/cumo/narray/gen/def/bit.rb +37 -0
  112. data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
  113. data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
  114. data/ext/cumo/narray/gen/def/int16.rb +36 -0
  115. data/ext/cumo/narray/gen/def/int32.rb +36 -0
  116. data/ext/cumo/narray/gen/def/int64.rb +36 -0
  117. data/ext/cumo/narray/gen/def/int8.rb +36 -0
  118. data/ext/cumo/narray/gen/def/robject.rb +37 -0
  119. data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
  120. data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
  121. data/ext/cumo/narray/gen/def/uint16.rb +36 -0
  122. data/ext/cumo/narray/gen/def/uint32.rb +36 -0
  123. data/ext/cumo/narray/gen/def/uint64.rb +36 -0
  124. data/ext/cumo/narray/gen/def/uint8.rb +36 -0
  125. data/ext/cumo/narray/gen/erbpp2.rb +346 -0
  126. data/ext/cumo/narray/gen/narray_def.rb +268 -0
  127. data/ext/cumo/narray/gen/spec.rb +425 -0
  128. data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
  129. data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
  130. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
  131. data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
  132. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
  133. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
  134. data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
  135. data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
  136. data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
  137. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
  138. data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
  139. data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
  140. data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
  141. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
  142. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
  143. data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
  144. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
  145. data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
  146. data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
  147. data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
  148. data/ext/cumo/narray/gen/tmpl/class.c +9 -0
  149. data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
  150. data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
  151. data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
  152. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
  153. data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
  154. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
  155. data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
  156. data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
  157. data/ext/cumo/narray/gen/tmpl/each.c +47 -0
  158. data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
  159. data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
  160. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
  161. data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
  162. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
  163. data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
  164. data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
  165. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
  166. data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
  167. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
  168. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
  169. data/ext/cumo/narray/gen/tmpl/format.c +62 -0
  170. data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
  171. data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
  172. data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
  173. data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
  174. data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
  175. data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
  176. data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
  177. data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
  178. data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
  179. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
  180. data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
  181. data/ext/cumo/narray/gen/tmpl/median.c +66 -0
  182. data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
  183. data/ext/cumo/narray/gen/tmpl/module.c +9 -0
  184. data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
  185. data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
  186. data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
  187. data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
  188. data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
  189. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
  190. data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
  191. data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
  192. data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
  193. data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
  194. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
  195. data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
  196. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
  197. data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
  198. data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
  199. data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
  200. data/ext/cumo/narray/gen/tmpl/store.c +41 -0
  201. data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
  202. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
  203. data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
  204. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
  205. data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
  206. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
  207. data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
  208. data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
  209. data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
  210. data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
  211. data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
  212. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
  213. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
  214. data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
  215. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
  216. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
  217. data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
  218. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
  219. data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
  220. data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
  221. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
  222. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
  223. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
  224. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
  225. data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
  226. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
  227. data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
  228. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
  229. data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
  230. data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
  231. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
  232. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
  233. data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
  234. data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
  235. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
  236. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
  237. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
  238. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
  239. data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
  240. data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
  241. data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
  242. data/ext/cumo/narray/index.c +880 -0
  243. data/ext/cumo/narray/kwargs.c +153 -0
  244. data/ext/cumo/narray/math.c +142 -0
  245. data/ext/cumo/narray/narray.c +1948 -0
  246. data/ext/cumo/narray/ndloop.c +2105 -0
  247. data/ext/cumo/narray/rand.c +45 -0
  248. data/ext/cumo/narray/step.c +474 -0
  249. data/ext/cumo/narray/struct.c +886 -0
  250. data/lib/cumo.rb +3 -0
  251. data/lib/cumo/cuda.rb +11 -0
  252. data/lib/cumo/cuda/compile_error.rb +36 -0
  253. data/lib/cumo/cuda/compiler.rb +161 -0
  254. data/lib/cumo/cuda/device.rb +47 -0
  255. data/lib/cumo/cuda/link_state.rb +31 -0
  256. data/lib/cumo/cuda/module.rb +40 -0
  257. data/lib/cumo/cuda/nvrtc_program.rb +27 -0
  258. data/lib/cumo/linalg.rb +12 -0
  259. data/lib/cumo/narray.rb +2 -0
  260. data/lib/cumo/narray/extra.rb +1278 -0
  261. data/lib/erbpp.rb +294 -0
  262. data/lib/erbpp/line_number.rb +137 -0
  263. data/lib/erbpp/narray_def.rb +381 -0
  264. data/numo-narray-version +1 -0
  265. data/run.gdb +7 -0
  266. metadata +353 -0
@@ -0,0 +1,421 @@
1
+ #include <ruby.h>
2
+ #include <ruby/thread.h>
3
+ #include <cuda.h>
4
+ #include <cuda_runtime.h>
5
+ #include "cumo/cuda/driver.h"
6
+
7
+ VALUE cumo_cuda_eDriverError;
8
+ VALUE cumo_cuda_mDriver;
9
+ #define eDriverError cumo_cuda_eDriverError
10
+ #define mDriver cumo_cuda_mDriver
11
+
12
+ static void
13
+ check_status(CUresult status)
14
+ {
15
+ if (status != 0) {
16
+ const char *errname = NULL;
17
+ const char *errstring = NULL;
18
+ cuGetErrorName(status, &errname);
19
+ cuGetErrorString(status, &errstring);
20
+ rb_raise(cumo_cuda_eDriverError, "%s %s (error=%d)", errname, errstring, status);
21
+ }
22
+ }
23
+
24
+ ///////////////////////////////////////////////
25
+ // Context Management
26
+ //////////////////////////////////////////////
27
+
28
+ static VALUE
29
+ rb_cuCtxCreate(VALUE self, VALUE flags, VALUE dev)
30
+ {
31
+ unsigned int _flags = NUM2INT(flags);
32
+ CUdevice _dev = (CUdevice)NUM2INT(dev);
33
+ CUcontext _pctx;
34
+ CUresult status;
35
+
36
+ status = cuCtxCreate(&_pctx, _flags, _dev);
37
+
38
+ check_status(status);
39
+ return SIZET2NUM((size_t)_pctx);
40
+ }
41
+
42
+ static VALUE
43
+ rb_cuCtxGetCurrent(VALUE self)
44
+ {
45
+ CUcontext ctx;
46
+ CUresult status;
47
+
48
+ status = cuCtxGetCurrent(&ctx);
49
+ check_status(status);
50
+
51
+ return SIZET2NUM((size_t)ctx);
52
+ }
53
+
54
+ ///////////////////////////////////////////////
55
+ // Device Management
56
+ //////////////////////////////////////////////
57
+
58
+ static VALUE
59
+ rb_cuDeviceGet(VALUE self, VALUE ordinal)
60
+ {
61
+ int _ordinal = NUM2INT(ordinal);
62
+ CUdevice _device;
63
+ CUresult status;
64
+
65
+ status = cuDeviceGet(&_device, _ordinal);
66
+
67
+ check_status(status);
68
+ return INT2NUM(_device);
69
+ }
70
+
71
+ ///////////////////////////////////////////////
72
+ // Module Load and Kernel Execution
73
+ //////////////////////////////////////////////
74
+
75
+ struct cuLinkAddDataParam {
76
+ CUlinkState state;
77
+ CUjitInputType type;
78
+ void* data;
79
+ size_t size;
80
+ const char* name;
81
+ unsigned int numOptions;
82
+ CUjit_option* options;
83
+ void ** optionValues;
84
+ };
85
+
86
+ static void *
87
+ cuLinkAddData_without_gvl_cb(void *param)
88
+ {
89
+ struct cuLinkAddDataParam *p = param;
90
+ CUresult status;
91
+ status = cuLinkAddData(p->state, p->type, p->data, p->size, p->name, p->numOptions, p->options, p->optionValues);
92
+ return (void *)status;
93
+ }
94
+
95
+ // TODO(sonots): Support options.
96
+ static VALUE
97
+ rb_cuLinkAddData(VALUE self, VALUE state, VALUE type, VALUE data, VALUE name)
98
+ {
99
+ CUlinkState _state = (CUlinkState)NUM2SIZET(state);
100
+ CUjitInputType _type = (CUjitInputType)NUM2INT(type);
101
+ void* _data = (void *)RSTRING_PTR(data);
102
+ size_t _size = RSTRING_LEN(data);
103
+ const char* _name = RSTRING_PTR(data);
104
+ CUresult status;
105
+
106
+ struct cuLinkAddDataParam param = {_state, _type, _data, _size, _name, 0, (CUjit_option*)0, (void**)0};
107
+ status = (CUresult)rb_thread_call_without_gvl(cuLinkAddData_without_gvl_cb, &param, NULL, NULL);
108
+ //status = cuLinkAddData(_state, _type, _data, _size, _name, 0, (CUjit_option*)0, (void**)0);
109
+
110
+ check_status(status);
111
+ return Qnil;
112
+ }
113
+
114
+ struct cuLinkAddFileParam {
115
+ CUlinkState state;
116
+ CUjitInputType type;
117
+ const char* path;
118
+ unsigned int numOptions;
119
+ CUjit_option* options;
120
+ void ** optionValues;
121
+ };
122
+
123
+ static void *
124
+ cuLinkAddFile_without_gvl_cb(void *param)
125
+ {
126
+ struct cuLinkAddFileParam *p = param;
127
+ CUresult status;
128
+ status = cuLinkAddFile(p->state, p->type, p->path, p->numOptions, p->options, p->optionValues);
129
+ return (void *)status;
130
+ }
131
+
132
+ // TODO(sonots): Support options.
133
+ static VALUE
134
+ rb_cuLinkAddFile(VALUE self, VALUE state, VALUE type, VALUE path)
135
+ {
136
+ CUlinkState _state = (CUlinkState)NUM2SIZET(state);
137
+ CUjitInputType _type = (CUjitInputType)NUM2INT(type);
138
+ const char* _path = RSTRING_PTR(path);
139
+ CUresult status;
140
+
141
+ struct cuLinkAddFileParam param = {_state, _type, _path, 0, (CUjit_option*)0, (void **)0};
142
+ status = (CUresult)rb_thread_call_without_gvl(cuLinkAddFile_without_gvl_cb, &param, NULL, NULL);
143
+ //status = cuLinkAddFile(_state, _type, _path, 0, (CUjit_option*)0, (void **)0);
144
+
145
+ check_status(status);
146
+ return Qnil;
147
+ }
148
+
149
+ struct cuLinkCompleteParam {
150
+ CUlinkState state;
151
+ void** cubinOut;
152
+ size_t* sizeOut;
153
+ };
154
+
155
+ static void *
156
+ cuLinkComplete_without_gvl_cb(void *param)
157
+ {
158
+ struct cuLinkCompleteParam *p = param;
159
+ CUresult status;
160
+ status = cuLinkComplete(p->state, p->cubinOut, p->sizeOut);
161
+ return (void *)status;
162
+ }
163
+
164
+ static VALUE
165
+ rb_cuLinkComplete(VALUE self, VALUE state)
166
+ {
167
+ CUlinkState _state = (CUlinkState)NUM2SIZET(state);
168
+ void* _cubinOut;
169
+ size_t _sizeOut;
170
+ CUresult status;
171
+
172
+ struct cuLinkCompleteParam param = {_state, &_cubinOut, &_sizeOut};
173
+ status = (CUresult)rb_thread_call_without_gvl(cuLinkComplete_without_gvl_cb, &param, NULL, NULL);
174
+ //status = cuLinkComplete(_state, &_cubinOut, &_sizeOut);
175
+
176
+ check_status(status);
177
+ return rb_str_new((char *)_cubinOut, _sizeOut);
178
+ }
179
+
180
+ struct cuLinkCreateParam {
181
+ unsigned int numOptions;
182
+ CUjit_option* options;
183
+ void** optionValues;
184
+ CUlinkState* state;
185
+ };
186
+
187
+ static void *
188
+ cuLinkCreate_without_gvl_cb(void *param)
189
+ {
190
+ struct cuLinkCreateParam *p = param;
191
+ CUresult status;
192
+ status = cuLinkCreate(p->numOptions, p->options, p->optionValues, p->state);
193
+ return (void *)status;
194
+ }
195
+
196
+ // TODO(sonots): Support options.
197
+ static VALUE
198
+ rb_cuLinkCreate(VALUE self)
199
+ {
200
+ CUlinkState state;
201
+ CUresult status;
202
+
203
+ struct cuLinkCreateParam param = {0, (CUjit_option*)0, (void**)0, &state};
204
+ status = (CUresult)rb_thread_call_without_gvl(cuLinkCreate_without_gvl_cb, &param, NULL, NULL);
205
+ //status = cuLinkCreate(0, (CUjit_option*)0, (void**)0, &state);
206
+
207
+ check_status(status);
208
+ return SIZET2NUM((size_t)state);
209
+ }
210
+
211
+ struct cuLinkDestroyParam {
212
+ CUlinkState state;
213
+ };
214
+
215
+ static void *
216
+ cuLinkDestroy_without_gvl_cb(void *param)
217
+ {
218
+ struct cuLinkDestroyParam *p = param;
219
+ CUresult status;
220
+ status = cuLinkDestroy(p->state);
221
+ return (void *)status;
222
+ }
223
+
224
+ static VALUE
225
+ rb_cuLinkDestroy(VALUE self, VALUE state)
226
+ {
227
+ CUlinkState _state = (CUlinkState)NUM2SIZET(state);
228
+ CUresult status;
229
+
230
+ struct cuLinkDestroyParam param = {_state};
231
+ status = (CUresult)rb_thread_call_without_gvl(cuLinkDestroy_without_gvl_cb, &param, NULL, NULL);
232
+ //status = cuLinkDestroy(_state);
233
+
234
+ check_status(status);
235
+ return Qnil;
236
+ }
237
+
238
+ struct cuModuleGetFunctionParam {
239
+ CUfunction* hfunc;
240
+ CUmodule hmod;
241
+ const char* name;
242
+ };
243
+
244
+ static void *
245
+ cuModuleGetFunction_without_gvl_cb(void *param)
246
+ {
247
+ struct cuModuleGetFunctionParam *p = param;
248
+ CUresult status;
249
+ status = cuModuleGetFunction(p->hfunc, p->hmod, p->name);
250
+ return (void *)status;
251
+ }
252
+
253
+ static VALUE
254
+ rb_cuModuleGetFunction(VALUE self, VALUE hmod, VALUE name)
255
+ {
256
+ CUfunction _hfunc;
257
+ CUmodule _hmod = (CUmodule)NUM2SIZET(hmod);
258
+ const char* _name = RSTRING_PTR(name);
259
+ CUresult status;
260
+
261
+ struct cuModuleGetFunctionParam param = {&_hfunc, _hmod, _name};
262
+ status = (CUresult)rb_thread_call_without_gvl(cuModuleGetFunction_without_gvl_cb, &param, NULL, NULL);
263
+ //status = cuModuleGetFunction(&_hfunc, _hmod, _name);
264
+
265
+ check_status(status);
266
+ return SIZET2NUM((size_t)_hfunc);
267
+ }
268
+
269
+ struct cuModuleGetGlobalParam {
270
+ CUdeviceptr* dptr;
271
+ size_t* bytes;
272
+ CUmodule hmod;
273
+ const char* name;
274
+ };
275
+
276
+ static void *
277
+ cuModuleGetGlobal_without_gvl_cb(void *param)
278
+ {
279
+ struct cuModuleGetGlobalParam *p = param;
280
+ CUresult status;
281
+ status = cuModuleGetGlobal(p->dptr, p->bytes, p->hmod, p->name);
282
+ return (void *)status;
283
+ }
284
+
285
+ static VALUE
286
+ rb_cuModuleGetGlobal(VALUE self, VALUE hmod, VALUE name)
287
+ {
288
+ CUdeviceptr _dptr;
289
+ size_t _bytes;
290
+ CUmodule _hmod = (CUmodule)NUM2SIZET(hmod);
291
+ const char* _name = RSTRING_PTR(name);
292
+ CUresult status;
293
+
294
+ struct cuModuleGetGlobalParam param = {&_dptr, &_bytes, _hmod, _name};
295
+ status = (CUresult)rb_thread_call_without_gvl(cuModuleGetGlobal_without_gvl_cb, &param, NULL, NULL);
296
+ //status = cuModuleGetGlobal(&_dptr, &_bytes, _hmod, _name);
297
+
298
+ check_status(status);
299
+ return rb_str_new((char *)_dptr, _bytes);
300
+ }
301
+
302
+ struct cuModuleLoadParam {
303
+ CUmodule* module;
304
+ const char* fname;
305
+ };
306
+
307
+ static void *
308
+ cuModuleLoad_without_gvl_cb(void *param)
309
+ {
310
+ struct cuModuleLoadParam *p = param;
311
+ CUresult status;
312
+ status = cuModuleLoad(p->module, p->fname);
313
+ return (void *)status;
314
+ }
315
+
316
+ static VALUE
317
+ rb_cuModuleLoad(VALUE self, VALUE fname)
318
+ {
319
+ CUmodule _module;
320
+ const char* _fname = RSTRING_PTR(fname);
321
+ CUresult status;
322
+
323
+ struct cuModuleLoadParam param = {&_module, _fname};
324
+ status = (CUresult)rb_thread_call_without_gvl(cuModuleLoad_without_gvl_cb, &param, NULL, NULL);
325
+ //status = cuModuleLoad(&_module, _fname);
326
+
327
+ check_status(status);
328
+ return SIZET2NUM((size_t)_module);
329
+ }
330
+
331
+ struct cuModuleLoadDataParam {
332
+ CUmodule* module;
333
+ const void* image;
334
+ };
335
+
336
+ static void *
337
+ cuModuleLoadData_without_gvl_cb(void *param)
338
+ {
339
+ struct cuModuleLoadDataParam *p = param;
340
+ CUresult status;
341
+ status = cuModuleLoadData(p->module, p->image);
342
+ return (void *)status;
343
+ }
344
+
345
+ static VALUE
346
+ rb_cuModuleLoadData(VALUE self, VALUE image)
347
+ {
348
+ CUmodule _module;
349
+ const void* _image = (void*)RSTRING_PTR(image);
350
+ CUresult status;
351
+
352
+ struct cuModuleLoadDataParam param = {&_module, _image};
353
+ status = (CUresult)rb_thread_call_without_gvl(cuModuleLoadData_without_gvl_cb, &param, NULL, NULL);
354
+ //status = cuModuleLoadData(&_module, _image);
355
+
356
+ check_status(status);
357
+ return SIZET2NUM((size_t)_module);
358
+ }
359
+
360
+ struct cuModuleUnloadParam {
361
+ CUmodule hmod;
362
+ };
363
+
364
+ static void *
365
+ cuModuleUnload_without_gvl_cb(void *param)
366
+ {
367
+ struct cuModuleUnloadParam *p = param;
368
+ CUresult status;
369
+ status = cuModuleUnload(p->hmod);
370
+ return (void *)status;
371
+ }
372
+
373
+ static VALUE
374
+ rb_cuModuleUnload(VALUE self, VALUE hmod)
375
+ {
376
+ CUmodule _hmod = (CUmodule)NUM2SIZET(hmod);
377
+ CUresult status;
378
+
379
+ struct cuModuleUnloadParam param = {_hmod};
380
+ status = (CUresult)rb_thread_call_without_gvl(cuModuleUnload_without_gvl_cb, &param, NULL, NULL);
381
+ //status = cuModuleUnload(_hmod);
382
+
383
+ check_status(status);
384
+ return Qnil;
385
+ }
386
+
387
+ void
388
+ Init_cumo_cuda_driver()
389
+ {
390
+ VALUE mCumo = rb_define_module("Cumo");
391
+ VALUE mCUDA = rb_define_module_under(mCumo, "CUDA");
392
+ mDriver = rb_define_module_under(mCUDA, "Driver");
393
+ eDriverError = rb_define_class_under(mCUDA, "DriverError", rb_eStandardError);
394
+
395
+ rb_define_singleton_method(mDriver, "cuCtxGetCurrent", rb_cuCtxGetCurrent, 0);
396
+ rb_define_singleton_method(mDriver, "cuLinkAddData", rb_cuLinkAddData, 4);
397
+ rb_define_singleton_method(mDriver, "cuLinkAddFile", rb_cuLinkAddFile, 3);
398
+ rb_define_singleton_method(mDriver, "cuLinkComplete", rb_cuLinkComplete, 1);
399
+ rb_define_singleton_method(mDriver, "cuLinkCreate", rb_cuLinkCreate, 0);
400
+ rb_define_singleton_method(mDriver, "cuLinkDestroy", rb_cuLinkDestroy, 1);
401
+ rb_define_singleton_method(mDriver, "cuModuleGetFunction", rb_cuModuleGetFunction, 2);
402
+ rb_define_singleton_method(mDriver, "cuModuleGetGlobal", rb_cuModuleGetGlobal, 2);
403
+ rb_define_singleton_method(mDriver, "cuModuleLoad", rb_cuModuleLoad, 1);
404
+ rb_define_singleton_method(mDriver, "cuModuleLoadData", rb_cuModuleLoadData, 1);
405
+ rb_define_singleton_method(mDriver, "cuModuleUnload", rb_cuModuleUnload, 1);
406
+
407
+ rb_define_singleton_method(mDriver, "cuDeviceGet", rb_cuDeviceGet, 1);
408
+ rb_define_singleton_method(mDriver, "cuCtxCreate", rb_cuCtxCreate, 2);
409
+
410
+ rb_define_const(mDriver, "CU_JIT_INPUT_CUBIN", INT2NUM(CU_JIT_INPUT_CUBIN));
411
+ rb_define_const(mDriver, "CU_JIT_INPUT_FATBINARY", INT2NUM(CU_JIT_INPUT_FATBINARY));
412
+ rb_define_const(mDriver, "CU_JIT_INPUT_LIBRARY", INT2NUM(CU_JIT_INPUT_LIBRARY));
413
+ rb_define_const(mDriver, "CU_JIT_INPUT_OBJECT", INT2NUM(CU_JIT_INPUT_OBJECT));
414
+ rb_define_const(mDriver, "CU_JIT_INPUT_PTX", INT2NUM(CU_JIT_INPUT_PTX));
415
+
416
+ CUdevice cuDevice;
417
+ CUcontext context;
418
+ cuInit(0);
419
+ cuDeviceGet(&cuDevice, 0);
420
+ cuCtxCreate(&context, 0, cuDevice);
421
+ }
@@ -0,0 +1,185 @@
1
+ #include <ruby.h>
2
+ #include <cuda_runtime.h>
3
+ #include "memory_pool_impl.hpp"
4
+ #include "cumo/cuda/memory_pool.h"
5
+ #include "cumo/cuda/runtime.h"
6
+
7
+ #include <cstdlib>
8
+ #include <string>
9
+
10
+ #if defined(__cplusplus)
11
+ extern "C" {
12
+ #if 0
13
+ } /* satisfy cc-mode */
14
+ #endif
15
+ #endif
16
+
17
+ static cumo::internal::MemoryPool pool{};
18
+ static bool memory_pool_enabled;
19
+
20
+ VALUE cumo_cuda_eOutOfMemoryError;
21
+
22
+ char*
23
+ cumo_cuda_runtime_malloc(size_t size)
24
+ {
25
+ if (memory_pool_enabled) {
26
+ try {
27
+ // TODO(sonots): Get current CUDA stream and pass it
28
+ return reinterpret_cast<char*>(pool.Malloc(size));
29
+ } catch (const cumo::internal::CUDARuntimeError& e) {
30
+ cumo_cuda_runtime_check_status(e.status());
31
+ } catch (const cumo::internal::OutOfMemoryError& e) {
32
+ rb_raise(cumo_cuda_eOutOfMemoryError, "%s", e.what());
33
+ }
34
+ } else {
35
+ void *ptr = 0;
36
+ cumo_cuda_runtime_check_status(cudaMallocManaged(&ptr, size, cudaMemAttachGlobal));
37
+ return reinterpret_cast<char*>(ptr);
38
+ }
39
+ return 0; // should not reach here
40
+ }
41
+
42
+ void
43
+ cumo_cuda_runtime_free(char *ptr)
44
+ {
45
+ if (memory_pool_enabled) {
46
+ try {
47
+ // TODO(sonots): Get current CUDA stream and pass it
48
+ pool.Free(reinterpret_cast<intptr_t>(ptr));
49
+ } catch (const cumo::internal::CUDARuntimeError& e) {
50
+ cumo_cuda_runtime_check_status(e.status());
51
+ }
52
+ } else {
53
+ cumo_cuda_runtime_check_status(cudaFree((void*)ptr));
54
+ }
55
+ }
56
+
57
+ /*
58
+ Enable memory pool.
59
+
60
+ @return [Boolean] Returns previous state (true if enabled)
61
+ */
62
+ static VALUE
63
+ rb_memory_pool_enable(VALUE self)
64
+ {
65
+ VALUE ret = (memory_pool_enabled ? Qtrue : Qfalse);
66
+ memory_pool_enabled = true;
67
+ return ret;
68
+ }
69
+
70
+ /*
71
+ Disable memory pool.
72
+
73
+ @return [Boolean] Returns previous state (true if enabled)
74
+ */
75
+ static VALUE
76
+ rb_memory_pool_disable(VALUE self)
77
+ {
78
+ VALUE ret = (memory_pool_enabled ? Qtrue : Qfalse);
79
+ memory_pool_enabled = false;
80
+ return ret;
81
+ }
82
+
83
+ /*
84
+ Returns whether memory pool is enabled or not.
85
+
86
+ @return [Boolean] Returns the state (true if enabled)
87
+ */
88
+ static VALUE
89
+ rb_memory_pool_enabled_p(VALUE self)
90
+ {
91
+ return (memory_pool_enabled ? Qtrue : Qfalse);
92
+ }
93
+
94
+ /*
95
+ Free all **non-split** chunks in all arenas.
96
+ */
97
+ static VALUE
98
+ rb_memory_pool_free_all_blocks(int argc, VALUE* argv, VALUE self)
99
+ {
100
+ try {
101
+ if (argc < 1) {
102
+ pool.FreeAllBlocks();
103
+ } else {
104
+ // TODO(sonots): FIX if we create a Stream object
105
+ cudaStream_t stream_ptr = (cudaStream_t)NUM2SIZET(argv[0]);
106
+ pool.FreeAllBlocks(stream_ptr);
107
+ }
108
+ } catch (const cumo::internal::CUDARuntimeError& e) {
109
+ cumo_cuda_runtime_check_status(e.status());
110
+ }
111
+ return Qnil;
112
+ }
113
+
114
+ /*
115
+ Count the total number of free blocks.
116
+
117
+ @return [Integer] The total number of free blocks.
118
+ */
119
+ static VALUE
120
+ rb_memory_pool_n_free_blocks(VALUE self)
121
+ {
122
+ return SIZET2NUM(pool.GetNumFreeBlocks());
123
+ }
124
+
125
+ /*
126
+ Get the total number of bytes used.
127
+
128
+ @return [Integer] The total number of bytes used.
129
+ */
130
+ static VALUE
131
+ rb_memory_pool_used_bytes(VALUE self)
132
+ {
133
+ return SIZET2NUM(pool.GetUsedBytes());
134
+ }
135
+
136
+ /*
137
+ Get the total number of bytes acquired but not used in the pool.
138
+
139
+ @return [Integer] The total number of bytes acquired but not used in the pool.
140
+ */
141
+ static VALUE
142
+ rb_memory_pool_free_bytes(VALUE self)
143
+ {
144
+ return SIZET2NUM(pool.GetFreeBytes());
145
+ }
146
+
147
+ /*
148
+ Get the total number of bytes acquired in the pool.
149
+
150
+ @return [Integer] The total number of bytes acquired in the pool.
151
+ */
152
+ static VALUE
153
+ rb_memory_pool_total_bytes(VALUE self)
154
+ {
155
+ return SIZET2NUM(pool.GetTotalBytes());
156
+ }
157
+
158
+ void
159
+ Init_cumo_cuda_memory_pool()
160
+ {
161
+ VALUE mCumo = rb_define_module("Cumo");
162
+ VALUE mCUDA = rb_define_module_under(mCumo, "CUDA");
163
+ VALUE mMemoryPool = rb_define_module_under(mCUDA, "MemoryPool");
164
+ cumo_cuda_eOutOfMemoryError = rb_define_class_under(mCUDA, "OutOfMemoryError", rb_eStandardError);
165
+
166
+ rb_define_singleton_method(mMemoryPool, "enable", RUBY_METHOD_FUNC(rb_memory_pool_enable), 0);
167
+ rb_define_singleton_method(mMemoryPool, "disable", RUBY_METHOD_FUNC(rb_memory_pool_disable), 0);
168
+ rb_define_singleton_method(mMemoryPool, "enabled?", RUBY_METHOD_FUNC(rb_memory_pool_enabled_p), 0);
169
+ rb_define_singleton_method(mMemoryPool, "free_all_blocks", RUBY_METHOD_FUNC(rb_memory_pool_free_all_blocks), -1);
170
+ rb_define_singleton_method(mMemoryPool, "n_free_blocks", RUBY_METHOD_FUNC(rb_memory_pool_n_free_blocks), 0);
171
+ rb_define_singleton_method(mMemoryPool, "used_bytes", RUBY_METHOD_FUNC(rb_memory_pool_used_bytes), 0);
172
+ rb_define_singleton_method(mMemoryPool, "free_bytes", RUBY_METHOD_FUNC(rb_memory_pool_free_bytes), 0);
173
+ rb_define_singleton_method(mMemoryPool, "total_bytes", RUBY_METHOD_FUNC(rb_memory_pool_total_bytes), 0);
174
+
175
+ // default is true
176
+ const char* env = std::getenv("CUMO_MEMORY_POOL");
177
+ memory_pool_enabled = env == nullptr || (std::string(env) != "OFF" && std::string(env) != "0" && std::string(env) != "NO");
178
+ }
179
+
180
+ #if defined(__cplusplus)
181
+ #if 0
182
+ { /* satisfy cc-mode */
183
+ #endif
184
+ } /* extern "C" { */
185
+ #endif