cumo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +27 -0
  3. data/.travis.yml +5 -0
  4. data/3rd_party/mkmf-cu/.gitignore +36 -0
  5. data/3rd_party/mkmf-cu/Gemfile +3 -0
  6. data/3rd_party/mkmf-cu/LICENSE +21 -0
  7. data/3rd_party/mkmf-cu/README.md +36 -0
  8. data/3rd_party/mkmf-cu/Rakefile +11 -0
  9. data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
  10. data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
  11. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
  12. data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
  13. data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
  14. data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
  15. data/CODE_OF_CONDUCT.md +46 -0
  16. data/Gemfile +8 -0
  17. data/LICENSE.txt +82 -0
  18. data/README.md +252 -0
  19. data/Rakefile +43 -0
  20. data/bench/broadcast_fp32.rb +138 -0
  21. data/bench/cumo_bench.rb +193 -0
  22. data/bench/numo_bench.rb +138 -0
  23. data/bench/reduction_fp32.rb +117 -0
  24. data/bin/console +14 -0
  25. data/bin/setup +8 -0
  26. data/cumo.gemspec +32 -0
  27. data/ext/cumo/cuda/cublas.c +278 -0
  28. data/ext/cumo/cuda/driver.c +421 -0
  29. data/ext/cumo/cuda/memory_pool.cpp +185 -0
  30. data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
  31. data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
  32. data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
  33. data/ext/cumo/cuda/nvrtc.c +207 -0
  34. data/ext/cumo/cuda/runtime.c +167 -0
  35. data/ext/cumo/cumo.c +148 -0
  36. data/ext/cumo/depend.erb +58 -0
  37. data/ext/cumo/extconf.rb +179 -0
  38. data/ext/cumo/include/cumo.h +25 -0
  39. data/ext/cumo/include/cumo/compat.h +23 -0
  40. data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
  41. data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
  42. data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
  43. data/ext/cumo/include/cumo/cuda/driver.h +22 -0
  44. data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
  45. data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
  46. data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
  47. data/ext/cumo/include/cumo/indexer.h +238 -0
  48. data/ext/cumo/include/cumo/intern.h +142 -0
  49. data/ext/cumo/include/cumo/intern_fwd.h +38 -0
  50. data/ext/cumo/include/cumo/intern_kernel.h +6 -0
  51. data/ext/cumo/include/cumo/narray.h +429 -0
  52. data/ext/cumo/include/cumo/narray_kernel.h +149 -0
  53. data/ext/cumo/include/cumo/ndloop.h +95 -0
  54. data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
  55. data/ext/cumo/include/cumo/template.h +158 -0
  56. data/ext/cumo/include/cumo/template_kernel.h +77 -0
  57. data/ext/cumo/include/cumo/types/bit.h +40 -0
  58. data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
  59. data/ext/cumo/include/cumo/types/complex.h +402 -0
  60. data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
  61. data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
  62. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
  63. data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
  64. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
  65. data/ext/cumo/include/cumo/types/dfloat.h +47 -0
  66. data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
  67. data/ext/cumo/include/cumo/types/float_def.h +34 -0
  68. data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
  69. data/ext/cumo/include/cumo/types/float_macro.h +191 -0
  70. data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
  71. data/ext/cumo/include/cumo/types/int16.h +24 -0
  72. data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
  73. data/ext/cumo/include/cumo/types/int32.h +24 -0
  74. data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
  75. data/ext/cumo/include/cumo/types/int64.h +24 -0
  76. data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
  77. data/ext/cumo/include/cumo/types/int8.h +24 -0
  78. data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
  79. data/ext/cumo/include/cumo/types/int_macro.h +67 -0
  80. data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
  81. data/ext/cumo/include/cumo/types/real_accum.h +486 -0
  82. data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
  83. data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
  84. data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
  85. data/ext/cumo/include/cumo/types/robject.h +27 -0
  86. data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
  87. data/ext/cumo/include/cumo/types/scomplex.h +46 -0
  88. data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
  89. data/ext/cumo/include/cumo/types/sfloat.h +48 -0
  90. data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
  91. data/ext/cumo/include/cumo/types/uint16.h +25 -0
  92. data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
  93. data/ext/cumo/include/cumo/types/uint32.h +25 -0
  94. data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
  95. data/ext/cumo/include/cumo/types/uint64.h +25 -0
  96. data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
  97. data/ext/cumo/include/cumo/types/uint8.h +25 -0
  98. data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
  99. data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
  100. data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
  101. data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
  102. data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
  103. data/ext/cumo/narray/SFMT-params.h +97 -0
  104. data/ext/cumo/narray/SFMT-params19937.h +46 -0
  105. data/ext/cumo/narray/SFMT.c +620 -0
  106. data/ext/cumo/narray/SFMT.h +167 -0
  107. data/ext/cumo/narray/array.c +638 -0
  108. data/ext/cumo/narray/data.c +961 -0
  109. data/ext/cumo/narray/gen/cogen.rb +56 -0
  110. data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
  111. data/ext/cumo/narray/gen/def/bit.rb +37 -0
  112. data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
  113. data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
  114. data/ext/cumo/narray/gen/def/int16.rb +36 -0
  115. data/ext/cumo/narray/gen/def/int32.rb +36 -0
  116. data/ext/cumo/narray/gen/def/int64.rb +36 -0
  117. data/ext/cumo/narray/gen/def/int8.rb +36 -0
  118. data/ext/cumo/narray/gen/def/robject.rb +37 -0
  119. data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
  120. data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
  121. data/ext/cumo/narray/gen/def/uint16.rb +36 -0
  122. data/ext/cumo/narray/gen/def/uint32.rb +36 -0
  123. data/ext/cumo/narray/gen/def/uint64.rb +36 -0
  124. data/ext/cumo/narray/gen/def/uint8.rb +36 -0
  125. data/ext/cumo/narray/gen/erbpp2.rb +346 -0
  126. data/ext/cumo/narray/gen/narray_def.rb +268 -0
  127. data/ext/cumo/narray/gen/spec.rb +425 -0
  128. data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
  129. data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
  130. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
  131. data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
  132. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
  133. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
  134. data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
  135. data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
  136. data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
  137. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
  138. data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
  139. data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
  140. data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
  141. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
  142. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
  143. data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
  144. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
  145. data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
  146. data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
  147. data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
  148. data/ext/cumo/narray/gen/tmpl/class.c +9 -0
  149. data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
  150. data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
  151. data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
  152. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
  153. data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
  154. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
  155. data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
  156. data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
  157. data/ext/cumo/narray/gen/tmpl/each.c +47 -0
  158. data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
  159. data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
  160. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
  161. data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
  162. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
  163. data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
  164. data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
  165. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
  166. data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
  167. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
  168. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
  169. data/ext/cumo/narray/gen/tmpl/format.c +62 -0
  170. data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
  171. data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
  172. data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
  173. data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
  174. data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
  175. data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
  176. data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
  177. data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
  178. data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
  179. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
  180. data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
  181. data/ext/cumo/narray/gen/tmpl/median.c +66 -0
  182. data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
  183. data/ext/cumo/narray/gen/tmpl/module.c +9 -0
  184. data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
  185. data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
  186. data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
  187. data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
  188. data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
  189. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
  190. data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
  191. data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
  192. data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
  193. data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
  194. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
  195. data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
  196. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
  197. data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
  198. data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
  199. data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
  200. data/ext/cumo/narray/gen/tmpl/store.c +41 -0
  201. data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
  202. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
  203. data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
  204. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
  205. data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
  206. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
  207. data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
  208. data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
  209. data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
  210. data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
  211. data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
  212. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
  213. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
  214. data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
  215. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
  216. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
  217. data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
  218. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
  219. data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
  220. data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
  221. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
  222. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
  223. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
  224. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
  225. data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
  226. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
  227. data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
  228. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
  229. data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
  230. data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
  231. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
  232. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
  233. data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
  234. data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
  235. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
  236. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
  237. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
  238. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
  239. data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
  240. data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
  241. data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
  242. data/ext/cumo/narray/index.c +880 -0
  243. data/ext/cumo/narray/kwargs.c +153 -0
  244. data/ext/cumo/narray/math.c +142 -0
  245. data/ext/cumo/narray/narray.c +1948 -0
  246. data/ext/cumo/narray/ndloop.c +2105 -0
  247. data/ext/cumo/narray/rand.c +45 -0
  248. data/ext/cumo/narray/step.c +474 -0
  249. data/ext/cumo/narray/struct.c +886 -0
  250. data/lib/cumo.rb +3 -0
  251. data/lib/cumo/cuda.rb +11 -0
  252. data/lib/cumo/cuda/compile_error.rb +36 -0
  253. data/lib/cumo/cuda/compiler.rb +161 -0
  254. data/lib/cumo/cuda/device.rb +47 -0
  255. data/lib/cumo/cuda/link_state.rb +31 -0
  256. data/lib/cumo/cuda/module.rb +40 -0
  257. data/lib/cumo/cuda/nvrtc_program.rb +27 -0
  258. data/lib/cumo/linalg.rb +12 -0
  259. data/lib/cumo/narray.rb +2 -0
  260. data/lib/cumo/narray/extra.rb +1278 -0
  261. data/lib/erbpp.rb +294 -0
  262. data/lib/erbpp/line_number.rb +137 -0
  263. data/lib/erbpp/narray_def.rb +381 -0
  264. data/numo-narray-version +1 -0
  265. data/run.gdb +7 -0
  266. metadata +353 -0
@@ -0,0 +1,79 @@
1
+ #ifndef CUMO_CUDA_THRUST_COMPLEX_H
2
+ #define CUMO_CUDA_THRUST_COMPLEX_H
3
+
4
+ #include "cumo/types/complex_kernel.h"
5
+ #include "cumo/cuda/cumo_thrust.hpp"
6
+
7
+ // ref. https://github.com/thrust/thrust/blob/master/examples/summary_statistics.cu
8
+
9
+ // structure used to accumulate the moments and other
10
+ // statistical properties encountered so far.
11
+ template <typename T, typename R>
12
+ struct cumo_thrust_complex_variance_data
13
+ {
14
+ R n; // number of samples accumulated (set to 1 by the unary op, summed by the binary op)
15
+ T mean; // running mean of the samples
16
+ R M2; // accumulated squared-deviation term used by variance() / variance_n()
17
+
18
+ // initialize to the identity element (n = M2 = 0, mean = c_zero())
19
+ void initialize() // NOTE(review): not marked __host__ __device__, unlike the accessors below
20
+ {
21
+ n = M2 = 0;
22
+ mean = c_zero();
23
+ }
24
+
25
+ __host__ __device__ R variance() { return M2 / (n - 1); } // divides by n - 1; the divisor is zero when n == 1
26
+ __host__ __device__ R variance_n() { return M2 / n; } // divides by n
27
+ };
28
+
29
+ // cumo_thrust_complex_variance_unary_op is a functor that takes in a value x and
30
+ // returns a cumo_thrust_complex_variance_data whose mean value is initialized to x.
31
+ template <typename T, typename R>
32
+ struct cumo_thrust_complex_variance_unary_op
33
+ {
34
+ __host__ __device__
35
+ cumo_thrust_complex_variance_data<T,R> operator()(const T& x) const
36
+ {
37
+ cumo_thrust_complex_variance_data<T,R> result;
38
+ result.n = 1; // a single sample
39
+ result.mean = x; // the mean of one sample is the sample itself
40
+ result.M2 = 0; // no deviation accumulated yet
41
+
42
+ return result;
43
+ }
44
+ };
45
+
46
+ // cumo_thrust_complex_variance_binary_op is a functor that accepts two
47
+ // cumo_thrust_complex_variance_data structs and returns a new
48
+ // cumo_thrust_complex_variance_data which is an approximation to the
49
+ // variance statistics for all values that have been aggregated so far
50
+ template <typename T, typename R>
51
+ struct cumo_thrust_complex_variance_binary_op
52
+ : public thrust::binary_function<const cumo_thrust_complex_variance_data<T,R>&,
53
+ const cumo_thrust_complex_variance_data<T,R>&,
54
+ cumo_thrust_complex_variance_data<T,R> >
55
+ {
56
+ __host__ __device__
57
+ cumo_thrust_complex_variance_data<T,R> operator()(const cumo_thrust_complex_variance_data<T,R>& x, const cumo_thrust_complex_variance_data<T,R>& y) const
58
+ {
59
+ cumo_thrust_complex_variance_data<T,R> result;
60
+
61
+ // precompute some common subexpressions
62
+ R n = x.n + y.n; // combined sample count; NOTE(review): used as a divisor below, so x.n + y.n == 0 would divide by zero
63
+
64
+ T delta = c_sub(y.mean, x.mean); // difference between the two partial means
65
+ R delta2 = c_abs_square(delta); // presumably |delta|^2 -- helper is defined elsewhere, confirm
66
+
67
+ // Basic number of samples (n)
68
+ result.n = n;
69
+
70
+ result.mean = c_add(x.mean, c_mul_r(delta, y.n / n)); // x.mean moved towards y.mean by y's share of the samples
71
+
72
+ result.M2 = x.M2 + y.M2;
73
+ result.M2 += delta2 * x.n * y.n / n; // correction term for the gap between the two partial means
74
+
75
+ return result;
76
+ }
77
+ };
78
+
79
+ #endif /* ifndef CUMO_CUDA_THRUST_COMPLEX_H */
@@ -0,0 +1,22 @@
1
+ #ifndef CUMO_CUDA_DRIVER_H
2
+ #define CUMO_CUDA_DRIVER_H
3
+ #include <cuda.h>
4
+
5
+ #if defined(__cplusplus)
6
+ extern "C" {
7
+ #if 0
8
+ } /* satisfy cc-mode */
9
+ #endif
10
+ #endif
11
+
12
+ extern VALUE cumo_cuda_eDriverError;
13
+ extern VALUE cumo_cuda_mDriver;
14
+
15
+ #if defined(__cplusplus)
16
+ #if 0
17
+ { /* satisfy cc-mode */
18
+ #endif
19
+ } /* extern "C" { */
20
+ #endif
21
+
22
+ #endif /* ifndef CUMO_CUDA_DRIVER_H */
@@ -0,0 +1,28 @@
1
+ #ifndef CUMO_CUDA_MEMORY_POOL_H
2
+ #define CUMO_CUDA_MEMORY_POOL_H
3
+
4
+ #include "cumo/narray.h"
5
+
6
+ #if defined(__cplusplus)
7
+ extern "C" {
8
+ #if 0
9
+ } /* satisfy cc-mode */
10
+ #endif
11
+ #endif
12
+
13
+ extern VALUE cumo_cuda_eOutOfMemoryError;
14
+
15
+ char*
16
+ cumo_cuda_runtime_malloc(size_t size);
17
+
18
+ void
19
+ cumo_cuda_runtime_free(char *ptr);
20
+
21
+ #if defined(__cplusplus)
22
+ #if 0
23
+ { /* satisfy cc-mode */
24
+ #endif
25
+ } /* extern "C" { */
26
+ #endif
27
+
28
+ #endif /* ifndef CUMO_CUDA_MEMORY_POOL_H */
@@ -0,0 +1,22 @@
1
+ #ifndef CUMO_CUDA_NVRTC_H
2
+ #define CUMO_CUDA_NVRTC_H
3
+ #include <nvrtc.h>
4
+
5
+ #if defined(__cplusplus)
6
+ extern "C" {
7
+ #if 0
8
+ } /* satisfy cc-mode */
9
+ #endif
10
+ #endif
11
+
12
+ extern VALUE cumo_cuda_eNVRTCError;
13
+ extern VALUE cumo_cuda_mNVRTC;
14
+
15
+ #if defined(__cplusplus)
16
+ #if 0
17
+ { /* satisfy cc-mode */
18
+ #endif
19
+ } /* extern "C" { */
20
+ #endif
21
+
22
+ #endif /* ifndef CUMO_CUDA_NVRTC_H */
@@ -0,0 +1,40 @@
1
+ #ifndef CUMO_CUDA_RUNTIME_H
2
+ #define CUMO_CUDA_RUNTIME_H
3
+
4
+ #include "cumo/narray.h"
5
+ #include <cuda_runtime.h>
6
+
7
+ #if defined(__cplusplus)
8
+ extern "C" {
9
+ #if 0
10
+ } /* satisfy cc-mode */
11
+ #endif
12
+ #endif
13
+
14
+ extern VALUE cumo_cuda_eRuntimeError;
15
+
16
+ static inline void
17
+ cumo_cuda_runtime_check_status(cudaError_t status)
18
+ {
19
+ if (status != 0) {
20
+ rb_raise(cumo_cuda_eRuntimeError, "%s (error=%d)", cudaGetErrorString(status), status);
21
+ }
22
+ }
23
+ // Returns true unless cudaPointerGetAttributes() reports cudaErrorInvalidValue for ptr.
24
+ static inline bool
25
+ cumo_cuda_runtime_is_device_memory(void* ptr)
26
+ {
27
+ struct cudaPointerAttributes attrs; // filled by the query below; its fields are not inspected here
28
+ cudaError_t status = cudaPointerGetAttributes(&attrs, ptr);
29
+ cudaGetLastError(); // reset last error to success
30
+ return (status != cudaErrorInvalidValue); // NOTE(review): every other status, including other errors, counts as device memory -- confirm against the targeted CUDA version
31
+ }
32
+
33
+ #if defined(__cplusplus)
34
+ #if 0
35
+ { /* satisfy cc-mode */
36
+ #endif
37
+ } /* extern "C" { */
38
+ #endif
39
+
40
+ #endif /* ifndef CUMO_CUDA_RUNTIME_H */
@@ -0,0 +1,238 @@
1
+ #ifndef CUMO_INDEXER_H
2
+ #define CUMO_INDEXER_H
3
+
4
+ /* Add cumo_ prefix */
5
+ #define na_indexer_t cumo_na_indexer_t
6
+ #define na_iarray_t cumo_na_iarray_t
7
+ #define na_reduction_arg_t cumo_na_reduction_arg_t
8
+
9
+ #ifndef __CUDACC__
10
+ #include "cumo/narray.h"
11
+ #include "cumo/ndloop.h"
12
+ #else
13
+ #include "cumo/narray_kernel.h"
14
+ #endif
15
+
16
+ /* A structure to get indices for each dimension.
17
+ *
18
+ * Note that shapes of each argument NArray are typically equivalent, and
19
+ * thus the indexer points to the same indices for all NArrays.
20
+ */
21
+ typedef struct {
22
+ unsigned char ndim; // # of dimensions
23
+ size_t total_size; // # of total elements
24
+ size_t shape[NA_MAX_DIMENSION]; // # of elements for each dimension
25
+ uint64_t index[NA_MAX_DIMENSION]; // indices for each dimension
26
+ uint64_t raw_index; // flat index as last passed to cumo_na_indexer_set_dim*()
27
+ } na_indexer_t;
28
+
29
+ /* A structure to get data address with indexer.
30
+ *
31
+ * Note that strides may differ for each NArray even though the indexer points to the same indices.
32
+ */
33
+ typedef struct {
34
+ char* ptr; // base address that the per-dimension offsets are added to
35
+ ssize_t step[NA_MAX_DIMENSION]; // or strides; multiplied by the indexer's index[] and added to the char* ptr
36
+ } na_iarray_t;
37
+ /* Bundle of addresses and indexers for a reduction; filled in by na_make_reduction_arg(). */
38
+ typedef struct {
39
+ na_iarray_t in; // input array (built from args[0])
40
+ na_iarray_t out; // output array (built from args[1])
41
+ na_indexer_t in_indexer; // shape of the input
42
+ na_indexer_t out_indexer; // input dimensions that are not reduced
43
+ na_indexer_t reduce_indexer; // input dimensions that are reduced
44
+ } na_reduction_arg_t;
45
+
46
+ #ifndef __CUDACC__
47
+ extern int na_debug_flag; // narray.c
48
+ // Debug helper: prints the indexer's address, ndim, total_size and the first ndim shape[] entries to stdout.
49
+ static void
50
+ print_na_indexer_t(na_indexer_t* indexer)
51
+ {
52
+ printf("na_indexer_t = 0x%"SZF"x {\n", (size_t)indexer);
53
+ printf(" ndim = %d\n", indexer->ndim);
54
+ printf(" total_size = %"SZF"u\n", indexer->total_size); // size_t: use the SZF length modifier, not %ld
55
+ printf(" shape = 0x%"SZF"x\n", (size_t)indexer->shape);
56
+ for (int i = 0; i < indexer->ndim; ++i) {
57
+ printf(" shape[%d] = %"SZF"u\n", i, indexer->shape[i]); // size_t, same as above
58
+ }
59
+ printf("}\n");
60
+ }
61
+ // Debug helper: prints the iarray's address, ptr and the first ndim step[] entries to stdout.
62
+ static void
63
+ print_na_iarray_t(na_iarray_t* iarray, unsigned char ndim)
64
+ {
65
+ printf("na_iarray_t = 0x%"SZF"x {\n", (size_t)iarray);
66
+ printf(" ptr = 0x%"SZF"x\n", (size_t)iarray->ptr);
67
+ printf(" step = 0x%"SZF"x\n", (size_t)iarray->step);
68
+ for (int i = 0; i < ndim; ++i) {
69
+ printf(" step[%d] = %"SZF"d\n", i, iarray->step[i]); // ssize_t: use the SZF length modifier, not %ld
70
+ }
71
+ printf("}\n");
72
+ }
73
+ // Debug helper: prints every member of a na_reduction_arg_t to stdout using the two printers above.
74
+ static void
75
+ print_na_reduction_arg_t(na_reduction_arg_t* arg)
76
+ {
77
+ printf("na_reduction_arg_t = 0x%"SZF"x {\n", (size_t)arg);
78
+ printf("--in--\n");
79
+ print_na_iarray_t(&arg->in, arg->in_indexer.ndim); // number of steps shown comes from the matching indexer
80
+ printf("--out--\n");
81
+ print_na_iarray_t(&arg->out, arg->out_indexer.ndim);
82
+ printf("--in_indexer--\n");
83
+ print_na_indexer_t(&arg->in_indexer);
84
+ printf("--out_indexer--\n");
85
+ print_na_indexer_t(&arg->out_indexer);
86
+ printf("--reduce_indexer--\n");
87
+ print_na_indexer_t(&arg->reduce_indexer);
88
+ printf("}\n");
89
+ }
90
+ // Builds an indexer from a loop argument: copies ndim and shape[] and computes total_size.
91
+ // Note: index[] and raw_index are not set here; call one of the cumo_na_indexer_set_dim*() functions afterwards.
92
+ static na_indexer_t
93
+ na_make_indexer(na_loop_args_t* arg)
94
+ {
95
+ na_indexer_t indexer;
96
+ indexer.ndim = arg->ndim;
97
+ indexer.total_size = 1; // stays 1 when ndim is 0
98
+ for (int i = 0; i < arg->ndim; ++i) {
99
+ indexer.shape[i] = arg->shape[i];
100
+ indexer.total_size *= arg->shape[i]; // product of all dimension sizes
101
+ }
102
+ return indexer;
103
+ }
104
+ // Builds an iarray from a loop argument, copying the step of the first ndim iterators.
105
+ static na_iarray_t
106
+ na_make_iarray_given_ndim(na_loop_args_t* arg, int ndim)
107
+ {
108
+ na_iarray_t iarray;
109
+ iarray.ptr = arg->ptr + arg->iter[0].pos; // NOTE(review): iter[0] is read even when ndim is 0 -- confirm iter always has an entry
110
+ for (int idim = ndim; --idim >= 0;) { // walks idim from ndim - 1 down to 0
111
+ iarray.step[idim] = arg->iter[idim].step;
112
+ }
113
+ return iarray; // step[] entries at ndim and above are left unset
114
+ }
115
+ // Convenience wrapper: builds an iarray covering all arg->ndim dimensions of the loop argument.
116
+ static na_iarray_t
117
+ na_make_iarray(na_loop_args_t* arg)
118
+ {
119
+ return na_make_iarray_given_ndim(arg, arg->ndim);
120
+ }
121
+ // Builds a na_reduction_arg_t from lp_user: args[0] is the input, args[1] the output, and the input dimensions are split into reduce / out indexers.
122
+ static na_reduction_arg_t
123
+ na_make_reduction_arg(na_loop_t* lp_user)
124
+ {
125
+ na_reduction_arg_t arg;
126
+ int i;
127
+ int in_ndim = lp_user->args[0].ndim;
128
+
129
+ // in shape = (2, 3, 4, 5, 6)
130
+ // axis = (1, 3)
131
+ // out shape = (2, 4, 6)
132
+ // reduce shape = (3, 5)
133
+
134
+ arg.in = na_make_iarray(&lp_user->args[0]);
135
+ arg.in_indexer = na_make_indexer(&lp_user->args[0]);
136
+
137
+ arg.reduce_indexer.ndim = 0; // both derived indexers start empty and grow in the loop below
138
+ arg.reduce_indexer.total_size = 1;
139
+ arg.out_indexer.ndim = 0;
140
+ arg.out_indexer.total_size = 1;
141
+ for (i = 0; i < in_ndim; ++i) {
142
+ if (na_test_reduce(lp_user->reduce, i)) { // dimension i is reduced
143
+ arg.reduce_indexer.shape[arg.reduce_indexer.ndim] = arg.in_indexer.shape[i];
144
+ arg.reduce_indexer.total_size *= arg.in_indexer.shape[i];
145
+ ++arg.reduce_indexer.ndim;
146
+ } else { // dimension i is kept in the output
147
+ arg.out_indexer.shape[arg.out_indexer.ndim] = arg.in_indexer.shape[i];
148
+ arg.out_indexer.total_size *= arg.in_indexer.shape[i];
149
+ ++arg.out_indexer.ndim;
150
+ }
151
+ }
152
+ arg.out = na_make_iarray_given_ndim(&lp_user->args[1], arg.out_indexer.ndim); // only the kept dimensions' steps are copied
153
+
154
+ if (na_debug_flag) {
155
+ print_na_reduction_arg_t(&arg);
156
+ }
157
+
158
+ assert(arg.reduce_indexer.ndim == lp_user->reduce_dim); // sanity checks, evaluated after the optional debug dump
159
+ assert(arg.in_indexer.ndim == arg.reduce_indexer.ndim + arg.out_indexer.ndim);
160
+
161
+ return arg;
162
+ }
163
+
164
+ #endif // #ifndef __CUDACC__
165
+
166
+ #define CUMO_NA_INDEXER_OPTIMIZED_NDIM 4
167
+
168
+ #ifdef __CUDACC__
169
+ // Stores flat index i in raw_index and decomposes it into index[0..ndim-1], with the last dimension varying fastest.
170
+ __host__ __device__
171
+ static inline void
172
+ cumo_na_indexer_set_dim(na_indexer_t* indexer, uint64_t i) {
173
+ indexer->raw_index = i;
174
+ for (int j = indexer->ndim; --j >= 0;) { // walks j from ndim - 1 down to 0
175
+ indexer->index[j] = i % indexer->shape[j]; // position within dimension j
176
+ i /= indexer->shape[j]; // carry the remainder on to the next-outer dimension
177
+ }
178
+ }
179
+
180
+ // Let compiler optimize: fixed-NDIM variants of cumo_na_indexer_set_dim whose loop bound is the macro argument instead of indexer->ndim
181
+ #define CUMO_NA_INDEXER_SET(NDIM) \
182
+ __host__ __device__ \
183
+ static inline void \
184
+ cumo_na_indexer_set_dim##NDIM(na_indexer_t* indexer, uint64_t i) { \
185
+ indexer->raw_index = i; \
186
+ for (int j = NDIM; --j >= 0;) { \
187
+ indexer->index[j] = i % indexer->shape[j]; \
188
+ i /= indexer->shape[j]; \
189
+ } \
190
+ }
191
+ // Instantiated for 4, 3, 2 and 0 dimensions; the 1-dimension variant is written by hand below.
192
+ CUMO_NA_INDEXER_SET(4)
193
+ CUMO_NA_INDEXER_SET(3)
194
+ CUMO_NA_INDEXER_SET(2)
195
+ CUMO_NA_INDEXER_SET(0)
196
+ // 1-dimension variant: records only raw_index and leaves index[] untouched (cumo_na_iarray_at_dim1 reads raw_index).
197
+ __host__ __device__
198
+ static inline void
199
+ cumo_na_indexer_set_dim1(na_indexer_t* indexer, uint64_t i) {
200
+ indexer->raw_index = i;
201
+ }
202
+ // Returns iarray->ptr advanced by step[d] * index[d] for every dimension d of the indexer.
203
+ __host__ __device__
204
+ static inline char*
205
+ cumo_na_iarray_at_dim(na_iarray_t* iarray, na_indexer_t* indexer) {
206
+ char* ptr = iarray->ptr;
207
+ for (int idim = 0; idim < indexer->ndim; ++idim) {
208
+ ptr += iarray->step[idim] * indexer->index[idim]; // NOTE(review): signed step times unsigned index -- a negative step presumably relies on wraparound, confirm
209
+ }
210
+ return ptr;
211
+ }
212
+
213
+ // Let compiler optimize: fixed-NDIM variants of cumo_na_iarray_at_dim whose loop bound is the macro argument instead of indexer->ndim
214
+ #define CUMO_NA_IARRAY_AT(NDIM) \
215
+ __host__ __device__ \
216
+ static inline char* \
217
+ cumo_na_iarray_at_dim##NDIM(na_iarray_t* iarray, na_indexer_t* indexer) { \
218
+ char* ptr = iarray->ptr; \
219
+ for (int idim = 0; idim < NDIM; ++idim) { \
220
+ ptr += iarray->step[idim] * indexer->index[idim]; \
221
+ } \
222
+ return ptr; \
223
+ }
224
+ // Instantiated for 4, 3, 2 and 0 dimensions; the 1-dimension variant is written by hand below.
225
+ CUMO_NA_IARRAY_AT(4)
226
+ CUMO_NA_IARRAY_AT(3)
227
+ CUMO_NA_IARRAY_AT(2)
228
+ CUMO_NA_IARRAY_AT(0)
229
+ // 1-dimension variant: offsets ptr by step[0] * raw_index; index[] is not read.
230
+ __host__ __device__
231
+ static inline char*
232
+ cumo_na_iarray_at_dim1(na_iarray_t* iarray, na_indexer_t* indexer) {
233
+ return iarray->ptr + iarray->step[0] * indexer->raw_index;
234
+ }
235
+
236
+ #endif // #ifdef __CUDACC__
237
+
238
+ #endif // CUMO_INDEXER_H
@@ -0,0 +1,142 @@
1
+ #ifndef CUMO_INTERN_H
2
+ #define CUMO_INTERN_H
3
+
4
+ void cumo_debug_breakpoint(void);
5
+
6
+ /* Add cumo_ prefix to avoid C symbol collisions with Numo without modifying C implementations */
7
+
8
+ #define rb_narray_new cumo_nary_new
9
+ #define nary_new cumo_nary_new
10
+ VALUE cumo_nary_new(VALUE elem, int ndim, size_t *shape);
11
+ #define rb_narray_view_new cumo_nary_view_new
12
+ #define nary_view_new cumo_nary_view_new
13
+ VALUE cumo_nary_view_new(VALUE elem, int ndim, size_t *shape);
14
+ #define rb_narray_debug_info cumo_nary_debug_info
15
+ #define nary_debug_info cumo_nary_debug_info
16
+ VALUE cumo_nary_debug_info(VALUE);
17
+
18
+ #define na_make_view cumo_nary_make_view
19
+ VALUE cumo_nary_make_view(VALUE self);
20
+
21
+ #define na_s_allocate cumo_nary_s_allocate
22
+ VALUE cumo_nary_s_allocate(VALUE klass);
23
+ #define na_s_allocate_view cumo_nary_s_allocate_view
24
+ VALUE cumo_nary_s_allocate_view(VALUE klass);
25
+ #define na_s_new_like cumo_nary_s_new_like
26
+ VALUE cumo_nary_s_new_like(VALUE type, VALUE obj);
27
+
28
+ #define na_alloc_shape cumo_na_alloc_shape
29
+ void cumo_na_alloc_shape(narray_t *na, int ndim);
30
+ #define na_array_to_internal_shape cumo_na_array_to_internal_shape
31
+ void cumo_na_array_to_internal_shape(VALUE self, VALUE ary, size_t *shape);
32
+ #define na_index_arg_to_internal_order cumo_na_index_arg_to_internal_order
33
+ void cumo_na_index_arg_to_internal_order(int argc, VALUE *argv, VALUE self);
34
+ #define na_setup_shape cumo_na_setup_shape
35
+ void cumo_na_setup_shape(narray_t *na, int ndim, size_t *shape);
36
+
37
+ #define na_get_elmsz cumo_nary_element_stride
38
+ #define nary_element_stride cumo_nary_element_stride
39
+ //#define na_element_stride cumo_nary_element_stride
40
+ unsigned int cumo_nary_element_stride(VALUE nary);
41
+ #define na_dtype_elmsz cumo_nary_dtype_element_stride
42
+ size_t cumo_nary_dtype_element_stride(VALUE klass);
43
+
44
+ #define na_get_pointer cumo_nary_get_pointer
45
+ char *cumo_nary_get_pointer(VALUE);
46
+ #define na_get_pointer_for_write cumo_nary_get_pointer_for_write
47
+ char *cumo_nary_get_pointer_for_write(VALUE);
48
+ #define na_get_pointer_for_read cumo_nary_get_pointer_for_read
49
+ char *cumo_nary_get_pointer_for_read(VALUE);
50
+ #define na_get_pointer_for_read_write cumo_nary_get_pointer_for_read_write
51
+ char *cumo_nary_get_pointer_for_read_write(VALUE);
52
+ #define na_get_offset cumo_nary_get_offset
53
+ size_t cumo_nary_get_offset(VALUE self);
54
+
55
+ #define na_copy_flags cumo_nary_copy_flags
56
+ void cumo_nary_copy_flags(VALUE src, VALUE dst);
57
+
58
+ #define na_check_ladder cumo_nary_check_ladder
59
+ VALUE cumo_nary_check_ladder(VALUE self, int start_dim);
60
+ #define na_check_contiguous cumo_nary_check_contiguous
61
+ VALUE cumo_nary_check_contiguous(VALUE self);
62
+
63
+ #define na_flatten_dim cumo_nary_flatten_dim
64
+ VALUE cumo_nary_flatten_dim(VALUE self, int sd);
65
+
66
+ #define na_flatten cumo_nary_flatten
67
+ VALUE cumo_nary_flatten(VALUE);
68
+
69
+ #define na_copy cumo_nary_dup
70
+ VALUE cumo_nary_dup(VALUE);
71
+
72
+ #define na_store cumo_nary_store
73
+ VALUE cumo_nary_store(VALUE self, VALUE src);
74
+
75
+ #define na_upcast cumo_na_upcast
76
+ VALUE cumo_na_upcast(VALUE type1, VALUE type2);
77
+
78
+ #define na_release_lock cumo_na_release_lock
79
+ void cumo_na_release_lock(VALUE); // currently do nothing
80
+
81
+ // used in reduce methods
82
+ #define nary_reduce_dimension cumo_nary_reduce_dimension
83
+ #define na_reduce_dimension cumo_nary_reduce_dimension
84
+ VALUE cumo_nary_reduce_dimension(int argc, VALUE *argv, int naryc, VALUE *naryv,
85
+ ndfunc_t *ndf, na_iter_func_t nan_iter);
86
+
87
+ #define nary_reduce_options cumo_nary_reduce_options
88
+ #define na_reduce_options cumo_nary_reduce_options
89
+ VALUE cumo_nary_reduce_options(VALUE axes, VALUE *opts, int naryc, VALUE *naryv,
90
+ ndfunc_t *ndf);
91
+
92
+ // ndloop
93
+ #define na_ndloop cumo_na_ndloop
94
+ VALUE cumo_na_ndloop(ndfunc_t *nf, int argc, ...);
95
+ #define na_ndloop2 cumo_na_ndloop2
96
+ VALUE cumo_na_ndloop2(ndfunc_t *nf, VALUE args);
97
+ #define na_ndloop3 cumo_na_ndloop3
98
+ VALUE cumo_na_ndloop3(ndfunc_t *nf, void *ptr, int argc, ...);
99
+ #define na_ndloop4 cumo_na_ndloop4
100
+ VALUE cumo_na_ndloop4(ndfunc_t *nf, void *ptr, VALUE args);
101
+
102
+ #define na_ndloop_cast_narray_to_rarray cumo_na_ndloop_cast_narray_to_rarray
103
+ VALUE cumo_na_ndloop_cast_narray_to_rarray(ndfunc_t *nf, VALUE nary, VALUE fmt);
104
+ #define na_ndloop_store_rarray cumo_na_ndloop_store_rarray
105
+ VALUE cumo_na_ndloop_store_rarray(ndfunc_t *nf, VALUE nary, VALUE rary);
106
+ #define na_ndloop_store_rarray2 cumo_na_ndloop_store_rarray2
107
+ VALUE cumo_na_ndloop_store_rarray2(ndfunc_t *nf, VALUE nary, VALUE rary, VALUE opt);
108
+ #define na_ndloop_inspect cumo_na_ndloop_inspect
109
+ VALUE cumo_na_ndloop_inspect(VALUE nary, na_text_func_t func, VALUE opt);
110
+ #define na_ndloop_with_index cumo_na_ndloop_with_index
111
+ VALUE cumo_na_ndloop_with_index(ndfunc_t *nf, int argc, ...);
112
+
113
+ #define na_info_str cumo_nary_info_str
114
+ VALUE cumo_nary_info_str(VALUE);
115
+
116
+ #define na_test_reduce cumo_nary_test_reduce
117
+ bool cumo_nary_test_reduce(VALUE reduce, int dim);
118
+
119
+ #define nary_step_array_index cumo_nary_step_array_index
120
+ void cumo_nary_step_array_index(VALUE self, size_t ary_size, size_t *plen, ssize_t *pbeg, ssize_t *pstep);
121
+ #define nary_step_sequence cumo_nary_step_sequence
122
+ void cumo_nary_step_sequence(VALUE self, size_t *plen, double *pbeg, double *pstep);
123
+
124
+ // used in aref, aset
125
+ #define na_get_result_dimension cumo_nary_get_result_dimension
126
+ int cumo_nary_get_result_dimension(VALUE self, int argc, VALUE *argv, ssize_t stride, size_t *pos_idx);
127
+ #define na_aref_main cumo_nary_aref_main
128
+ VALUE cumo_nary_aref_main(int nidx, VALUE *idx, VALUE self, int keep_dim, int result_nd, size_t pos);
129
+
130
+ // defined in array, used in math
131
+ #define na_ary_composition_dtype cumo_na_ary_composition_dtype
132
+ VALUE cumo_na_ary_composition_dtype(VALUE ary);
133
+
134
+ #include "ruby/version.h"
135
+
136
+ #if RUBY_API_VERSION_CODE == 20100 // 2.1.0
137
+ int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *);
138
+ VALUE rb_extract_keywords(VALUE *orighash);
139
+ #endif
140
+
141
+
142
+ #endif /* ifndef CUMO_INTERN_H */