cumo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +27 -0
  3. data/.travis.yml +5 -0
  4. data/3rd_party/mkmf-cu/.gitignore +36 -0
  5. data/3rd_party/mkmf-cu/Gemfile +3 -0
  6. data/3rd_party/mkmf-cu/LICENSE +21 -0
  7. data/3rd_party/mkmf-cu/README.md +36 -0
  8. data/3rd_party/mkmf-cu/Rakefile +11 -0
  9. data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
  10. data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
  11. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
  12. data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
  13. data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
  14. data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
  15. data/CODE_OF_CONDUCT.md +46 -0
  16. data/Gemfile +8 -0
  17. data/LICENSE.txt +82 -0
  18. data/README.md +252 -0
  19. data/Rakefile +43 -0
  20. data/bench/broadcast_fp32.rb +138 -0
  21. data/bench/cumo_bench.rb +193 -0
  22. data/bench/numo_bench.rb +138 -0
  23. data/bench/reduction_fp32.rb +117 -0
  24. data/bin/console +14 -0
  25. data/bin/setup +8 -0
  26. data/cumo.gemspec +32 -0
  27. data/ext/cumo/cuda/cublas.c +278 -0
  28. data/ext/cumo/cuda/driver.c +421 -0
  29. data/ext/cumo/cuda/memory_pool.cpp +185 -0
  30. data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
  31. data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
  32. data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
  33. data/ext/cumo/cuda/nvrtc.c +207 -0
  34. data/ext/cumo/cuda/runtime.c +167 -0
  35. data/ext/cumo/cumo.c +148 -0
  36. data/ext/cumo/depend.erb +58 -0
  37. data/ext/cumo/extconf.rb +179 -0
  38. data/ext/cumo/include/cumo.h +25 -0
  39. data/ext/cumo/include/cumo/compat.h +23 -0
  40. data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
  41. data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
  42. data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
  43. data/ext/cumo/include/cumo/cuda/driver.h +22 -0
  44. data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
  45. data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
  46. data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
  47. data/ext/cumo/include/cumo/indexer.h +238 -0
  48. data/ext/cumo/include/cumo/intern.h +142 -0
  49. data/ext/cumo/include/cumo/intern_fwd.h +38 -0
  50. data/ext/cumo/include/cumo/intern_kernel.h +6 -0
  51. data/ext/cumo/include/cumo/narray.h +429 -0
  52. data/ext/cumo/include/cumo/narray_kernel.h +149 -0
  53. data/ext/cumo/include/cumo/ndloop.h +95 -0
  54. data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
  55. data/ext/cumo/include/cumo/template.h +158 -0
  56. data/ext/cumo/include/cumo/template_kernel.h +77 -0
  57. data/ext/cumo/include/cumo/types/bit.h +40 -0
  58. data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
  59. data/ext/cumo/include/cumo/types/complex.h +402 -0
  60. data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
  61. data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
  62. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
  63. data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
  64. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
  65. data/ext/cumo/include/cumo/types/dfloat.h +47 -0
  66. data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
  67. data/ext/cumo/include/cumo/types/float_def.h +34 -0
  68. data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
  69. data/ext/cumo/include/cumo/types/float_macro.h +191 -0
  70. data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
  71. data/ext/cumo/include/cumo/types/int16.h +24 -0
  72. data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
  73. data/ext/cumo/include/cumo/types/int32.h +24 -0
  74. data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
  75. data/ext/cumo/include/cumo/types/int64.h +24 -0
  76. data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
  77. data/ext/cumo/include/cumo/types/int8.h +24 -0
  78. data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
  79. data/ext/cumo/include/cumo/types/int_macro.h +67 -0
  80. data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
  81. data/ext/cumo/include/cumo/types/real_accum.h +486 -0
  82. data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
  83. data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
  84. data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
  85. data/ext/cumo/include/cumo/types/robject.h +27 -0
  86. data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
  87. data/ext/cumo/include/cumo/types/scomplex.h +46 -0
  88. data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
  89. data/ext/cumo/include/cumo/types/sfloat.h +48 -0
  90. data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
  91. data/ext/cumo/include/cumo/types/uint16.h +25 -0
  92. data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
  93. data/ext/cumo/include/cumo/types/uint32.h +25 -0
  94. data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
  95. data/ext/cumo/include/cumo/types/uint64.h +25 -0
  96. data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
  97. data/ext/cumo/include/cumo/types/uint8.h +25 -0
  98. data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
  99. data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
  100. data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
  101. data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
  102. data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
  103. data/ext/cumo/narray/SFMT-params.h +97 -0
  104. data/ext/cumo/narray/SFMT-params19937.h +46 -0
  105. data/ext/cumo/narray/SFMT.c +620 -0
  106. data/ext/cumo/narray/SFMT.h +167 -0
  107. data/ext/cumo/narray/array.c +638 -0
  108. data/ext/cumo/narray/data.c +961 -0
  109. data/ext/cumo/narray/gen/cogen.rb +56 -0
  110. data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
  111. data/ext/cumo/narray/gen/def/bit.rb +37 -0
  112. data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
  113. data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
  114. data/ext/cumo/narray/gen/def/int16.rb +36 -0
  115. data/ext/cumo/narray/gen/def/int32.rb +36 -0
  116. data/ext/cumo/narray/gen/def/int64.rb +36 -0
  117. data/ext/cumo/narray/gen/def/int8.rb +36 -0
  118. data/ext/cumo/narray/gen/def/robject.rb +37 -0
  119. data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
  120. data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
  121. data/ext/cumo/narray/gen/def/uint16.rb +36 -0
  122. data/ext/cumo/narray/gen/def/uint32.rb +36 -0
  123. data/ext/cumo/narray/gen/def/uint64.rb +36 -0
  124. data/ext/cumo/narray/gen/def/uint8.rb +36 -0
  125. data/ext/cumo/narray/gen/erbpp2.rb +346 -0
  126. data/ext/cumo/narray/gen/narray_def.rb +268 -0
  127. data/ext/cumo/narray/gen/spec.rb +425 -0
  128. data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
  129. data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
  130. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
  131. data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
  132. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
  133. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
  134. data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
  135. data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
  136. data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
  137. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
  138. data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
  139. data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
  140. data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
  141. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
  142. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
  143. data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
  144. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
  145. data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
  146. data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
  147. data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
  148. data/ext/cumo/narray/gen/tmpl/class.c +9 -0
  149. data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
  150. data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
  151. data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
  152. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
  153. data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
  154. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
  155. data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
  156. data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
  157. data/ext/cumo/narray/gen/tmpl/each.c +47 -0
  158. data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
  159. data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
  160. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
  161. data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
  162. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
  163. data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
  164. data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
  165. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
  166. data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
  167. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
  168. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
  169. data/ext/cumo/narray/gen/tmpl/format.c +62 -0
  170. data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
  171. data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
  172. data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
  173. data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
  174. data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
  175. data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
  176. data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
  177. data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
  178. data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
  179. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
  180. data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
  181. data/ext/cumo/narray/gen/tmpl/median.c +66 -0
  182. data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
  183. data/ext/cumo/narray/gen/tmpl/module.c +9 -0
  184. data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
  185. data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
  186. data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
  187. data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
  188. data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
  189. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
  190. data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
  191. data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
  192. data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
  193. data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
  194. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
  195. data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
  196. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
  197. data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
  198. data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
  199. data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
  200. data/ext/cumo/narray/gen/tmpl/store.c +41 -0
  201. data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
  202. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
  203. data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
  204. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
  205. data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
  206. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
  207. data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
  208. data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
  209. data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
  210. data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
  211. data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
  212. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
  213. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
  214. data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
  215. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
  216. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
  217. data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
  218. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
  219. data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
  220. data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
  221. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
  222. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
  223. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
  224. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
  225. data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
  226. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
  227. data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
  228. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
  229. data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
  230. data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
  231. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
  232. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
  233. data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
  234. data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
  235. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
  236. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
  237. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
  238. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
  239. data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
  240. data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
  241. data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
  242. data/ext/cumo/narray/index.c +880 -0
  243. data/ext/cumo/narray/kwargs.c +153 -0
  244. data/ext/cumo/narray/math.c +142 -0
  245. data/ext/cumo/narray/narray.c +1948 -0
  246. data/ext/cumo/narray/ndloop.c +2105 -0
  247. data/ext/cumo/narray/rand.c +45 -0
  248. data/ext/cumo/narray/step.c +474 -0
  249. data/ext/cumo/narray/struct.c +886 -0
  250. data/lib/cumo.rb +3 -0
  251. data/lib/cumo/cuda.rb +11 -0
  252. data/lib/cumo/cuda/compile_error.rb +36 -0
  253. data/lib/cumo/cuda/compiler.rb +161 -0
  254. data/lib/cumo/cuda/device.rb +47 -0
  255. data/lib/cumo/cuda/link_state.rb +31 -0
  256. data/lib/cumo/cuda/module.rb +40 -0
  257. data/lib/cumo/cuda/nvrtc_program.rb +27 -0
  258. data/lib/cumo/linalg.rb +12 -0
  259. data/lib/cumo/narray.rb +2 -0
  260. data/lib/cumo/narray/extra.rb +1278 -0
  261. data/lib/erbpp.rb +294 -0
  262. data/lib/erbpp/line_number.rb +137 -0
  263. data/lib/erbpp/narray_def.rb +381 -0
  264. data/numo-narray-version +1 -0
  265. data/run.gdb +7 -0
  266. metadata +353 -0
@@ -0,0 +1,149 @@
1
+ #ifndef CUMO_NARRAY_KERNEL_H
2
+ #define CUMO_NARRAY_KERNEL_H
3
+
4
+ #if defined(__cplusplus)
5
+ extern "C" {
6
+ #if 0
7
+ } /* satisfy cc-mode */
8
+ #endif
9
+ #endif
10
+
11
+ #include <math.h>
12
+ //#include "cumo/compat.h"
13
+ #include "cumo/template_kernel.h"
14
+ //#include "cumo/extconf.h"
15
+
16
+ #ifdef HAVE_STDBOOL_H
17
+ # include <stdbool.h>
18
+ #endif
19
+
20
+ #ifdef HAVE_STDINT_H
21
+ # include <stdint.h>
22
+ #endif
23
+
24
+ #ifdef HAVE_SYS_TYPES_H
25
+ # include <sys/types.h>
26
+ #endif
27
+
28
+ #ifndef HAVE_U_INT8_T
29
+ # ifdef HAVE_UINT8_T
30
+ typedef uint8_t u_int8_t;
31
+ # endif
32
+ #endif
33
+
34
+ #ifndef HAVE_U_INT16_T
35
+ # ifdef HAVE_UINT16_T
36
+ typedef uint16_t u_int16_t;
37
+ # endif
38
+ #endif
39
+
40
+ #ifndef HAVE_U_INT32_T
41
+ # ifdef HAVE_UINT32_T
42
+ typedef uint32_t u_int32_t;
43
+ # endif
44
+ #endif
45
+
46
+ #ifndef HAVE_U_INT64_T
47
+ # ifdef HAVE_UINT64_T
48
+ typedef uint64_t u_int64_t;
49
+ # endif
50
+ #endif
51
+
52
+ #define SZF PRI_SIZE_PREFIX // defined in ruby.h
53
+
54
+ #if SIZEOF_LONG==8
55
+ # define NUM2INT64(x) NUM2LONG(x)
56
+ # define INT642NUM(x) LONG2NUM(x)
57
+ # define NUM2UINT64(x) NUM2ULONG(x)
58
+ # define UINT642NUM(x) ULONG2NUM(x)
59
+ # ifndef PRId64
60
+ # define PRId64 "ld"
61
+ # endif
62
+ # ifndef PRIu64
63
+ # define PRIu64 "lu"
64
+ # endif
65
+ #elif SIZEOF_LONG_LONG==8
66
+ # define NUM2INT64(x) NUM2LL(x)
67
+ # define INT642NUM(x) LL2NUM(x)
68
+ # define NUM2UINT64(x) NUM2ULL(x)
69
+ # define UINT642NUM(x) ULL2NUM(x)
70
+ # ifndef PRId64
71
+ # define PRId64 "lld"
72
+ # endif
73
+ # ifndef PRIu64
74
+ # define PRIu64 "llu"
75
+ # endif
76
+ #endif
77
+
78
+ #if SIZEOF_LONG==4
79
+ # define NUM2INT32(x) NUM2LONG(x)
80
+ # define INT322NUM(x) LONG2NUM(x)
81
+ # define NUM2UINT32(x) NUM2ULONG(x)
82
+ # define UINT322NUM(x) ULONG2NUM(x)
83
+ # ifndef PRId32
84
+ # define PRId32 "ld"
85
+ # endif
86
+ # ifndef PRIu32
87
+ # define PRIu32 "lu"
88
+ # endif
89
+ #elif SIZEOF_INT==4
90
+ # define NUM2INT32(x) NUM2INT(x)
91
+ # define INT322NUM(x) INT2NUM(x)
92
+ # define NUM2UINT32(x) NUM2UINT(x)
93
+ # define UINT322NUM(x) UINT2NUM(x)
94
+ # ifndef PRId32
95
+ # define PRId32 "d"
96
+ # endif
97
+ # ifndef PRIu32
98
+ # define PRIu32 "u"
99
+ # endif
100
+ #endif
101
+
102
+ #ifndef HAVE_TYPE_BOOL
103
+ typedef int bool;
104
+ #endif
105
+ #ifndef FALSE /* in case these macros already exist */
106
+ # define FALSE 0 /* values of bool */
107
+ #endif
108
+ #ifndef TRUE
109
+ # define TRUE 1
110
+ #endif
111
+
112
+ typedef struct { float dat[2]; } scomplex;
113
+ typedef struct { double dat[2]; } dcomplex;
114
+ typedef int fortran_integer;
115
+
116
+ #define REAL(x) ((x).dat[0])
117
+ #define IMAG(x) ((x).dat[1])
118
+
119
+ extern int na_debug_flag;
120
+
121
+ #define NARRAY_DATA_T 0x1
122
+ #define NARRAY_VIEW_T 0x2
123
+ #define NARRAY_FILEMAP_T 0x3
124
+
125
+ //#define NA_MAX_DIMENSION (int)(sizeof(VALUE)*8-2)
126
+ #define NA_MAX_DIMENSION 12
127
+ #define NA_MAX_ELMSZ 65535
128
+
129
/* Word type used for bit-array storage. */
typedef unsigned int BIT_DIGIT;

/* Size of one BIT_DIGIT word, in bytes. */
#define BYTE_BIT_DIGIT (sizeof(BIT_DIGIT))

/* Number of bits held by one BIT_DIGIT word. */
#define NB (sizeof(BIT_DIGIT)*8)

/* A BIT_DIGIT word with every bit set. */
#define BALL (~(BIT_DIGIT)0)

/*
 * Mask with the n lowest bits set, for 0 <= n <= NB.
 * n == NB is answered directly because shifting a word by its full width
 * is undefined.
 */
#define SLB(n) (((n)==NB) ? BALL : (~(BALL<<(n))))
134
+
135
+ #define ELEMENT_BIT_SIZE "ELEMENT_BIT_SIZE"
136
+ #define ELEMENT_BYTE_SIZE "ELEMENT_BYTE_SIZE"
137
+ #define CONTIGUOUS_STRIDE "CONTIGUOUS_STRIDE"
138
+
139
+ #include "cumo/indexer.h"
140
+ #include "cumo/intern_kernel.h"
141
+
142
+ #if defined(__cplusplus)
143
+ #if 0
144
+ { /* satisfy cc-mode */
145
+ #endif
146
+ } /* extern "C" { */
147
+ #endif
148
+
149
+ #endif /* ifndef CUMO_NARRAY_KERNEL_H */
@@ -0,0 +1,95 @@
1
+ #ifndef CUMO_NDLOOP_H
2
+ #define CUMO_NDLOOP_H
3
+
4
+ typedef struct NA_LOOP_ITER {
5
+ ssize_t pos; // - required for each dimension.
6
+ ssize_t step;
7
+ size_t *idx;
8
+ } na_loop_iter_t;
9
+
10
+ typedef struct NA_LOOP_ARGS {
11
+ VALUE value;
12
+ ssize_t elmsz;
13
+ char *ptr;
14
+ //char *buf_ptr; //
15
+ int ndim; // required for each argument.
16
+ // ssize_t pos; - not required here.
17
+ size_t *shape;
18
+ na_loop_iter_t *iter; // moved from na_loop_t
19
+ } na_loop_args_t;
20
+
21
+ // Loop state handed to a user iterator function (see na_iter_func_t).
22
+ typedef struct NA_LOOP {
23
+ int narg; // presumably the number of entries in args -- TODO confirm
24
+ int ndim; // number of user dimensions used in the user function.
25
+ size_t *n; // number of elements for each dim (=shape)
26
+ na_loop_args_t *args; // one entry per argument
27
+ VALUE option; // NOTE(review): meaning not visible in this header
28
+ void *opt_ptr; // NOTE(review): meaning not visible in this header
29
+ VALUE err_type; // NOTE(review): meaning not visible in this header
30
+ int reduce_dim; // number of dimensions to reduce in the reduction kernel, e.g., for an array of shape [2,3,4]:
31
+ // 3 for sum(), 1 for sum(axis: 1), 2 for sum(axis: [1,2])
32
+ VALUE reduce; // dimension indices to reduce in the reduction kernel (as a bit mask), e.g., for an array of shape
33
+ // [2,3,4]: 111b for sum(), 010b for sum(axis: 1), 110b for sum(axis: [1,2])
34
+ } na_loop_t;
35
+
36
+
37
+ // ------------------ ndfunc -------------------------------------------
38
+
39
+ #define NDF_HAS_LOOP (1<<0) // x[i]
40
+ #define NDF_STRIDE_LOOP (1<<1) // *(x+stride*i)
41
+ #define NDF_INDEX_LOOP (1<<2) // *(x+idx[i])
42
+ #define NDF_KEEP_DIM (1<<3)
43
+ #define NDF_INPLACE (1<<4)
44
+ #define NDF_ACCEPT_BYTESWAP (1<<5)
45
+
46
+ #define NDF_FLAT_REDUCE (1<<6)
47
+ #define NDF_EXTRACT (1<<7)
48
+ #define NDF_CUM (1<<8)
49
+
50
+ #define NDF_INDEXER_LOOP (1<<9) // Cumo custom. Use cumo own indexer.
51
+
52
+ #define FULL_LOOP (NDF_HAS_LOOP|NDF_STRIDE_LOOP|NDF_INDEX_LOOP|NDF_INPLACE)
53
+ #define FULL_LOOP_NIP (NDF_HAS_LOOP|NDF_STRIDE_LOOP|NDF_INDEX_LOOP)
54
+ #define STRIDE_LOOP (NDF_HAS_LOOP|NDF_STRIDE_LOOP|NDF_INPLACE)
55
+ #define STRIDE_LOOP_NIP (NDF_HAS_LOOP|NDF_STRIDE_LOOP)
56
+ #define NO_LOOP 0
57
+
58
+ #define OVERWRITE Qtrue // used for CASTABLE(t)
59
+
60
+ #define NDF_TEST(nf,fl) ((nf)->flag & (fl))
61
+ #define NDF_SET(nf,fl) {(nf)->flag |= (fl);}
62
+
63
+ #define NDF_ARG_READ_ONLY 1
64
+ #define NDF_ARG_WRITE_ONLY 2
65
+ #define NDF_ARG_READ_WRITE 3
66
+
67
+ // type of user function
68
+ typedef void (*na_iter_func_t) _((na_loop_t *const));
69
+ typedef VALUE (*na_text_func_t) _((char *ptr, size_t pos, VALUE opt));
70
+ //typedef void (*) void (*loop_func)(ndfunc_t*, na_md_loop_t*))
71
+
72
+
73
+ typedef struct NDF_ARG_IN {
74
+ VALUE type; // argument types
75
+ int dim; // # of dimension of argument handled by user function
76
+ // if dim==-1, reduce dimension
77
+ } ndfunc_arg_in_t;
78
+
79
+ typedef struct NDF_ARG_OUT {
80
+ VALUE type; // argument types
81
+ int dim; // # of dimension of argument handled by user function
82
+ size_t *shape;
83
+ } ndfunc_arg_out_t;
84
+
85
+ // spec of user function
86
+ typedef struct NDFUNCTION {
87
+ na_iter_func_t func; // user function
88
+ unsigned int flag; // what kind of loop user function supports
89
+ int nin; // # of arguments
90
+ int nout; // # of results
91
+ ndfunc_arg_in_t *ain; // spec of input arguments
92
+ ndfunc_arg_out_t *aout; // spec of output result
93
+ } ndfunc_t;
94
+
95
+ #endif /* CUMO_NDLOOP_H */
@@ -0,0 +1,126 @@
1
+ #ifndef CUMO_REDUCE_KERNEL_H
2
+ #define CUMO_REDUCE_KERNEL_H
3
+
4
+ #include <algorithm>
5
+ #include <cstdint>
6
+ #include <type_traits>
7
+
8
+ #include "cumo/indexer.h"
9
+
10
/*
 * Round x up to the nearest power of two.
 *
 * A value that already is a power of two is returned unchanged, and every
 * x <= 1 yields 1 (the smallest power of two).  The bit smearing is done on
 * an unsigned copy so that no negative value is ever shifted and the final
 * increment cannot overflow a signed type.
 *
 * x must not exceed 2^62; larger inputs have no representable answer in
 * int64_t.
 */
static inline int64_t round_up_to_power_of_2(int64_t x) {
    if (x <= 1) {
        return 1;
    }

    uint64_t v = (uint64_t)x - 1;

    /* Copy the highest set bit into every lower position ... */
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    v |= v >> 32;

    /* ... so that adding one carries into the next power of two. */
    return (int64_t)(v + 1);
}
20
+
21
+ #define _REDUCE(offset) \
22
+ if (tid < offset) { \
23
+ impl.Reduce(sdata[(tid + offset)], sdata[tid]); \
24
+ }
25
+
26
+ // reference: cupy reduction kernel
27
+
28
+ template <typename TypeIn, typename TypeOut, typename ReductionImpl>
29
+ __global__ static void reduction_kernel(na_reduction_arg_t arg, ReductionImpl impl) {
30
+ na_iarray_t& in_iarray = arg.in;
31
+ na_iarray_t& out_iarray = arg.out;
32
+ na_indexer_t& in_indexer = arg.in_indexer;
33
+ na_indexer_t& out_indexer = arg.out_indexer;
34
+ na_indexer_t& reduce_indexer = arg.reduce_indexer;
35
+
36
+ using TypeReduce = decltype(impl.Identity());
37
+
38
+ extern __shared__ __align__(8) char sdata_raw[];
39
+ TypeReduce* sdata = (TypeReduce*)sdata_raw;
40
+ unsigned int tid = threadIdx.x;
41
+ unsigned int block_size = blockDim.x; // number of threads
42
+
43
+ for (uint64_t i_out = blockIdx.x; i_out < out_indexer.total_size; i_out += gridDim.x) {
44
+ cumo_na_indexer_set_dim(&out_indexer, i_out);
45
+ TypeReduce accum = impl.Identity();
46
+
47
+ for (int8_t i_out_dim = 0; i_out_dim < out_indexer.ndim; ++i_out_dim) {
48
+ in_indexer.index[i_out_dim] = out_indexer.index[i_out_dim];
49
+ }
50
+ for (auto i_reduce = tid; i_reduce < reduce_indexer.total_size; i_reduce += block_size) {
51
+ cumo_na_indexer_set_dim(&reduce_indexer, i_reduce);
52
+ for (int8_t i_reduce_dim = 0; i_reduce_dim < reduce_indexer.ndim; ++i_reduce_dim) {
53
+ in_indexer.index[out_indexer.ndim + i_reduce_dim] = reduce_indexer.index[i_reduce_dim];
54
+ }
55
+ TypeIn* in_ptr = reinterpret_cast<TypeIn*>(cumo_na_iarray_at_dim(&in_iarray, &in_indexer));
56
+ uint64_t i_in = in_ptr - reinterpret_cast<TypeIn*>(in_iarray.ptr);
57
+ impl.Reduce(impl.MapIn(*in_ptr, i_in), accum);
58
+ }
59
+
60
+ if (block_size >= 2) {
61
+ sdata[tid] = accum;
62
+ __syncthreads();
63
+
64
+ if (block_size > 2) {
65
+ if (block_size > 4) {
66
+ if (block_size > 8) {
67
+ if (block_size > 16) {
68
+ if (block_size > 32) {
69
+ if (block_size > 64) {
70
+ if (block_size > 128) {
71
+ if (block_size > 256) {
72
+ _REDUCE(256);
73
+ __syncthreads();
74
+ }
75
+ _REDUCE(128);
76
+ __syncthreads();
77
+ }
78
+ _REDUCE(64);
79
+ __syncthreads();
80
+ }
81
+ _REDUCE(32);
82
+ __syncthreads();
83
+ }
84
+ _REDUCE(16);
85
+ __syncthreads();
86
+ }
87
+ _REDUCE(8);
88
+ __syncthreads();
89
+ }
90
+ _REDUCE(4);
91
+ __syncthreads();
92
+ }
93
+ _REDUCE(2);
94
+ __syncthreads();
95
+ }
96
+ _REDUCE(1);
97
+ accum = sdata[0];
98
+ }
99
+ if (tid == 0) {
100
+ TypeOut* out_ptr = reinterpret_cast<TypeOut*>(cumo_na_iarray_at_dim(&out_iarray, &out_indexer));
101
+ *out_ptr = impl.MapOut(accum);
102
+ //printf("threadId.x:%d blockIdx.x:%d blockDim.x:%d gridDim.x:%d block_size:%d accum:%d out:%p(%d)\n", threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, block_size, accum, out_ptr, *out_ptr);
103
+ }
104
+ }
105
+ }
106
+
107
+ #undef _REDUCE
108
+
109
+ static constexpr size_t max_block_size = 512;
110
+
111
+ template <typename TypeIn, typename TypeOut, typename ReductionImpl>
112
+ void cumo_reduce(na_reduction_arg_t arg, ReductionImpl&& impl) {
113
+ na_indexer_t& out_indexer = arg.out_indexer;
114
+ na_indexer_t& reduce_indexer = arg.reduce_indexer;
115
+
116
+ using TypeReduce = decltype(impl.Identity());
117
+
118
+ size_t block_size = round_up_to_power_of_2(std::max(int64_t{1}, static_cast<int64_t>(reduce_indexer.total_size)));
119
+ block_size = std::min(max_block_size, block_size);
120
+ size_t grid_size = out_indexer.total_size;
121
+ size_t shared_mem_size = sizeof(TypeReduce) * block_size;
122
+
123
+ reduction_kernel<TypeIn,TypeOut,ReductionImpl><<<grid_size, block_size, shared_mem_size>>>(arg, impl);
124
+ }
125
+
126
+ #endif // CUMO_REDUCE_KERNEL_H
@@ -0,0 +1,158 @@
1
+ #ifndef CUMO_TEMPLATE_H
2
+ #define CUMO_TEMPLATE_H
3
+
4
+ #define INIT_COUNTER( lp, c ) \
5
+ { c = (lp)->n[0]; }
6
+
7
+ #define NDL_CNT(lp) ((lp)->n[0])
8
+ #define NDL_PTR(lp,i) ((lp)->args[i].ptr + (lp)->args[i].iter[0].pos)
9
+ #define NDL_STEP(lp,i) ((lp)->args[i].iter[0].step)
10
+ #define NDL_IDX(lp,i) ((lp)->args[i].iter[0].idx)
11
+ #define NDL_ESZ(lp,i) ((lp)->args[i].elmsz)
12
+ #define NDL_SHAPE(lp,i) ((lp)->args[i].shape)
13
+
14
+ #define INIT_PTR( lp, i, pt, st ) \
15
+ { \
16
+ pt = ((lp)->args[i]).ptr + ((lp)->args[i].iter[0]).pos; \
17
+ st = ((lp)->args[i].iter[0]).step; \
18
+ }
19
+
20
+ #define INIT_PTR_IDX( lp, i, pt, st, id ) \
21
+ { \
22
+ pt = ((lp)->args[i]).ptr + ((lp)->args[i].iter[0]).pos; \
23
+ st = ((lp)->args[i].iter[0]).step; \
24
+ id = ((lp)->args[i].iter[0]).idx; \
25
+ }
26
+
27
+ #define INIT_ELMSIZE( lp, i, es ) \
28
+ { \
29
+ es = ((lp)->args[i]).elmsz; \
30
+ }
31
+
32
+ #define INIT_PTR_BIT( lp, i, ad, ps, st ) \
33
+ { \
34
+ ps = ((lp)->args[i].iter[0]).pos; \
35
+ ad = (BIT_DIGIT*)(((lp)->args[i]).ptr) + ps/NB; \
36
+ ps %= NB; \
37
+ st = ((lp)->args[i].iter[0]).step; \
38
+ }
39
+
40
+ #define INIT_PTR_BIT_IDX( lp, i, ad, ps, st, id ) \
41
+ { \
42
+ ps = ((lp)->args[i].iter[0]).pos; \
43
+ ad = (BIT_DIGIT*)(((lp)->args[i]).ptr) + ps/NB; \
44
+ ps %= NB; \
45
+ st = ((lp)->args[i].iter[0]).step; \
46
+ id = ((lp)->args[i].iter[0]).idx; \
47
+ }
48
+
49
/*
 * Element access helpers.  ptr is a byte pointer; type is the element type
 * stored at that address.  Every macro argument is parenthesized so that
 * expression arguments keep their meaning inside the expansion.  The
 * *_STRIDE and *_INDEX forms modify ptr / idx, which must be lvalues.
 */

/* val = element at ptr. */
#define GET_DATA( ptr, type, val )                 \
    {                                              \
        (val) = *(type*)(ptr);                     \
    }

/* element at ptr = val. */
#define SET_DATA( ptr, type, val )                 \
    {                                              \
        *(type*)(ptr) = (val);                     \
    }

/* val = element at ptr, then advance ptr by step bytes. */
#define GET_DATA_STRIDE( ptr, step, type, val )    \
    {                                              \
        (val) = *(type*)(ptr);                     \
        (ptr) += (step);                           \
    }

/* val = element at byte offset *idx from ptr, then advance idx. */
#define GET_DATA_INDEX( ptr, idx, type, val )      \
    {                                              \
        (val) = *(type*)((ptr) + *(idx));          \
        (idx)++;                                   \
    }

/* element at ptr = val, then advance ptr by step bytes. */
#define SET_DATA_STRIDE( ptr, step, type, val )    \
    {                                              \
        *(type*)(ptr) = (val);                     \
        (ptr) += (step);                           \
    }

/* element at byte offset *idx from ptr = val, then advance idx. */
#define SET_DATA_INDEX( ptr, idx, type, val )      \
    {                                              \
        *(type*)((ptr) + *(idx)) = (val);          \
        (idx)++;                                   \
    }
82
+
83
/*
 * Single-bit access into an array of BIT_DIGIT words.  A bit position pos
 * selects word pos / NB and bit pos % NB inside that word.
 *
 * In the *_STEP forms a non-NULL idx supplies a per-element offset that is
 * added to pos (idx is then advanced and pos is left alone); with a NULL idx
 * the bit at pos is used and pos is advanced by step.  pos and idx must be
 * lvalues there.
 *
 * The stored value is converted to BIT_DIGIT before it is shifted, so
 * writing 1 to the top bit of a word never shifts into the sign bit of an
 * int.  NOTE: val is still expected to be 0 or 1 -- it is not masked, so
 * higher bits in val would spill into neighbouring positions
 * (val -> val&1 ??).
 */

/* val = bit at position pos of adr (0 or 1). */
#define LOAD_BIT( adr, pos, val )                       \
    {                                                   \
        size_t dig = (pos) / NB;                        \
        int    bit = (pos) % NB;                        \
        val = (((BIT_DIGIT*)(adr))[dig]>>(bit)) & 1u;   \
    }

/* val = bit at the current position, then advance idx or pos. */
#define LOAD_BIT_STEP( adr, pos, step, idx, val )       \
    {                                                   \
        size_t dig; int bit;                            \
        if (idx) {                                      \
            dig = ((pos) + *(idx)) / NB;                \
            bit = ((pos) + *(idx)) % NB;                \
            idx++;                                      \
        } else {                                        \
            dig = (pos) / NB;                           \
            bit = (pos) % NB;                           \
            pos += step;                                \
        }                                               \
        val = (((BIT_DIGIT*)(adr))[dig]>>bit) & 1u;     \
    }

/* bit at position pos of adr = val. */
#define STORE_BIT(adr,pos,val)                          \
    {                                                   \
        size_t dig = (pos) / NB;                        \
        int    bit = (pos) % NB;                        \
        ((BIT_DIGIT*)(adr))[dig] =                      \
            (((BIT_DIGIT*)(adr))[dig] & ~(1u<<(bit)))   \
            | ((BIT_DIGIT)(val)<<(bit));                \
    }

/* bit at the current position = val, then advance idx or pos. */
#define STORE_BIT_STEP( adr, pos, step, idx, val )      \
    {                                                   \
        size_t dig; int bit;                            \
        if (idx) {                                      \
            dig = ((pos) + *(idx)) / NB;                \
            bit = ((pos) + *(idx)) % NB;                \
            idx++;                                      \
        } else {                                        \
            dig = (pos) / NB;                           \
            bit = (pos) % NB;                           \
            pos += step;                                \
        }                                               \
        ((BIT_DIGIT*)(adr))[dig] =                      \
            (((BIT_DIGIT*)(adr))[dig] & ~(1u<<(bit)))   \
            | ((BIT_DIGIT)(val)<<(bit));                \
    }
130
+
131
/*
 * Return nonzero when the address held in ptr has none of the bits of
 * (alignment - 1) set.  With a power-of-two alignment this means the address
 * is a multiple of alignment.
 */
static inline int
is_aligned(const void *ptr, const size_t alignment)
{
    const size_t low_bits_mask = alignment - 1;
    const size_t address = (size_t)ptr;

    return (address & low_bits_mask) == 0;
}
136
+
137
/*
 * Return nonzero when step has none of the bits of (alignment - 1) set.
 * With a power-of-two alignment this means step is a multiple of alignment;
 * negative steps are tested on their converted unsigned value.
 */
static inline int
is_aligned_step(const ssize_t step, const size_t alignment)
{
    const size_t low_bits_mask = alignment - 1;

    return (step & low_bits_mask) == 0;
}
142
+
143
/*
 * Write c_str to stderr the first time control reaches this expansion and
 * stay silent on later passes.  Each expansion owns its own static flag, so
 * "once" is per call site.
 *
 * The text is written with fputs() rather than used as a printf format, so a
 * '%' inside a method or dtype name is printed literally instead of being
 * interpreted as a conversion.
 */
#define SHOW_WARNING_ONCE( c_str )                      \
    {                                                   \
        static bool show_warning = true;                \
        if (show_warning) {                             \
            fputs((c_str), stderr);                     \
            show_warning = false;                       \
        }                                               \
    }

/* One-time warning for a method that synchronizes with the CPU and is marked
 * FIXME.  func_name and type_name must be string literals; they are
 * concatenated into the message. */
#define SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE( func_name, type_name ) \
    SHOW_WARNING_ONCE("Warning: FIXME: Method \"" func_name "\" for dtype \"" type_name "\" synchronizes with CPU.\n")

/* One-time warning for a method that synchronizes with the CPU.  func_name
 * and type_name must be string literals. */
#define SHOW_SYNCHRONIZE_WARNING_ONCE( func_name, type_name ) \
    SHOW_WARNING_ONCE("Warning: Method \"" func_name "\" for dtype \"" type_name "\" synchronizes with CPU.\n")
157
+
158
+ #endif /* ifndef CUMO_TEMPLATE_H */