cumo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +27 -0
  3. data/.travis.yml +5 -0
  4. data/3rd_party/mkmf-cu/.gitignore +36 -0
  5. data/3rd_party/mkmf-cu/Gemfile +3 -0
  6. data/3rd_party/mkmf-cu/LICENSE +21 -0
  7. data/3rd_party/mkmf-cu/README.md +36 -0
  8. data/3rd_party/mkmf-cu/Rakefile +11 -0
  9. data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
  10. data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
  11. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
  12. data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
  13. data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
  14. data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
  15. data/CODE_OF_CONDUCT.md +46 -0
  16. data/Gemfile +8 -0
  17. data/LICENSE.txt +82 -0
  18. data/README.md +252 -0
  19. data/Rakefile +43 -0
  20. data/bench/broadcast_fp32.rb +138 -0
  21. data/bench/cumo_bench.rb +193 -0
  22. data/bench/numo_bench.rb +138 -0
  23. data/bench/reduction_fp32.rb +117 -0
  24. data/bin/console +14 -0
  25. data/bin/setup +8 -0
  26. data/cumo.gemspec +32 -0
  27. data/ext/cumo/cuda/cublas.c +278 -0
  28. data/ext/cumo/cuda/driver.c +421 -0
  29. data/ext/cumo/cuda/memory_pool.cpp +185 -0
  30. data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
  31. data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
  32. data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
  33. data/ext/cumo/cuda/nvrtc.c +207 -0
  34. data/ext/cumo/cuda/runtime.c +167 -0
  35. data/ext/cumo/cumo.c +148 -0
  36. data/ext/cumo/depend.erb +58 -0
  37. data/ext/cumo/extconf.rb +179 -0
  38. data/ext/cumo/include/cumo.h +25 -0
  39. data/ext/cumo/include/cumo/compat.h +23 -0
  40. data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
  41. data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
  42. data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
  43. data/ext/cumo/include/cumo/cuda/driver.h +22 -0
  44. data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
  45. data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
  46. data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
  47. data/ext/cumo/include/cumo/indexer.h +238 -0
  48. data/ext/cumo/include/cumo/intern.h +142 -0
  49. data/ext/cumo/include/cumo/intern_fwd.h +38 -0
  50. data/ext/cumo/include/cumo/intern_kernel.h +6 -0
  51. data/ext/cumo/include/cumo/narray.h +429 -0
  52. data/ext/cumo/include/cumo/narray_kernel.h +149 -0
  53. data/ext/cumo/include/cumo/ndloop.h +95 -0
  54. data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
  55. data/ext/cumo/include/cumo/template.h +158 -0
  56. data/ext/cumo/include/cumo/template_kernel.h +77 -0
  57. data/ext/cumo/include/cumo/types/bit.h +40 -0
  58. data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
  59. data/ext/cumo/include/cumo/types/complex.h +402 -0
  60. data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
  61. data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
  62. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
  63. data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
  64. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
  65. data/ext/cumo/include/cumo/types/dfloat.h +47 -0
  66. data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
  67. data/ext/cumo/include/cumo/types/float_def.h +34 -0
  68. data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
  69. data/ext/cumo/include/cumo/types/float_macro.h +191 -0
  70. data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
  71. data/ext/cumo/include/cumo/types/int16.h +24 -0
  72. data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
  73. data/ext/cumo/include/cumo/types/int32.h +24 -0
  74. data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
  75. data/ext/cumo/include/cumo/types/int64.h +24 -0
  76. data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
  77. data/ext/cumo/include/cumo/types/int8.h +24 -0
  78. data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
  79. data/ext/cumo/include/cumo/types/int_macro.h +67 -0
  80. data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
  81. data/ext/cumo/include/cumo/types/real_accum.h +486 -0
  82. data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
  83. data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
  84. data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
  85. data/ext/cumo/include/cumo/types/robject.h +27 -0
  86. data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
  87. data/ext/cumo/include/cumo/types/scomplex.h +46 -0
  88. data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
  89. data/ext/cumo/include/cumo/types/sfloat.h +48 -0
  90. data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
  91. data/ext/cumo/include/cumo/types/uint16.h +25 -0
  92. data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
  93. data/ext/cumo/include/cumo/types/uint32.h +25 -0
  94. data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
  95. data/ext/cumo/include/cumo/types/uint64.h +25 -0
  96. data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
  97. data/ext/cumo/include/cumo/types/uint8.h +25 -0
  98. data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
  99. data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
  100. data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
  101. data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
  102. data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
  103. data/ext/cumo/narray/SFMT-params.h +97 -0
  104. data/ext/cumo/narray/SFMT-params19937.h +46 -0
  105. data/ext/cumo/narray/SFMT.c +620 -0
  106. data/ext/cumo/narray/SFMT.h +167 -0
  107. data/ext/cumo/narray/array.c +638 -0
  108. data/ext/cumo/narray/data.c +961 -0
  109. data/ext/cumo/narray/gen/cogen.rb +56 -0
  110. data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
  111. data/ext/cumo/narray/gen/def/bit.rb +37 -0
  112. data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
  113. data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
  114. data/ext/cumo/narray/gen/def/int16.rb +36 -0
  115. data/ext/cumo/narray/gen/def/int32.rb +36 -0
  116. data/ext/cumo/narray/gen/def/int64.rb +36 -0
  117. data/ext/cumo/narray/gen/def/int8.rb +36 -0
  118. data/ext/cumo/narray/gen/def/robject.rb +37 -0
  119. data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
  120. data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
  121. data/ext/cumo/narray/gen/def/uint16.rb +36 -0
  122. data/ext/cumo/narray/gen/def/uint32.rb +36 -0
  123. data/ext/cumo/narray/gen/def/uint64.rb +36 -0
  124. data/ext/cumo/narray/gen/def/uint8.rb +36 -0
  125. data/ext/cumo/narray/gen/erbpp2.rb +346 -0
  126. data/ext/cumo/narray/gen/narray_def.rb +268 -0
  127. data/ext/cumo/narray/gen/spec.rb +425 -0
  128. data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
  129. data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
  130. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
  131. data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
  132. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
  133. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
  134. data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
  135. data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
  136. data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
  137. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
  138. data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
  139. data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
  140. data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
  141. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
  142. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
  143. data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
  144. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
  145. data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
  146. data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
  147. data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
  148. data/ext/cumo/narray/gen/tmpl/class.c +9 -0
  149. data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
  150. data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
  151. data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
  152. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
  153. data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
  154. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
  155. data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
  156. data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
  157. data/ext/cumo/narray/gen/tmpl/each.c +47 -0
  158. data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
  159. data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
  160. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
  161. data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
  162. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
  163. data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
  164. data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
  165. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
  166. data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
  167. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
  168. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
  169. data/ext/cumo/narray/gen/tmpl/format.c +62 -0
  170. data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
  171. data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
  172. data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
  173. data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
  174. data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
  175. data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
  176. data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
  177. data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
  178. data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
  179. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
  180. data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
  181. data/ext/cumo/narray/gen/tmpl/median.c +66 -0
  182. data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
  183. data/ext/cumo/narray/gen/tmpl/module.c +9 -0
  184. data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
  185. data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
  186. data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
  187. data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
  188. data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
  189. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
  190. data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
  191. data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
  192. data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
  193. data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
  194. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
  195. data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
  196. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
  197. data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
  198. data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
  199. data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
  200. data/ext/cumo/narray/gen/tmpl/store.c +41 -0
  201. data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
  202. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
  203. data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
  204. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
  205. data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
  206. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
  207. data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
  208. data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
  209. data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
  210. data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
  211. data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
  212. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
  213. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
  214. data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
  215. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
  216. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
  217. data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
  218. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
  219. data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
  220. data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
  221. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
  222. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
  223. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
  224. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
  225. data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
  226. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
  227. data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
  228. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
  229. data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
  230. data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
  231. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
  232. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
  233. data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
  234. data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
  235. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
  236. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
  237. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
  238. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
  239. data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
  240. data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
  241. data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
  242. data/ext/cumo/narray/index.c +880 -0
  243. data/ext/cumo/narray/kwargs.c +153 -0
  244. data/ext/cumo/narray/math.c +142 -0
  245. data/ext/cumo/narray/narray.c +1948 -0
  246. data/ext/cumo/narray/ndloop.c +2105 -0
  247. data/ext/cumo/narray/rand.c +45 -0
  248. data/ext/cumo/narray/step.c +474 -0
  249. data/ext/cumo/narray/struct.c +886 -0
  250. data/lib/cumo.rb +3 -0
  251. data/lib/cumo/cuda.rb +11 -0
  252. data/lib/cumo/cuda/compile_error.rb +36 -0
  253. data/lib/cumo/cuda/compiler.rb +161 -0
  254. data/lib/cumo/cuda/device.rb +47 -0
  255. data/lib/cumo/cuda/link_state.rb +31 -0
  256. data/lib/cumo/cuda/module.rb +40 -0
  257. data/lib/cumo/cuda/nvrtc_program.rb +27 -0
  258. data/lib/cumo/linalg.rb +12 -0
  259. data/lib/cumo/narray.rb +2 -0
  260. data/lib/cumo/narray/extra.rb +1278 -0
  261. data/lib/erbpp.rb +294 -0
  262. data/lib/erbpp/line_number.rb +137 -0
  263. data/lib/erbpp/narray_def.rb +381 -0
  264. data/numo-narray-version +1 -0
  265. data/run.gdb +7 -0
  266. metadata +353 -0
@@ -0,0 +1,97 @@
1
+ <% unless type_name == 'robject' %>
2
+ void <%="cumo_#{c_iter}_kernel_launch"%>(char *p1, char *p2, char *p3, ssize_t s1, ssize_t s2, ssize_t s3, uint64_t n);
3
+ void <%="cumo_#{c_iter}_int32_kernel_launch"%>(char *p1, char *p2, char *p3, ssize_t s1, ssize_t s2, ssize_t s3, uint64_t n);
4
+ <% end %>
5
+
6
+ static void
7
+ <%=c_iter%>(na_loop_t *const lp)
8
+ {
9
+ size_t i;
10
+ char *p1, *p2, *p3;
11
+ ssize_t s1, s2, s3;
12
+ INIT_COUNTER(lp, i);
13
+ INIT_PTR(lp, 0, p1, s1);
14
+ INIT_PTR(lp, 1, p2, s2);
15
+ INIT_PTR(lp, 2, p3, s3);
16
+ <% if type_name == 'robject' %>
17
+ {
18
+ dtype x, y;
19
+ SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
20
+ for (; i--;) {
21
+ GET_DATA_STRIDE(p1,s1,dtype,x);
22
+ GET_DATA_STRIDE(p2,s2,dtype,y);
23
+ x = m_pow(x,y);
24
+ SET_DATA_STRIDE(p3,s3,dtype,x);
25
+ }
26
+ }
27
+ <% else %>
28
+ <%="cumo_#{c_iter}_kernel_launch"%>(p1,p2,p3,s1,s2,s3,i);
29
+ <% end %>
30
+ }
31
+
32
+ static void
33
+ <%=c_iter%>_int32(na_loop_t *const lp)
34
+ {
35
+ size_t i;
36
+ char *p1, *p2, *p3;
37
+ ssize_t s1, s2, s3;
38
+ INIT_COUNTER(lp, i);
39
+ INIT_PTR(lp, 0, p1, s1);
40
+ INIT_PTR(lp, 1, p2, s2);
41
+ INIT_PTR(lp, 2, p3, s3);
42
+ <% if type_name == 'robject' %>
43
+ {
44
+ dtype x;
45
+ int32_t y;
46
+ SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>_int32", "<%=type_name%>");
47
+ for (; i--;) {
48
+ GET_DATA_STRIDE(p1,s1,dtype,x);
49
+ GET_DATA_STRIDE(p2,s2,int32_t,y);
50
+ x = m_pow_int(x,y);
51
+ SET_DATA_STRIDE(p3,s3,dtype,x);
52
+ }
53
+ }
54
+ <% else %>
55
+ <%="cumo_#{c_iter}_int32_kernel_launch"%>(p1,p2,p3,s1,s2,s3,i);
56
+ <% end %>
57
+ }
58
+
59
+ static VALUE
60
+ <%=c_func%>_self(VALUE self, VALUE other)
61
+ {
62
+ ndfunc_arg_in_t ain[2] = {{cT,0},{cT,0}};
63
+ ndfunc_arg_in_t ain_i[2] = {{cT,0},{cumo_cInt32,0}};
64
+ ndfunc_arg_out_t aout[1] = {{cT,0}};
65
+ ndfunc_t ndf = { <%=c_iter%>, STRIDE_LOOP, 2, 1, ain, aout };
66
+ ndfunc_t ndf_i = { <%=c_iter%>_int32, STRIDE_LOOP, 2, 1, ain_i, aout };
67
+
68
+ // fixme : use na.integer?
69
+ if (FIXNUM_P(other) || rb_obj_is_kind_of(other,cumo_cInt32)) {
70
+ return na_ndloop(&ndf_i, 2, self, other);
71
+ } else {
72
+ return na_ndloop(&ndf, 2, self, other);
73
+ }
74
+ }
75
+
76
+ /*
77
+ Binary power.
78
+ @overload <%=op_map%> other
79
+ @param [Cumo::NArray,Numeric] other
80
+ @return [Cumo::NArray] self to the other-th power.
81
+ */
82
+ static VALUE
83
+ <%=c_func(1)%>(VALUE self, VALUE other)
84
+ {
85
+ <% if is_object %>
86
+ return <%=c_func%>_self(self,other);
87
+ <% else %>
88
+ VALUE klass, v;
89
+ klass = na_upcast(CLASS_OF(self),CLASS_OF(other));
90
+ if (klass==cT) {
91
+ return <%=c_func%>_self(self,other);
92
+ } else {
93
+ v = rb_funcall(klass, id_cast, 1, self);
94
+ return rb_funcall(v, id_pow, 1, other);
95
+ }
96
+ <% end %>
97
+ }
@@ -0,0 +1,29 @@
1
+ <% unless type_name == 'robject' %>
2
/*
  Element-wise power kernel: out[i] = m_pow(base[i], exponent[i]) for i in [0, n).
  p1/p2/p3 are byte pointers to base, exponent and output; s1/s2/s3 are the
  corresponding byte steps between consecutive elements.
*/
__global__ void <%="cumo_#{c_iter}_kernel"%>(char *p1, char *p2, char *p3, ssize_t s1, ssize_t s2, ssize_t s3, uint64_t n)
{
    /* Widen before multiplying: the built-in index/dimension fields are 32-bit
       unsigned, so their product could wrap before being stored in 64 bits. */
    const uint64_t first = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
    const uint64_t step = (uint64_t)blockDim.x * gridDim.x;

    for (uint64_t i = first; i < n; i += step) {
        *(dtype*)(p3 + (i * s3)) = m_pow(*(dtype*)(p1 + (i * s1)), *(dtype*)(p2 + (i * s2)));
    }
}
8
+
9
/*
  Element-wise power kernel with an int32 exponent:
  out[i] = m_pow_int(base[i], exponent[i]) for i in [0, n).
  p1/p3 address dtype elements, p2 addresses int32_t elements; s1/s2/s3 are
  the byte steps between consecutive elements.
*/
__global__ void <%="cumo_#{c_iter}_int32_kernel"%>(char *p1, char *p2, char *p3, ssize_t s1, ssize_t s2, ssize_t s3, uint64_t n)
{
    /* Widen before multiplying: the built-in index/dimension fields are 32-bit
       unsigned, so their product could wrap before being stored in 64 bits. */
    const uint64_t first = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
    const uint64_t step = (uint64_t)blockDim.x * gridDim.x;

    for (uint64_t i = first; i < n; i += step) {
        *(dtype*)(p3 + (i * s3)) = m_pow_int(*(dtype*)(p1 + (i * s1)), *(int32_t*)(p2 + (i * s2)));
    }
}
15
+
16
/* Launches the generic power kernel over n elements, sizing the grid and block from n. */
void <%="cumo_#{c_iter}_kernel_launch"%>(char *p1, char *p2, char *p3, ssize_t s1, ssize_t s2, ssize_t s3, uint64_t n)
{
    const size_t grid_size = get_gridDim(n);
    const size_t block_size = get_blockDim(n);

    <%="cumo_#{c_iter}_kernel"%><<<grid_size, block_size>>>(p1,p2,p3,s1,s2,s3,n);
}
22
+
23
/* Launches the int32-exponent power kernel over n elements, sizing the grid and block from n. */
void <%="cumo_#{c_iter}_int32_kernel_launch"%>(char *p1, char *p2, char *p3, ssize_t s1, ssize_t s2, ssize_t s3, uint64_t n)
{
    const size_t grid_size = get_gridDim(n);
    const size_t block_size = get_blockDim(n);

    <%="cumo_#{c_iter}_int32_kernel"%><<<grid_size, block_size>>>(p1,p2,p3,s1,s2,s3,n);
}
29
+ <% end %>
@@ -0,0 +1,17 @@
1
+ static dtype pow_<%=type_name%>(dtype x, int p)
2
+ {
3
+ dtype r = m_one;
4
+ switch(p) {
5
+ case 2: return m_square(x);
6
+ case 3: return m_mul(m_square(x),x);
7
+ case 1: return x;
8
+ case 0: return m_one;
9
+ }
10
+ if (p<0) return m_zero;
11
+ while (p) {
12
+ if ((p%2) == 1) r = m_mul(r,x);
13
+ x = m_square(x);
14
+ p /= 2;
15
+ }
16
+ return r;
17
+ }
@@ -0,0 +1,212 @@
1
+ /*
2
+ qsort.c
3
+ Numerical Array Extension for Ruby
4
+ modified by Masahiro TANAKA
5
+ */
6
+
7
+ /*
8
+ * qsort.c: standard quicksort algorithm
9
+ *
10
+ * Modifications from vanilla NetBSD source:
11
+ * Add do ... while() macro fix
12
+ * Remove __inline, _DIAGASSERTs, __P
13
+ * Remove ill-considered "swap_cnt" switch to insertion sort,
14
+ * in favor of a simple check for presorted input.
15
+ *
16
+ * CAUTION: if you change this file, see also qsort_arg.c
17
+ *
18
+ * $PostgreSQL: pgsql/src/port/qsort.c,v 1.12 2006/10/19 20:56:22 tgl Exp $
19
+ */
20
+
21
+ /* $NetBSD: qsort.c,v 1.13 2003/08/07 16:43:42 agc Exp $ */
22
+
23
+ /*-
24
+ * Copyright (c) 1992, 1993
25
+ * The Regents of the University of California. All rights reserved.
26
+ *
27
+ * Redistribution and use in source and binary forms, with or without
28
+ * modification, are permitted provided that the following conditions
29
+ * are met:
30
+ * 1. Redistributions of source code must retain the above copyright
31
+ * notice, this list of conditions and the following disclaimer.
32
+ * 2. Redistributions in binary form must reproduce the above copyright
33
+ * notice, this list of conditions and the following disclaimer in the
34
+ * documentation and/or other materials provided with the distribution.
35
+ * 3. Neither the name of the University nor the names of its contributors
36
+ * may be used to endorse or promote products derived from this software
37
+ * without specific prior written permission.
38
+ *
39
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
40
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
42
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
43
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
44
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
45
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
46
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
47
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
48
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
49
+ * SUCH DAMAGE.
50
+ */
51
+
52
#ifndef QSORT_INCL
#define QSORT_INCL
#define Min(x, y)       ((x) < (y) ? (x) : (y))

/*
 * Qsort routine based on J. L. Bentley and M. D. McIlroy,
 * "Engineering a sort function",
 * Software--Practice and Experience 23 (1993) 1249-1265.
 * We have modified their original by adding a check for already-sorted input,
 * which seems to be a win per discussions on pgsql-hackers around 2006-03-21.
 */

/* Exchange n bytes between parmi and parmj, moving one TYPE-sized unit at a
 * time. n must be a non-zero multiple of sizeof(TYPE). */
#define swapcode(TYPE, parmi, parmj, n)         \
    do {                                        \
        size_t i = (n) / sizeof (TYPE);         \
        TYPE *pi = (TYPE *)(void *)(parmi);     \
        TYPE *pj = (TYPE *)(void *)(parmj);     \
        do {                                    \
            TYPE t = *pi;                       \
            *pi++ = *pj;                        \
            *pj++ = t;                          \
        } while (--i > 0);                      \
    } while (0)

/* Sets the caller's `swaptype`: 2 when the base address or element size is not
 * a multiple of sizeof(long) (swap bytewise), 0 when an element is exactly one
 * long, 1 when it is several longs. */
#define SWAPINIT(a, es) swaptype = ((char *)(a) - (char *)0) % sizeof(long) || \
    (es) % sizeof(long) ? 2 : (es) == sizeof(long)? 0 : 1;

/* Exchange n bytes between a and b, long-wise when swaptype <= 1 and
 * byte-wise otherwise. */
static inline void
swapfunc(char *a, char *b, size_t n, int swaptype)
{
    if (swaptype <= 1)
        swapcode(long, a, b, n);
    else
        swapcode(char, a, b, n);
}

/* Exchange one element; relies on `swaptype` and `es` in the caller's scope. */
#define swap(a, b)                                          \
    do {                                                    \
        if (swaptype == 0) {                                \
            long t = *(long *)(void *)(a);                  \
            *(long *)(void *)(a) = *(long *)(void *)(b);    \
            *(long *)(void *)(b) = t;                       \
        } else                                              \
            swapfunc(a, b, es, swaptype);                   \
    } while (0)

/* Exchange n bytes when n is positive; wrapped in do/while so that a following
 * `else` cannot attach to the macro's internal `if`. */
#define vecswap(a, b, n)                                        \
    do {                                                        \
        if ((n) > 0)                                            \
            swapfunc((a), (b), (size_t)(n), swaptype);          \
    } while (0)

/* Median of three element pointers using the cmpgt() macro supplied by the
 * including template; the _cmp argument is unused. */
#define med3(a,b,c,_cmp)                                    \
    (cmpgt(b,a) ?                                           \
     (cmpgt(c,b) ? b : (cmpgt(c,a) ? c : a))                \
     : (cmpgt(b,c) ? b : (cmpgt(c,a) ? a : c)))
#endif
106
+
107
+ #undef qsort_dtype
108
+ #define qsort_dtype <%=dtype%>
109
+ #undef qsort_cast
110
+ #define qsort_cast <%=dcast%>
111
+ <% if "#{suffix}" != "" %>
112
+ #undef cmp
113
+ #undef cmpgt
114
+ #define cmp(a,b) cmp<%=suffix%>(a,b)
115
+ #define cmpgt(a,b) cmpgt<%=suffix%>(a,b)
116
+ <% end %>
117
+ <% c_func(:nodef)%>
118
+
119
+ static void
120
+ <%=type_name%>_qsort<%=suffix%>(void *a, size_t n, ssize_t es)
121
+ {
122
+ char *pa,
123
+ *pb,
124
+ *pc,
125
+ *pd,
126
+ *pl,
127
+ *pm,
128
+ *pn;
129
+ int d,
130
+ r,
131
+ swaptype,
132
+ presorted;
133
+
134
+ loop:SWAPINIT(a, es);
135
+ if (n < 7)
136
+ {
137
+ for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
138
+ for (pl = pm; pl > (char *) a && cmpgt(pl - es, pl);
139
+ pl -= es)
140
+ swap(pl, pl - es);
141
+ return;
142
+ }
143
+ presorted = 1;
144
+ for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
145
+ {
146
+ if (cmpgt(pm - es, pm))
147
+ {
148
+ presorted = 0;
149
+ break;
150
+ }
151
+ }
152
+ if (presorted)
153
+ return;
154
+ pm = (char *) a + (n / 2) * es;
155
+ if (n > 7)
156
+ {
157
+ pl = (char *) a;
158
+ pn = (char *) a + (n - 1) * es;
159
+ if (n > 40)
160
+ {
161
+ d = (n / 8) * es;
162
+ pl = med3(pl, pl + d, pl + 2 * d, cmp);
163
+ pm = med3(pm - d, pm, pm + d, cmp);
164
+ pn = med3(pn - 2 * d, pn - d, pn, cmp);
165
+ }
166
+ pm = med3(pl, pm, pn, cmp);
167
+ }
168
+ swap(a, pm);
169
+ pa = pb = (char *) a + es;
170
+ pc = pd = (char *) a + (n - 1) * es;
171
+ for (;;)
172
+ {
173
+ while (pb <= pc && (r = cmp(pb, a)) <= 0)
174
+ {
175
+ if (r == 0)
176
+ {
177
+ swap(pa, pb);
178
+ pa += es;
179
+ }
180
+ pb += es;
181
+ }
182
+ while (pb <= pc && (r = cmp(pc, a)) >= 0)
183
+ {
184
+ if (r == 0)
185
+ {
186
+ swap(pc, pd);
187
+ pd -= es;
188
+ }
189
+ pc -= es;
190
+ }
191
+ if (pb > pc)
192
+ break;
193
+ swap(pb, pc);
194
+ pb += es;
195
+ pc -= es;
196
+ }
197
+ pn = (char *) a + n * es;
198
+ r = Min(pa - (char *) a, pb - pa);
199
+ vecswap(a, pb - r, r);
200
+ r = Min(pd - pc, pn - pd - es);
201
+ vecswap(pb, pn - r, r);
202
+ if ((r = pb - pa) > es)
203
+ <%=type_name%>_qsort<%=suffix%>(a, r / es, es);
204
+ if ((r = pd - pc) > es)
205
+ {
206
+ /* Iterate rather than recurse to save stack space */
207
+ a = pn - r;
208
+ n = r / es;
209
+ goto loop;
210
+ }
211
+ /* qsort(pn - r, r / es, es, cmp);*/
212
+ }
@@ -0,0 +1,168 @@
1
+ <%
2
+ if is_int && !is_object
3
+ if /Int64$/ =~ class_name
4
+ rand_bit = 64
5
+ else
6
+ rand_bit = 32
7
+ end
8
+ m_rand = "m_rand(max,shift)"
9
+ shift_def = "int shift;"
10
+ shift_set = "shift = #{rand_bit-1} - msb_pos(max);"
11
+ rand_type = "uint#{rand_bit}_t"
12
+ %>
13
+
14
+ #define HWID (sizeof(dtype)*4)
15
+
16
+ static int msb_pos(<%=rand_type%> a)
17
+ {
18
+ int width = HWID;
19
+ int pos = 0;
20
+ <%=rand_type%> mask = (((dtype)1 << HWID)-1) << HWID;
21
+
22
+ if (a==0) {return -1;}
23
+
24
+ while (width) {
25
+ if (a & mask) {
26
+ pos += width;
27
+ } else {
28
+ mask >>= width;
29
+ }
30
+ width >>= 1;
31
+ mask &= mask << width;
32
+ }
33
+ return pos;
34
+ }
35
+
36
+ /* generates a random number on [0,max) */
37
+ <% if rand_bit == 64 %>
38
+ inline static dtype m_rand(uint64_t max, int shift)
39
+ {
40
+ uint64_t x;
41
+ do {
42
+ x = gen_rand32();
43
+ x <<= 32;
44
+ x |= gen_rand32();
45
+ x >>= shift;
46
+ } while (x >= max);
47
+ return x;
48
+ }
49
+ <% else %>
50
+ inline static dtype m_rand(uint32_t max, int shift)
51
+ {
52
+ uint32_t x;
53
+ do {
54
+ x = gen_rand32();
55
+ x >>= shift;
56
+ } while (x >= max);
57
+ return x;
58
+ }
59
+ <% end %>
60
+ <%
61
+ else
62
+ m_rand = "m_rand(max)"
63
+ shift_def = ""
64
+ shift_set = ""
65
+ rand_type = "dtype"
66
+ end
67
+ %>
68
+
69
+ typedef struct {
70
+ dtype low;
71
+ <%=rand_type%> max;
72
+ } rand_opt_t;
73
+
74
+ static void
75
+ <%=c_iter%>(na_loop_t *const lp)
76
+ {
77
+ size_t i;
78
+ char *p1;
79
+ ssize_t s1;
80
+ size_t *idx1;
81
+ dtype x;
82
+ rand_opt_t *g;
83
+ dtype low;
84
+ <%=rand_type%> max;
85
+ <%=shift_def%>
86
+
87
+ INIT_COUNTER(lp, i);
88
+ INIT_PTR_IDX(lp, 0, p1, s1, idx1);
89
+ g = (rand_opt_t*)(lp->opt_ptr);
90
+ low = g->low;
91
+ max = g->max;
92
+ <%=shift_set%>
93
+
94
+ SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
95
+ SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
96
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
97
+ if (idx1) {
98
+ for (; i--;) {
99
+ x = m_add(<%=m_rand%>,low);
100
+ SET_DATA_INDEX(p1,idx1,dtype,x);
101
+ }
102
+ } else {
103
+ for (; i--;) {
104
+ x = m_add(<%=m_rand%>,low);
105
+ SET_DATA_STRIDE(p1,s1,dtype,x);
106
+ }
107
+ }
108
+ }
109
+
110
+
111
+ /*
112
+ Generate uniformly distributed random numbers on self narray.
113
+ @overload rand([[low],high])
114
+ @param [Numeric] low lower inclusive boundary of random numbers. (default=0)
115
+ @param [Numeric] high upper exclusive boundary of random numbers. (default=1 or 1+1i for complex types)
116
+ @return [Cumo::<%=class_name%>] self.
117
+ @example
118
+ Cumo::DFloat.new(6).rand
119
+ => Cumo::DFloat#shape=[6]
120
+ [0.0617545, 0.373067, 0.794815, 0.201042, 0.116041, 0.344032]
121
+ Cumo::DComplex.new(6).rand(5+5i)
122
+ => Cumo::DComplex#shape=[6]
123
+ [2.69974+3.68908i, 0.825443+0.254414i, 0.540323+0.34354i, 4.52061+2.39322i, ...]
124
+ Cumo::Int32.new(6).rand(2,5)
125
+ => Cumo::Int32#shape=[6]
126
+ [4, 3, 3, 2, 4, 2]
127
+ */
128
+ static VALUE
129
+ <%=c_func(-1)%>(int argc, VALUE *args, VALUE self)
130
+ {
131
+ rand_opt_t g;
132
+ VALUE v1=Qnil, v2=Qnil;
133
+ dtype high;
134
+ ndfunc_arg_in_t ain[1] = {{OVERWRITE,0}};
135
+ ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP, 1,0, ain,0};
136
+
137
+ <% if is_int && !is_object %>
138
+ rb_scan_args(argc, args, "11", &v1, &v2);
139
+ if (v2==Qnil) {
140
+ g.low = m_zero;
141
+ g.max = high = m_num_to_data(v1);
142
+ <% else %>
143
+ rb_scan_args(argc, args, "02", &v1, &v2);
144
+ if (v2==Qnil) {
145
+ g.low = m_zero;
146
+ if (v1==Qnil) {
147
+ <% if is_complex %>
148
+ g.max = high = c_new(1,1);
149
+ <% else %>
150
+ g.max = high = m_one;
151
+ <% end %>
152
+ } else {
153
+ g.max = high = m_num_to_data(v1);
154
+ }
155
+ <% end %>
156
+ } else {
157
+ g.low = m_num_to_data(v1);
158
+ high = m_num_to_data(v2);
159
+ g.max = m_sub(high,g.low);
160
+ }
161
+ <% if is_int && !is_object %>
162
+ if (high <= g.low) {
163
+ rb_raise(rb_eArgError,"high must be larger than low");
164
+ }
165
+ <% end %>
166
+ na_ndloop3(&ndf, &g, 1, self);
167
+ return self;
168
+ }