cumo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +27 -0
  3. data/.travis.yml +5 -0
  4. data/3rd_party/mkmf-cu/.gitignore +36 -0
  5. data/3rd_party/mkmf-cu/Gemfile +3 -0
  6. data/3rd_party/mkmf-cu/LICENSE +21 -0
  7. data/3rd_party/mkmf-cu/README.md +36 -0
  8. data/3rd_party/mkmf-cu/Rakefile +11 -0
  9. data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
  10. data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
  11. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
  12. data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
  13. data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
  14. data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
  15. data/CODE_OF_CONDUCT.md +46 -0
  16. data/Gemfile +8 -0
  17. data/LICENSE.txt +82 -0
  18. data/README.md +252 -0
  19. data/Rakefile +43 -0
  20. data/bench/broadcast_fp32.rb +138 -0
  21. data/bench/cumo_bench.rb +193 -0
  22. data/bench/numo_bench.rb +138 -0
  23. data/bench/reduction_fp32.rb +117 -0
  24. data/bin/console +14 -0
  25. data/bin/setup +8 -0
  26. data/cumo.gemspec +32 -0
  27. data/ext/cumo/cuda/cublas.c +278 -0
  28. data/ext/cumo/cuda/driver.c +421 -0
  29. data/ext/cumo/cuda/memory_pool.cpp +185 -0
  30. data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
  31. data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
  32. data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
  33. data/ext/cumo/cuda/nvrtc.c +207 -0
  34. data/ext/cumo/cuda/runtime.c +167 -0
  35. data/ext/cumo/cumo.c +148 -0
  36. data/ext/cumo/depend.erb +58 -0
  37. data/ext/cumo/extconf.rb +179 -0
  38. data/ext/cumo/include/cumo.h +25 -0
  39. data/ext/cumo/include/cumo/compat.h +23 -0
  40. data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
  41. data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
  42. data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
  43. data/ext/cumo/include/cumo/cuda/driver.h +22 -0
  44. data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
  45. data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
  46. data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
  47. data/ext/cumo/include/cumo/indexer.h +238 -0
  48. data/ext/cumo/include/cumo/intern.h +142 -0
  49. data/ext/cumo/include/cumo/intern_fwd.h +38 -0
  50. data/ext/cumo/include/cumo/intern_kernel.h +6 -0
  51. data/ext/cumo/include/cumo/narray.h +429 -0
  52. data/ext/cumo/include/cumo/narray_kernel.h +149 -0
  53. data/ext/cumo/include/cumo/ndloop.h +95 -0
  54. data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
  55. data/ext/cumo/include/cumo/template.h +158 -0
  56. data/ext/cumo/include/cumo/template_kernel.h +77 -0
  57. data/ext/cumo/include/cumo/types/bit.h +40 -0
  58. data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
  59. data/ext/cumo/include/cumo/types/complex.h +402 -0
  60. data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
  61. data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
  62. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
  63. data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
  64. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
  65. data/ext/cumo/include/cumo/types/dfloat.h +47 -0
  66. data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
  67. data/ext/cumo/include/cumo/types/float_def.h +34 -0
  68. data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
  69. data/ext/cumo/include/cumo/types/float_macro.h +191 -0
  70. data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
  71. data/ext/cumo/include/cumo/types/int16.h +24 -0
  72. data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
  73. data/ext/cumo/include/cumo/types/int32.h +24 -0
  74. data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
  75. data/ext/cumo/include/cumo/types/int64.h +24 -0
  76. data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
  77. data/ext/cumo/include/cumo/types/int8.h +24 -0
  78. data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
  79. data/ext/cumo/include/cumo/types/int_macro.h +67 -0
  80. data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
  81. data/ext/cumo/include/cumo/types/real_accum.h +486 -0
  82. data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
  83. data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
  84. data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
  85. data/ext/cumo/include/cumo/types/robject.h +27 -0
  86. data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
  87. data/ext/cumo/include/cumo/types/scomplex.h +46 -0
  88. data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
  89. data/ext/cumo/include/cumo/types/sfloat.h +48 -0
  90. data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
  91. data/ext/cumo/include/cumo/types/uint16.h +25 -0
  92. data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
  93. data/ext/cumo/include/cumo/types/uint32.h +25 -0
  94. data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
  95. data/ext/cumo/include/cumo/types/uint64.h +25 -0
  96. data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
  97. data/ext/cumo/include/cumo/types/uint8.h +25 -0
  98. data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
  99. data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
  100. data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
  101. data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
  102. data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
  103. data/ext/cumo/narray/SFMT-params.h +97 -0
  104. data/ext/cumo/narray/SFMT-params19937.h +46 -0
  105. data/ext/cumo/narray/SFMT.c +620 -0
  106. data/ext/cumo/narray/SFMT.h +167 -0
  107. data/ext/cumo/narray/array.c +638 -0
  108. data/ext/cumo/narray/data.c +961 -0
  109. data/ext/cumo/narray/gen/cogen.rb +56 -0
  110. data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
  111. data/ext/cumo/narray/gen/def/bit.rb +37 -0
  112. data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
  113. data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
  114. data/ext/cumo/narray/gen/def/int16.rb +36 -0
  115. data/ext/cumo/narray/gen/def/int32.rb +36 -0
  116. data/ext/cumo/narray/gen/def/int64.rb +36 -0
  117. data/ext/cumo/narray/gen/def/int8.rb +36 -0
  118. data/ext/cumo/narray/gen/def/robject.rb +37 -0
  119. data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
  120. data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
  121. data/ext/cumo/narray/gen/def/uint16.rb +36 -0
  122. data/ext/cumo/narray/gen/def/uint32.rb +36 -0
  123. data/ext/cumo/narray/gen/def/uint64.rb +36 -0
  124. data/ext/cumo/narray/gen/def/uint8.rb +36 -0
  125. data/ext/cumo/narray/gen/erbpp2.rb +346 -0
  126. data/ext/cumo/narray/gen/narray_def.rb +268 -0
  127. data/ext/cumo/narray/gen/spec.rb +425 -0
  128. data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
  129. data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
  130. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
  131. data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
  132. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
  133. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
  134. data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
  135. data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
  136. data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
  137. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
  138. data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
  139. data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
  140. data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
  141. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
  142. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
  143. data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
  144. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
  145. data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
  146. data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
  147. data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
  148. data/ext/cumo/narray/gen/tmpl/class.c +9 -0
  149. data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
  150. data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
  151. data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
  152. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
  153. data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
  154. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
  155. data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
  156. data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
  157. data/ext/cumo/narray/gen/tmpl/each.c +47 -0
  158. data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
  159. data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
  160. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
  161. data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
  162. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
  163. data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
  164. data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
  165. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
  166. data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
  167. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
  168. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
  169. data/ext/cumo/narray/gen/tmpl/format.c +62 -0
  170. data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
  171. data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
  172. data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
  173. data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
  174. data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
  175. data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
  176. data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
  177. data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
  178. data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
  179. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
  180. data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
  181. data/ext/cumo/narray/gen/tmpl/median.c +66 -0
  182. data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
  183. data/ext/cumo/narray/gen/tmpl/module.c +9 -0
  184. data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
  185. data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
  186. data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
  187. data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
  188. data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
  189. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
  190. data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
  191. data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
  192. data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
  193. data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
  194. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
  195. data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
  196. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
  197. data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
  198. data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
  199. data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
  200. data/ext/cumo/narray/gen/tmpl/store.c +41 -0
  201. data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
  202. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
  203. data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
  204. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
  205. data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
  206. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
  207. data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
  208. data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
  209. data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
  210. data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
  211. data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
  212. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
  213. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
  214. data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
  215. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
  216. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
  217. data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
  218. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
  219. data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
  220. data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
  221. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
  222. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
  223. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
  224. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
  225. data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
  226. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
  227. data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
  228. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
  229. data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
  230. data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
  231. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
  232. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
  233. data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
  234. data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
  235. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
  236. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
  237. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
  238. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
  239. data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
  240. data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
  241. data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
  242. data/ext/cumo/narray/index.c +880 -0
  243. data/ext/cumo/narray/kwargs.c +153 -0
  244. data/ext/cumo/narray/math.c +142 -0
  245. data/ext/cumo/narray/narray.c +1948 -0
  246. data/ext/cumo/narray/ndloop.c +2105 -0
  247. data/ext/cumo/narray/rand.c +45 -0
  248. data/ext/cumo/narray/step.c +474 -0
  249. data/ext/cumo/narray/struct.c +886 -0
  250. data/lib/cumo.rb +3 -0
  251. data/lib/cumo/cuda.rb +11 -0
  252. data/lib/cumo/cuda/compile_error.rb +36 -0
  253. data/lib/cumo/cuda/compiler.rb +161 -0
  254. data/lib/cumo/cuda/device.rb +47 -0
  255. data/lib/cumo/cuda/link_state.rb +31 -0
  256. data/lib/cumo/cuda/module.rb +40 -0
  257. data/lib/cumo/cuda/nvrtc_program.rb +27 -0
  258. data/lib/cumo/linalg.rb +12 -0
  259. data/lib/cumo/narray.rb +2 -0
  260. data/lib/cumo/narray/extra.rb +1278 -0
  261. data/lib/erbpp.rb +294 -0
  262. data/lib/erbpp/line_number.rb +137 -0
  263. data/lib/erbpp/narray_def.rb +381 -0
  264. data/numo-narray-version +1 -0
  265. data/run.gdb +7 -0
  266. metadata +353 -0
@@ -0,0 +1,48 @@
1
+ static void
2
+ <%=c_iter%>(na_loop_t *const lp)
3
+ { /* Loop callback: reads each bit of argument 0 and yields it to the Ruby block. */
4
+ size_t i;
5
+ BIT_DIGIT *a1, x=0;
6
+ size_t p1;
7
+ ssize_t s1;
8
+ size_t *idx1;
9
+ VALUE y;
10
+
11
+ INIT_COUNTER(lp, i); /* i is the countdown used by the loops below */
12
+ INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1); /* a1/p1/s1/idx1 describe argument 0 (macro defined elsewhere) */
13
+
14
+ SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
15
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize()); /* block until pending device work is done; presumably so the host-side reads below see final data */
16
+
17
+ if (idx1) { /* index-array addressing: the bit position is p1 + *idx1 */
18
+ for (; i--;) {
19
+ LOAD_BIT(a1, p1+*idx1, x); idx1++;
20
+ y = m_data_to_num(x); /* wrap the bit as a Ruby VALUE */
21
+ rb_yield(y);
22
+ }
23
+ } else { /* strided addressing: p1 advances by s1 per element */
24
+ for (; i--;) {
25
+ LOAD_BIT(a1, p1, x); p1+=s1;
26
+ y = m_data_to_num(x);
27
+ rb_yield(y);
28
+ }
29
+ }
30
+ }
31
+
32
+ /*
33
+ Calls the given block once for each element in self,
34
+ passing that element as a parameter.
35
+ @overload <%=name%>
36
+ @return [Cumo::NArray] self
37
+ For a block {|x| ... }
38
+ @yield [x] x is an element of the NArray.
39
+ */
40
+ static VALUE
41
+ <%=c_func(0)%>(VALUE self)
42
+ {
43
+ ndfunc_arg_in_t ain[1] = {{Qnil,0}}; /* single input argument: self */
44
+ ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP_NIP, 1,0, ain,0}; /* presumably nin=1, nout=0 (ndfunc_t layout is declared elsewhere; confirm) */
45
+
46
+ na_ndloop(&ndf, 1, self); /* result of the loop is ignored; the method always returns self */
47
+ return self;
48
+ }
@@ -0,0 +1,70 @@
1
+ static inline void
2
+ yield_each_with_index(dtype x, size_t *c, VALUE *a, int nd, int md)
3
+ { /* Packs the element value followed by its indices into a[] and yields them as one Ruby array. */
4
+ int j;
5
+
6
+ a[0] = m_data_to_num(x); /* slot 0: the element itself */
7
+ for (j=0; j<=nd; j++) { /* inclusive bound: copies c[0..nd], i.e. nd+1 indices, into a[1..nd+1] */
8
+ a[j+1] = SIZET2NUM(c[j]);
9
+ }
10
+ rb_yield(rb_ary_new4(md,a)); /* yields a new Array built from the first md entries of a */
11
+ }
12
+
13
+
14
+ static void
15
+ <%=c_iter%>(na_loop_t *const lp)
16
+ { /* Loop callback: yields each bit of argument 0 together with its multidimensional index. */
17
+ size_t i;
18
+ BIT_DIGIT *a1, x=0;
19
+ size_t p1;
20
+ ssize_t s1;
21
+ size_t *idx1;
22
+
23
+ VALUE *a;
24
+ size_t *c;
25
+ int nd, md;
26
+
27
+ c = (size_t*)(lp->opt_ptr); /* index counters arrive through opt_ptr (presumably set up by na_ndloop_with_index; confirm) */
28
+ nd = lp->ndim - 1; /* c[nd] is the counter this callback resets and advances */
29
+ md = lp->ndim + 1; /* one slot for the value plus ndim slots for the indices */
30
+ a = ALLOCA_N(VALUE,md); /* stack scratch buffer reused for every yield */
31
+
32
+ INIT_COUNTER(lp, i); /* i is the countdown used by the loops below */
33
+ INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
34
+ c[nd] = 0; /* restart the last index for this run of elements */
35
+
36
+ SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
37
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize()); /* block until pending device work is done before the host-side reads below */
38
+
39
+ if (idx1) { /* index-array addressing: the bit position is p1 + *idx1 */
40
+ for (; i--;) {
41
+ LOAD_BIT(a1, p1+*idx1, x); idx1++;
42
+ yield_each_with_index(x,c,a,nd,md);
43
+ c[nd]++;
44
+ }
45
+ } else { /* strided addressing: p1 advances by s1 per element */
46
+ for (; i--;) {
47
+ LOAD_BIT(a1, p1, x); p1+=s1;
48
+ yield_each_with_index(x,c,a,nd,md);
49
+ c[nd]++;
50
+ }
51
+ }
52
+ }
53
+
54
+ /*
55
+ Invokes the given block once for each element of self,
56
+ passing that element and its indices along each axis as parameters.
57
+ @overload <%=name%>
58
+ @return [Cumo::NArray] self
59
+ For a block {|x,i,j,...| ... }
60
+ @yield [x,i,j,...] x is an element, i,j,... are its multidimensional indices.
61
+ */
62
+ static VALUE
63
+ <%=c_func(0)%>(VALUE self)
64
+ {
65
+ ndfunc_arg_in_t ain[1] = {{Qnil,0}}; /* single input argument: self */
66
+ ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP_NIP, 1,0, ain,0}; /* presumably nin=1, nout=0 (ndfunc_t layout is declared elsewhere; confirm) */
67
+
68
+ na_ndloop_with_index(&ndf, 1, self); /* result of the loop is ignored; the method always returns self */
69
+ return self;
70
+ }
@@ -0,0 +1,30 @@
1
+ /*
2
+ Extract an element only if self is a dimensionless NArray.
3
+ @overload extract
4
+ @return [Numeric,Cumo::NArray]
5
+ --- Extract the element value as a Ruby object if self is a dimensionless NArray,
6
+ otherwise return self.
7
+ */
8
+
9
+ // TODO(sonots): Return Cumo::Bit instead of ruby built-in object to avoid synchronization
10
+ static VALUE
11
+ <%=c_func(0)%>(VALUE self)
12
+ {
13
+ BIT_DIGIT *ptr, val;
14
+ size_t pos;
15
+ narray_t *na;
16
+ GetNArray(self,na);
17
+
18
+ if (na->ndim==0) { /* only a dimensionless array is unwrapped */
19
+ pos = na_get_offset(self); /* used below as a bit position relative to ptr */
20
+ ptr = (BIT_DIGIT*)na_get_pointer_for_read(self);
21
+
22
+ SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
23
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize()); /* block until pending device work is done before the host dereferences ptr */
24
+
25
+ val = ((*((ptr)+(pos)/NB)) >> ((pos)%NB)) & 1u; /* word pos/NB, bit pos%NB within that word */
26
+ na_release_lock(self); /* follows na_get_pointer_for_read above (presumably its matching release; confirm) */
27
+ return INT2FIX(val); /* 0 or 1 as a Ruby Integer */
28
+ }
29
+ return self; /* ndim != 0: hand self back unchanged */
30
+ }
@@ -0,0 +1,29 @@
1
+ /*
2
+ Extract an element only if self is a dimensionless NArray.
3
+ @overload extract_cpu
4
+ @return [Numeric,Cumo::NArray]
5
+ --- Extract the element value as a Ruby object if self is a dimensionless NArray,
6
+ otherwise return self.
7
+ */
8
+
9
+ static VALUE
10
+ <%=c_func(0)%>(VALUE self)
11
+ {
12
+ BIT_DIGIT *ptr, val;
13
+ size_t pos;
14
+ narray_t *na;
15
+ GetNArray(self,na);
16
+
17
+ if (na->ndim==0) { /* only a dimensionless array is unwrapped */
18
+ pos = na_get_offset(self); /* used below as a bit position relative to ptr */
19
+ ptr = (BIT_DIGIT*)na_get_pointer_for_read(self);
20
+
21
+ SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
22
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize()); /* block until pending device work is done before the host dereferences ptr */
23
+
24
+ val = ((*((ptr)+(pos)/NB)) >> ((pos)%NB)) & 1u; /* word pos/NB, bit pos%NB within that word */
25
+ na_release_lock(self); /* follows na_get_pointer_for_read above (presumably its matching release; confirm) */
26
+ return INT2FIX(val); /* 0 or 1 as a Ruby Integer */
27
+ }
28
+ return self; /* ndim != 0: hand self back unchanged */
29
+ }
@@ -0,0 +1,69 @@
1
+ static void
2
+ <%=c_iter%>(na_loop_t *const lp)
3
+ { /* Loop callback: stores the bit given in lp->option into every element of argument 0. */
4
+ size_t n;
5
+ size_t p3;
6
+ ssize_t s3;
7
+ size_t *idx3;
8
+ int len;
9
+ BIT_DIGIT *a3;
10
+ BIT_DIGIT y;
11
+ VALUE x = lp->option; /* the fill value passed by the caller */
12
+
13
+ // TODO(sonots): CUDA kernelize
14
+ SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
15
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize()); /* block until pending device work is done before the host-side writes below */
16
+
17
+ if (x==INT2FIX(0) || x==Qfalse) {
18
+ y = 0; /* all bits clear */
19
+ } else
20
+ if (x==INT2FIX(1) || x==Qtrue) {
21
+ y = ~(BIT_DIGIT)0; /* all bits set, so whole words can be stored in the contiguous path */
22
+ } else {
23
+ rb_raise(rb_eArgError, "invalid value for Bit"); /* only 0, 1, false and true are accepted */
24
+ }
25
+
26
+ INIT_COUNTER(lp, n); /* n is the number of bits still to be written */
27
+ INIT_PTR_BIT_IDX(lp, 0, a3, p3, s3, idx3);
28
+ if (idx3) { /* index-array addressing: one STORE_BIT per element */
29
+ y = y&1; /* reduce the word pattern to a single bit */
30
+ for (; n--;) {
31
+ STORE_BIT(a3, p3+*idx3, y); idx3++;
32
+ }
33
+ } else if (s3!=1) { /* non-unit stride: one STORE_BIT per element */
34
+ y = y&1; /* reduce the word pattern to a single bit */
35
+ for (; n--;) {
36
+ STORE_BIT(a3, p3, y); p3+=s3;
37
+ }
38
+ } else { /* unit stride: write word-wise */
39
+ if (p3>0 || n<NB) { /* leading partial word: the run starts mid-word or is shorter than one word */
40
+ len = NB - p3; /* bits from p3 up to the end of the first word */
41
+ if ((int)n<len) len=n; /* never touch more bits than requested */
42
+ *a3 = (y & (SLB(len)<<p3)) | (*a3 & ~(SLB(len)<<p3)); /* replace only the masked bits; assumes SLB(k) is a mask of the k low bits (confirm) */
43
+ a3++;
44
+ n -= len;
45
+ }
46
+ for (; n>=NB; n-=NB) { /* whole words */
47
+ *(a3++) = y;
48
+ }
49
+ if (n>0) { /* trailing partial word */
50
+ *a3 = (y & SLB(n)) | (*a3 & BALL<<n); /* low n bits from y, the rest kept; assumes BALL is an all-ones word (confirm) */
51
+ }
52
+ }
53
+ }
54
+
55
+ /*
56
+ Fill all elements of self with other.
57
+ @overload <%=name%> other
58
+ @param [Numeric] other
59
+ @return [Cumo::<%=class_name%>] self.
60
+ */
61
+ static VALUE
62
+ <%=c_func(1)%>(VALUE self, VALUE val)
63
+ {
64
+ ndfunc_arg_in_t ain[2] = {{OVERWRITE,0},{sym_option}}; /* self is written in place; val reaches the iterator as lp->option */
65
+ ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP, 2,0, ain,0};
66
+
67
+ na_ndloop(&ndf, 2, self, val); /* result of the loop is ignored; the method always returns self */
68
+ return self;
69
+ }
@@ -0,0 +1,64 @@
1
+ static VALUE
2
+ format_<%=type_name%>(VALUE fmt, dtype x)
3
+ { /* Renders one element as a Ruby String: default rendering when fmt is nil, otherwise fmt % value. */
4
+ if (NIL_P(fmt)) {
5
+ char s[4]; /* fixed 4-byte buffer; NOTE(review): relies on m_sprintf writing fewer than 4 bytes - confirm */
6
+ int n;
7
+ n = m_sprintf(s,x); /* n is used as the byte length of the result */
8
+ return rb_str_new(s,n);
9
+ }
10
+ return rb_funcall(fmt, '%', 1, m_data_to_num(x)); /* calls the "%" method on fmt with the element as its argument */
11
+ }
12
+
13
+ static void
14
+ <%=c_iter%>(na_loop_t *const lp)
15
+ { /* Loop callback: formats each bit of argument 0 and stores the resulting VALUE into argument 1. */
16
+ size_t i;
17
+ BIT_DIGIT *a1, x=0;
18
+ size_t p1;
19
+ char *p2;
20
+ ssize_t s1, s2;
21
+ size_t *idx1;
22
+ VALUE y;
23
+ VALUE fmt = lp->option; /* format passed by the caller; may be nil */
24
+
25
+ SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
26
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize()); /* block until pending device work is done before the host-side reads below */
27
+
28
+ INIT_COUNTER(lp, i); /* i is the countdown used by the loops below */
29
+ INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1); /* source bits */
30
+ INIT_PTR(lp, 1, p2, s2); /* destination of the formatted VALUEs */
31
+
32
+ if (idx1) { /* index-array addressing: the bit position is p1 + *idx1 */
33
+ for (; i--;) {
34
+ LOAD_BIT(a1, p1+*idx1, x); idx1++;
35
+ y = format_<%=type_name%>(fmt, x);
36
+ SET_DATA_STRIDE(p2, s2, VALUE, y);
37
+ }
38
+ } else { /* strided addressing: p1 advances by s1 per element */
39
+ for (; i--;) {
40
+ LOAD_BIT(a1, p1, x); p1+=s1;
41
+ y = format_<%=type_name%>(fmt, x);
42
+ SET_DATA_STRIDE(p2, s2, VALUE, y);
43
+ }
44
+ }
45
+ }
46
+
47
+ /*
48
+ Format elements into strings.
49
+ @overload <%=name%> format
50
+ @param [String] format
51
+ @return [Cumo::RObject] array of formatted strings.
52
+ */
53
+ static VALUE
54
+ <%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
55
+ {
56
+ VALUE fmt=Qnil; /* stays nil when no format argument is given */
57
+
58
+ ndfunc_arg_in_t ain[2] = {{Qnil,0},{sym_option}}; /* self, plus fmt delivered as lp->option */
59
+ ndfunc_arg_out_t aout[1] = {{cumo_cRObject,0}}; /* one output of class cumo_cRObject */
60
+ ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP_NIP, 2,1, ain,aout};
61
+
62
+ rb_scan_args(argc, argv, "01", &fmt); /* "01": no required arguments, one optional */
63
+ return na_ndloop(&ndf, 2, self, fmt);
64
+ }
@@ -0,0 +1,51 @@
1
+ static void
2
+ <%=c_iter%>(na_loop_t *const lp)
3
+ { /* Loop callback: formats a run of bits into a new Ruby Array and appends it to the array in args[1]. */
4
+ size_t i;
5
+ BIT_DIGIT *a1, x=0;
6
+ size_t p1;
7
+ ssize_t s1;
8
+ size_t *idx1;
9
+ VALUE y;
10
+ VALUE fmt = lp->option; /* format passed by the caller; may be nil */
11
+ volatile VALUE a;
12
+
13
+ INIT_COUNTER(lp, i); /* i is the countdown used by the loops below */
14
+ INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
15
+ a = rb_ary_new2(i); /* new Array with capacity for i elements */
16
+ rb_ary_push(lp->args[1].value, a); /* attach it to the enclosing array before filling it */
17
+ if (idx1) { /* index-array addressing: the bit position is p1 + *idx1 */
18
+ for (; i--;) {
19
+ LOAD_BIT(a1, p1+*idx1, x); idx1++;
20
+ y = format_bit(fmt, x);
21
+ rb_ary_push(a,y);
22
+ }
23
+ } else { /* strided addressing: p1 advances by s1 per element */
24
+ for (; i--;) {
25
+ LOAD_BIT(a1, p1, x); p1+=s1;
26
+ y = format_bit(fmt, x);
27
+ rb_ary_push(a,y);
28
+ }
29
+ }
30
+ }
31
+
32
+ /*
33
+ Format elements into strings.
34
+ @overload <%=name%> format
35
+ @param [String] format
36
+ @return [Array] array of formatted strings.
37
+ */
38
+ static VALUE
39
+ <%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
40
+ {
41
+ VALUE fmt=Qnil; /* stays nil when no format argument is given */
42
+ ndfunc_arg_in_t ain[3] = {{Qnil,0},{sym_loop_opt},{sym_option}};
43
+ ndfunc_arg_out_t aout[1] = {{rb_cArray,0}}; // dummy?
44
+ ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP_NIP, 3,1, ain,aout};
45
+
46
+ SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
47
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize()); /* synchronize once here; the iterator itself does not synchronize */
48
+
49
+ rb_scan_args(argc, argv, "01", &fmt); /* "01": no required arguments, one optional */
50
+ return na_ndloop_cast_narray_to_rarray(&ndf, self, fmt);
51
+ }
@@ -0,0 +1,21 @@
1
+ static VALUE
2
+ <%=c_iter%>(char *ptr, size_t pos, VALUE fmt)
3
+ { /* Element formatter handed to na_ndloop_inspect: loads the bit at pos from ptr and formats it. */
4
+ dtype x;
5
+ LOAD_BIT(ptr,pos,x); /* pos is a bit position relative to ptr */
6
+ return format_<%=type_name%>(fmt, x);
7
+ }
8
+
9
+ /*
10
+ Returns a string containing a human-readable representation of the NArray.
11
+ @overload inspect
12
+ @return [String]
13
+ */
14
+ static VALUE
15
+ <%=c_func(0)%>(VALUE ary)
16
+ {
17
+ SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
18
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize()); /* block until pending device work is done before elements are read for display */
19
+
20
+ return na_ndloop_inspect(ary, <%=c_iter%>, Qnil); /* Qnil: no format is supplied */
21
+ }
@@ -0,0 +1,136 @@
1
+ static void
2
+ <%=c_iter%>(na_loop_t *const lp)
3
+ { /* Loop callback: for every set bit of argument 0, records the matching position of argument 1. */
4
+ size_t i;
5
+ BIT_DIGIT *a;
6
+ size_t p1, p2;
7
+ ssize_t s1, s2;
8
+ size_t *idx1, *idx2, *pidx;
9
+ BIT_DIGIT x=0;
10
+ size_t count;
11
+ where_opt_t *g;
12
+
13
+ SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
14
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize()); /* block until pending device work is done before the host-side reads below */
15
+
16
+ g = (where_opt_t*)(lp->opt_ptr); /* state shared with the caller across invocations */
17
+ count = g->count; /* running number of recorded positions */
18
+ pidx = (size_t*)(g->idx1); /* write cursor into the output index buffer */
19
+ INIT_COUNTER(lp, i); /* i is the countdown used by the loops below */
20
+ INIT_PTR_BIT_IDX(lp, 0, a, p1, s1, idx1); /* the bit mask */
21
+ //INIT_PTR_IDX(lp, 1, p2, s2, idx2);
22
+ p2 = lp->args[1].iter[0].pos; /* argument 1's iterator fields are read directly instead of through the macro above */
23
+ s2 = lp->args[1].iter[0].step;
24
+ idx2 = lp->args[1].iter[0].idx;
25
+
26
+ if (idx1) { /* mask uses an index array */
27
+ if (idx2) { /* argument 1 uses an index array as well */
28
+ for (; i--;) {
29
+ LOAD_BIT(a, p1+*idx1, x);
30
+ idx1++;
31
+ if (x) {
32
+ *(pidx++) = p2+*idx2; /* record argument 1's position for this set bit */
33
+ count++;
34
+ }
35
+ idx2++;
36
+ }
37
+ } else { /* argument 1 is strided */
38
+ for (; i--;) {
39
+ LOAD_BIT(a, p1+*idx1, x);
40
+ idx1++;
41
+ if (x) {
42
+ *(pidx++) = p2;
43
+ count++;
44
+ }
45
+ p2 += s2;
46
+ }
47
+ }
48
+ } else { /* mask is strided */
49
+ if (idx2) { /* argument 1 uses an index array */
50
+ for (; i--;) {
51
+ LOAD_BIT(a, p1, x);
52
+ p1 += s1;
53
+ if (x) {
54
+ *(pidx++) = p2+*idx2;
55
+ count++;
56
+ }
57
+ idx2++;
58
+ }
59
+ } else { /* both are strided */
60
+ for (; i--;) {
61
+ LOAD_BIT(a, p1, x);
62
+ p1 += s1;
63
+ if (x) {
64
+ *(pidx++) = p2;
65
+ count++;
66
+ }
67
+ p2 += s2;
68
+ }
69
+ }
70
+ }
71
+ g->count = count; /* write the updated count and cursor back into the shared state */
72
+ g->idx1 = (char*)pidx;
73
+ }
74
+
75
+ #if SIZEOF_VOIDP == 8 /* choose the index array class by pointer size */
76
+ #define cIndex cumo_cInt64
77
+ #elif SIZEOF_VOIDP == 4
78
+ #define cIndex cumo_cInt32
79
+ #endif /* NOTE(review): cIndex stays undefined for any other pointer size */
80
+
81
+ /*
82
+ Return the subarray of the argument selected by self, a bit array used as a mask.
83
+ @overload <%=op_map%>(array)
84
+ @param [Cumo::NArray] array narray to be masked.
85
+ @return [Cumo::NArray] view of masked array.
86
+ */
87
+ static VALUE
88
+ <%=c_func(1)%>(VALUE mask, VALUE val)
89
+ {
90
+ volatile VALUE idx_1, view;
91
+ narray_data_t *nidx;
92
+ narray_view_t *nv;
93
+ narray_t *na;
94
+ narray_view_t *na1;
95
+ stridx_t stridx0;
96
+ size_t n_1;
97
+ where_opt_t g;
98
+ ndfunc_arg_in_t ain[2] = {{cT,0},{Qnil,0}}; /* mask is taken as class cT, val as given */
99
+ ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP, 2, 0, ain, 0};
100
+
101
+ // TODO(sonots): bit_count_true synchronizes with CPU. Avoid.
102
+ n_1 = NUM2SIZET(<%=find_tmpl("count_true_cpu").c_func%>(0, NULL, mask)); /* n_1 sizes the index array and the resulting view */
103
+ idx_1 = nary_new(cIndex, 1, &n_1); /* 1-D index array of length n_1 */
104
+ g.count = 0;
105
+ g.elmsz = SIZEOF_VOIDP;
106
+ g.idx1 = na_get_pointer_for_write(idx_1); /* the iterator writes positions through this cursor */
107
+ g.idx0 = NULL;
108
+ na_ndloop3(&ndf, &g, 2, mask, val); /* g is handed to the iterator as lp->opt_ptr */
109
+
110
+ view = na_s_allocate_view(CLASS_OF(val)); /* the result has the same class as val */
111
+ GetNArrayView(view, nv);
112
+ na_setup_shape((narray_t*)nv, 1, &n_1); /* 1-D view with n_1 elements */
113
+
114
+ GetNArrayData(idx_1,nidx);
115
+ SDX_SET_INDEX(stridx0,(size_t*)nidx->ptr); /* the view indexes through the buffer filled above */
116
+ nidx->ptr = NULL; /* detach the buffer from idx_1; presumably so the view becomes its only owner (confirm) */
117
+
118
+ nv->stridx = ALLOC_N(stridx_t,1);
119
+ nv->stridx[0] = stridx0;
120
+ nv->offset = 0;
121
+
122
+ GetNArray(val, na);
123
+ switch(NA_TYPE(na)) {
124
+ case NARRAY_DATA_T:
125
+ nv->data = val; /* val is itself the data array */
126
+ break;
127
+ case NARRAY_VIEW_T:
128
+ GetNArrayView(val, na1);
129
+ nv->data = na1->data; /* val is a view: refer to the array it refers to */
130
+ break;
131
+ default:
132
+ rb_raise(rb_eRuntimeError,"invalid NA_TYPE: %d",NA_TYPE(na));
133
+ }
134
+
135
+ return view;
136
+ }