cumo 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (266) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +27 -0
  3. data/.travis.yml +5 -0
  4. data/3rd_party/mkmf-cu/.gitignore +36 -0
  5. data/3rd_party/mkmf-cu/Gemfile +3 -0
  6. data/3rd_party/mkmf-cu/LICENSE +21 -0
  7. data/3rd_party/mkmf-cu/README.md +36 -0
  8. data/3rd_party/mkmf-cu/Rakefile +11 -0
  9. data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
  10. data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
  11. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
  12. data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
  13. data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
  14. data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
  15. data/CODE_OF_CONDUCT.md +46 -0
  16. data/Gemfile +8 -0
  17. data/LICENSE.txt +82 -0
  18. data/README.md +252 -0
  19. data/Rakefile +43 -0
  20. data/bench/broadcast_fp32.rb +138 -0
  21. data/bench/cumo_bench.rb +193 -0
  22. data/bench/numo_bench.rb +138 -0
  23. data/bench/reduction_fp32.rb +117 -0
  24. data/bin/console +14 -0
  25. data/bin/setup +8 -0
  26. data/cumo.gemspec +32 -0
  27. data/ext/cumo/cuda/cublas.c +278 -0
  28. data/ext/cumo/cuda/driver.c +421 -0
  29. data/ext/cumo/cuda/memory_pool.cpp +185 -0
  30. data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
  31. data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
  32. data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
  33. data/ext/cumo/cuda/nvrtc.c +207 -0
  34. data/ext/cumo/cuda/runtime.c +167 -0
  35. data/ext/cumo/cumo.c +148 -0
  36. data/ext/cumo/depend.erb +58 -0
  37. data/ext/cumo/extconf.rb +179 -0
  38. data/ext/cumo/include/cumo.h +25 -0
  39. data/ext/cumo/include/cumo/compat.h +23 -0
  40. data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
  41. data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
  42. data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
  43. data/ext/cumo/include/cumo/cuda/driver.h +22 -0
  44. data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
  45. data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
  46. data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
  47. data/ext/cumo/include/cumo/indexer.h +238 -0
  48. data/ext/cumo/include/cumo/intern.h +142 -0
  49. data/ext/cumo/include/cumo/intern_fwd.h +38 -0
  50. data/ext/cumo/include/cumo/intern_kernel.h +6 -0
  51. data/ext/cumo/include/cumo/narray.h +429 -0
  52. data/ext/cumo/include/cumo/narray_kernel.h +149 -0
  53. data/ext/cumo/include/cumo/ndloop.h +95 -0
  54. data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
  55. data/ext/cumo/include/cumo/template.h +158 -0
  56. data/ext/cumo/include/cumo/template_kernel.h +77 -0
  57. data/ext/cumo/include/cumo/types/bit.h +40 -0
  58. data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
  59. data/ext/cumo/include/cumo/types/complex.h +402 -0
  60. data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
  61. data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
  62. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
  63. data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
  64. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
  65. data/ext/cumo/include/cumo/types/dfloat.h +47 -0
  66. data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
  67. data/ext/cumo/include/cumo/types/float_def.h +34 -0
  68. data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
  69. data/ext/cumo/include/cumo/types/float_macro.h +191 -0
  70. data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
  71. data/ext/cumo/include/cumo/types/int16.h +24 -0
  72. data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
  73. data/ext/cumo/include/cumo/types/int32.h +24 -0
  74. data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
  75. data/ext/cumo/include/cumo/types/int64.h +24 -0
  76. data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
  77. data/ext/cumo/include/cumo/types/int8.h +24 -0
  78. data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
  79. data/ext/cumo/include/cumo/types/int_macro.h +67 -0
  80. data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
  81. data/ext/cumo/include/cumo/types/real_accum.h +486 -0
  82. data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
  83. data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
  84. data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
  85. data/ext/cumo/include/cumo/types/robject.h +27 -0
  86. data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
  87. data/ext/cumo/include/cumo/types/scomplex.h +46 -0
  88. data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
  89. data/ext/cumo/include/cumo/types/sfloat.h +48 -0
  90. data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
  91. data/ext/cumo/include/cumo/types/uint16.h +25 -0
  92. data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
  93. data/ext/cumo/include/cumo/types/uint32.h +25 -0
  94. data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
  95. data/ext/cumo/include/cumo/types/uint64.h +25 -0
  96. data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
  97. data/ext/cumo/include/cumo/types/uint8.h +25 -0
  98. data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
  99. data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
  100. data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
  101. data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
  102. data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
  103. data/ext/cumo/narray/SFMT-params.h +97 -0
  104. data/ext/cumo/narray/SFMT-params19937.h +46 -0
  105. data/ext/cumo/narray/SFMT.c +620 -0
  106. data/ext/cumo/narray/SFMT.h +167 -0
  107. data/ext/cumo/narray/array.c +638 -0
  108. data/ext/cumo/narray/data.c +961 -0
  109. data/ext/cumo/narray/gen/cogen.rb +56 -0
  110. data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
  111. data/ext/cumo/narray/gen/def/bit.rb +37 -0
  112. data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
  113. data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
  114. data/ext/cumo/narray/gen/def/int16.rb +36 -0
  115. data/ext/cumo/narray/gen/def/int32.rb +36 -0
  116. data/ext/cumo/narray/gen/def/int64.rb +36 -0
  117. data/ext/cumo/narray/gen/def/int8.rb +36 -0
  118. data/ext/cumo/narray/gen/def/robject.rb +37 -0
  119. data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
  120. data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
  121. data/ext/cumo/narray/gen/def/uint16.rb +36 -0
  122. data/ext/cumo/narray/gen/def/uint32.rb +36 -0
  123. data/ext/cumo/narray/gen/def/uint64.rb +36 -0
  124. data/ext/cumo/narray/gen/def/uint8.rb +36 -0
  125. data/ext/cumo/narray/gen/erbpp2.rb +346 -0
  126. data/ext/cumo/narray/gen/narray_def.rb +268 -0
  127. data/ext/cumo/narray/gen/spec.rb +425 -0
  128. data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
  129. data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
  130. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
  131. data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
  132. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
  133. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
  134. data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
  135. data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
  136. data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
  137. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
  138. data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
  139. data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
  140. data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
  141. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
  142. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
  143. data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
  144. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
  145. data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
  146. data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
  147. data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
  148. data/ext/cumo/narray/gen/tmpl/class.c +9 -0
  149. data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
  150. data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
  151. data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
  152. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
  153. data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
  154. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
  155. data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
  156. data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
  157. data/ext/cumo/narray/gen/tmpl/each.c +47 -0
  158. data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
  159. data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
  160. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
  161. data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
  162. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
  163. data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
  164. data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
  165. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
  166. data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
  167. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
  168. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
  169. data/ext/cumo/narray/gen/tmpl/format.c +62 -0
  170. data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
  171. data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
  172. data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
  173. data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
  174. data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
  175. data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
  176. data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
  177. data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
  178. data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
  179. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
  180. data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
  181. data/ext/cumo/narray/gen/tmpl/median.c +66 -0
  182. data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
  183. data/ext/cumo/narray/gen/tmpl/module.c +9 -0
  184. data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
  185. data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
  186. data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
  187. data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
  188. data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
  189. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
  190. data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
  191. data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
  192. data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
  193. data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
  194. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
  195. data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
  196. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
  197. data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
  198. data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
  199. data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
  200. data/ext/cumo/narray/gen/tmpl/store.c +41 -0
  201. data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
  202. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
  203. data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
  204. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
  205. data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
  206. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
  207. data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
  208. data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
  209. data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
  210. data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
  211. data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
  212. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
  213. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
  214. data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
  215. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
  216. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
  217. data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
  218. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
  219. data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
  220. data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
  221. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
  222. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
  223. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
  224. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
  225. data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
  226. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
  227. data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
  228. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
  229. data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
  230. data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
  231. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
  232. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
  233. data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
  234. data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
  235. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
  236. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
  237. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
  238. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
  239. data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
  240. data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
  241. data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
  242. data/ext/cumo/narray/index.c +880 -0
  243. data/ext/cumo/narray/kwargs.c +153 -0
  244. data/ext/cumo/narray/math.c +142 -0
  245. data/ext/cumo/narray/narray.c +1948 -0
  246. data/ext/cumo/narray/ndloop.c +2105 -0
  247. data/ext/cumo/narray/rand.c +45 -0
  248. data/ext/cumo/narray/step.c +474 -0
  249. data/ext/cumo/narray/struct.c +886 -0
  250. data/lib/cumo.rb +3 -0
  251. data/lib/cumo/cuda.rb +11 -0
  252. data/lib/cumo/cuda/compile_error.rb +36 -0
  253. data/lib/cumo/cuda/compiler.rb +161 -0
  254. data/lib/cumo/cuda/device.rb +47 -0
  255. data/lib/cumo/cuda/link_state.rb +31 -0
  256. data/lib/cumo/cuda/module.rb +40 -0
  257. data/lib/cumo/cuda/nvrtc_program.rb +27 -0
  258. data/lib/cumo/linalg.rb +12 -0
  259. data/lib/cumo/narray.rb +2 -0
  260. data/lib/cumo/narray/extra.rb +1278 -0
  261. data/lib/erbpp.rb +294 -0
  262. data/lib/erbpp/line_number.rb +137 -0
  263. data/lib/erbpp/narray_def.rb +381 -0
  264. data/numo-narray-version +1 -0
  265. data/run.gdb +7 -0
  266. metadata +353 -0
@@ -0,0 +1,2105 @@
1
+ #include <ruby.h>
2
+ #include "cumo/narray.h"
3
+ #include "cumo/cuda/memory_pool.h"
4
+ #include "cumo/cuda/runtime.h"
5
+
6
// DBG(x): debug tracing hook. With the "#if 0" below left as-is it
// expands to nothing; flip it to "#if 1" to execute x verbatim.
#if 0
#define DBG(x) x
#else
#define DBG(x)
#endif

// Portable va_start wrapper: pre-ANSI <varargs.h> va_start takes no
// second argument, while <stdarg.h> requires the last named parameter.
#ifdef HAVE_STDARG_PROTOTYPES
#include <stdarg.h>
#define va_init_list(a,b) va_start(a,b)
#else
#include <varargs.h>
#define va_init_list(a,b) va_start(a)
#endif
19
+
20
// Work area used when an argument's data must be staged through a
// contiguous buffer before/after the user loop runs (see ndloop_release
// for the matching teardown: buf_ptr may live in CUDA device memory).
typedef struct NA_BUFFER_COPY {
    int ndim;                  // number of dimensions copied through the buffer
    size_t elmsz;              // element size in bytes
    size_t *n;                 // shape: length of each copied dimension
    char *src_ptr;             // pointer into the source array's data
    char *buf_ptr;             // contiguous staging buffer (host or device memory)
    na_loop_iter_t *src_iter;  // iterators describing the source layout
    na_loop_iter_t *buf_iter;  // iterators describing the buffer layout
} na_buffer_copy_t;
29
+
30
// Per-argument extra loop state kept alongside na_loop_t (one entry
// per argument in na_md_loop_t::xargs).
typedef struct NA_LOOP_XARGS {
    na_loop_iter_t *iter;     // moved from na_loop_t
    na_buffer_copy_t *bufcp;  // copy data to buffer (NULL when no staging needed)
    int flag;                 // NDL_READ NDL_WRITE
    bool free_user_iter;      // alloc LARG(lp,j).iter=lp->xargs[j].iter
} na_loop_xargs_t;
36
+
37
// Master state for one multi-dimensional ndloop invocation.
typedef struct NA_MD_LOOP {
    int narg;                 // total number of loop arguments (inputs + outputs)
    int nin;                  // number of input arguments
    int ndim;                 // n of total dimensions looped at loop_narray. NOTE: lp->ndim + lp->user.ndim is the total dimension.
    unsigned int copy_flag;   // set i-th bit if i-th arg is cast
    void *ptr;                // memory for n (single arena; see ndloop_alloc)
    na_loop_iter_t *iter_ptr; // memory for iter
    size_t *n;                // n of elements for each dim (shape)
    na_loop_t user;           // loop in user function
    na_loop_xargs_t *xargs;   // extra data for each arg
    int writeback;            // write back result to i-th arg (-1: none)
    int init_aidx;            // index of initializer argument (-1: none)
    int reduce_dim;           // number of dimensions to reduce in reduction kernel, e.g., for an array of shape: [2,3,4],
                              // 3 for sum(), 1 for sum(axis: 1), 2 for sum(axis: [1,2])
    int *trans_map;           // dimension transpose map (reduce dims moved last; see ndloop_alloc)
    VALUE vargs;              // original Ruby argument array
    VALUE reduce;             // dimension indices to reduce in reduction kernel (in bits), e.g., for an array of shape:
                              // [2,3,4], 111b for sum(), 010b for sum(axis: 1), 110b for sum(axis: [1,2])
    VALUE loop_opt;           // loop options passed via sym_loop_opt
    ndfunc_t *ndfunc;         // the ndfunc definition being executed
    void (*loop_func)();      // driver that runs the loop over this struct
} na_md_loop_t;
59
+
60
// Accessor shorthands for loop state:
#define LARG(lp,iarg) ((lp)->user.args[iarg])                // iarg-th user-loop argument
#define LITER(lp,idim,iarg) ((lp)->xargs[iarg].iter[idim])   // iterator of iarg-th arg at dim idim
#define LITER_SRC(lp,idim) ((lp)->src_iter[idim])            // source iterator (buffer copy)
#define LBUFCP(lp,j) ((lp)->xargs[j].bufcp)                  // buffer-copy work area of j-th arg

// A type slot is a cast target unless it is false/nil or the OVERWRITE marker.
#define CASTABLE(t) (RTEST(t) && (t)!=OVERWRITE)

// Read/write access flags stored in na_loop_xargs_t::flag.
#define NDL_READ 1
#define NDL_WRITE 2
#define NDL_READ_WRITE (NDL_READ|NDL_WRITE)

// Ruby method-name IDs (interned elsewhere in this file).
static ID id_cast;
static ID id_extract;
73
+
74
+ static inline VALUE
75
+ nary_type_s_cast(VALUE type, VALUE obj)
76
+ {
77
+ return rb_funcall(type,id_cast,1,obj);
78
+ }
79
+
80
// Debug helper: dump an ndfunc_t definition to stdout.
// Intended to be wrapped in DBG(); not used in normal operation.
static void
print_ndfunc(ndfunc_t *nf) {
    volatile VALUE t;  // volatile: keep the inspected string alive across printf
    int i, k;
    printf("ndfunc_t = 0x%"SZF"x {\n",(size_t)nf);
    printf(" func = 0x%"SZF"x\n", (size_t)nf->func);
    printf(" flag = 0x%"SZF"x\n", (size_t)nf->flag);
    printf(" nin = %d\n", nf->nin);
    printf(" nout = %d\n", nf->nout);
    printf(" ain = 0x%"SZF"x\n", (size_t)nf->ain);
    // input argument descriptors
    for (i=0; i<nf->nin; i++) {
        t = rb_inspect(nf->ain[i].type);
        printf(" ain[%d].type = %s\n", i, StringValuePtr(t));
        printf(" ain[%d].dim = %d\n", i, nf->ain[i].dim);
    }
    printf(" aout = 0x%"SZF"x\n", (size_t)nf->aout);
    // output argument descriptors, including fixed output shapes
    for (i=0; i<nf->nout; i++) {
        t = rb_inspect(nf->aout[i].type);
        printf(" aout[%d].type = %s\n", i, StringValuePtr(t));
        printf(" aout[%d].dim = %d\n", i, nf->aout[i].dim);
        for (k=0; k<nf->aout[i].dim; k++) {
            printf(" aout[%d].shape[%d] = %"SZF"u\n", i, k, nf->aout[i].shape[k]);
        }
    }
    printf("}\n");
}
106
+
107
+
108
// Debug helper: dump the full na_md_loop_t state to stdout, including
// per-argument iterators and buffer-copy work areas. Intended for DBG().
static void
print_ndloop(na_md_loop_t *lp) {
    int i,j,nd;
    printf("na_md_loop_t = 0x%"SZF"x {\n",(size_t)lp);
    printf(" narg = %d\n", lp->narg);
    printf(" nin = %d\n", lp->nin);
    printf(" ndim = %d\n", lp->ndim);
    printf(" copy_flag = %x\n", lp->copy_flag);
    printf(" writeback = %d\n", lp->writeback);
    printf(" init_aidx = %d\n", lp->init_aidx);
    printf(" reduce_dim = %d\n", lp->reduce_dim);
    printf(" trans_map = 0x%"SZF"x\n", (size_t)lp->trans_map);
    nd = lp->ndim + lp->user.ndim;  // total dimension count
    for (i=0; i<nd; i++) {
        printf(" trans_map[%d] = %d\n", i, lp->trans_map[i]);
    }
    printf(" n = 0x%"SZF"x\n", (size_t)lp->n);
    nd = lp->ndim + lp->user.ndim;
    for (i=0; i<=lp->ndim; i++) {
        printf(" n[%d] = %"SZF"u\n", i, lp->n[i]);
    }
    printf(" user.n = 0x%"SZF"x\n", (size_t)lp->user.n);
    if (lp->user.n) {
        for (i=0; i<=lp->user.ndim; i++) {
            printf(" user.n[%d] = %"SZF"u\n", i, lp->user.n[i]);
        }
    }
    printf(" xargs = 0x%"SZF"x\n", (size_t)lp->xargs);
    printf(" iter_ptr = 0x%"SZF"x\n", (size_t)lp->iter_ptr);
    printf(" user.narg = %d\n", lp->user.narg);
    printf(" user.ndim = %d\n", lp->user.ndim);
    printf(" user.args = 0x%"SZF"x\n", (size_t)lp->user.args);
    for (j=0; j<lp->narg; j++) {
        // (empty loop in the original; per-arg details are printed below)
    }
    printf(" user.opt_ptr = 0x%"SZF"x\n", (size_t)lp->user.opt_ptr);
    if (lp->reduce==Qnil) {
        printf(" reduce = nil\n");
    } else {
        printf(" reduce = 0x%x\n", NUM2INT(lp->reduce));
    }
    // per-argument dump: user args, iterators, and buffer-copy state
    for (j=0; j<lp->narg; j++) {
        printf("--user.args[%d]--\n", j);
        printf(" user.args[%d].ptr = 0x%"SZF"x\n", j, (size_t)LARG(lp,j).ptr);
        printf(" user.args[%d].elmsz = %"SZF"d\n", j, LARG(lp,j).elmsz);
        printf(" user.args[%d].value = 0x%"PRI_VALUE_PREFIX"x\n", j, LARG(lp,j).value);
        printf(" user.args[%d].ndim = %d\n", j, LARG(lp,j).ndim);
        printf(" user.args[%d].shape = 0x%"SZF"x\n", j, (size_t)LARG(lp,j).shape);
        if (LARG(lp,j).shape) {
            for (i=0; i<LARG(lp,j).ndim; i++) {
                printf(" user.args[%d].shape[%d] = %"SZF"d\n", j, i, LARG(lp,j).shape[i]);
            }
        }
        printf(" user.args[%d].iter = 0x%"SZF"x\n", j,(size_t)lp->user.args[j].iter);
        if (lp->user.args[j].iter) {
            for (i=0; i<lp->user.ndim; i++) {
                printf(" &user.args[%d].iter[%d] = 0x%"SZF"x\n", j,i, (size_t)&lp->user.args[j].iter[i]);
                printf(" user.args[%d].iter[%d].pos = %"SZF"u\n", j,i, lp->user.args[j].iter[i].pos);
                printf(" user.args[%d].iter[%d].step = %"SZF"u\n", j,i, lp->user.args[j].iter[i].step);
                printf(" user.args[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)lp->user.args[j].iter[i].idx);
            }
        }
        //
        printf(" xargs[%d].flag = %d\n", j, lp->xargs[j].flag);
        printf(" xargs[%d].free_user_iter = %d\n", j, lp->xargs[j].free_user_iter);
        for (i=0; i<=nd; i++) {
            printf(" &xargs[%d].iter[%d] = 0x%"SZF"x\n", j,i, (size_t)&LITER(lp,i,j));
            printf(" xargs[%d].iter[%d].pos = %"SZF"u\n", j,i, LITER(lp,i,j).pos);
            printf(" xargs[%d].iter[%d].step = %"SZF"u\n", j,i, LITER(lp,i,j).step);
            printf(" xargs[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)LITER(lp,i,j).idx);
        }
        printf(" xargs[%d].bufcp = 0x%"SZF"x\n", j, (size_t)lp->xargs[j].bufcp);
        if (lp->xargs[j].bufcp) {
            printf(" xargs[%d].bufcp->ndim = %d\n", j, lp->xargs[j].bufcp->ndim);
            printf(" xargs[%d].bufcp->elmsz = %"SZF"d\n", j, lp->xargs[j].bufcp->elmsz);
            printf(" xargs[%d].bufcp->n = 0x%"SZF"x\n", j, (size_t)lp->xargs[j].bufcp->n);
            printf(" xargs[%d].bufcp->src_ptr = 0x%"SZF"x\n", j, (size_t)lp->xargs[j].bufcp->src_ptr);
            printf(" xargs[%d].bufcp->buf_ptr = 0x%"SZF"x\n", j, (size_t)lp->xargs[j].bufcp->buf_ptr);
            printf(" xargs[%d].bufcp->src_iter = 0x%"SZF"x\n", j, (size_t)lp->xargs[j].bufcp->src_iter);
            printf(" xargs[%d].bufcp->buf_iter = 0x%"SZF"x\n", j, (size_t)lp->xargs[j].bufcp->buf_iter);
        }
    }
    printf("}\n");
}
191
+
192
+
193
+ // returns 0x01 if NDF_HAS_LOOP, but not supporting NDF_STRIDE_LOOP
194
+ // returns 0x02 if NDF_HAS_LOOP, but not supporting NDF_INDEX_LOOP
195
+ static unsigned int
196
+ ndloop_func_loop_spec(ndfunc_t *nf, int user_ndim)
197
+ {
198
+ unsigned int f=0;
199
+ // If user function supports LOOP
200
+ if (user_ndim > 0 || NDF_TEST(nf,NDF_HAS_LOOP)) {
201
+ if (!NDF_TEST(nf,NDF_STRIDE_LOOP)) {
202
+ f |= 1;
203
+ }
204
+ if (!NDF_TEST(nf,NDF_INDEX_LOOP)) {
205
+ f |= 2;
206
+ }
207
+ }
208
+ return f;
209
+ }
210
+
211
+
212
+
213
+
214
+ static int
215
+ ndloop_cast_required(VALUE type, VALUE value)
216
+ {
217
+ return CASTABLE(type) && type != CLASS_OF(value);
218
+ }
219
+
220
+ static int
221
+ ndloop_castable_type(VALUE type)
222
+ {
223
+ return rb_obj_is_kind_of(type, rb_cClass) && RTEST(rb_class_inherited_p(type, cNArray));
224
+ }
225
+
226
+ static void
227
+ ndloop_cast_error(VALUE type, VALUE value)
228
+ {
229
+ VALUE x = rb_inspect(type);
230
+ char* s = StringValueCStr(x);
231
+ rb_bug("fail cast from %s to %s", rb_obj_classname(value),s);
232
+ rb_raise(rb_eTypeError,"fail cast from %s to %s",
233
+ rb_obj_classname(value), s);
234
+ }
235
+
236
+ // convert input argeuments given by RARRAY_PTR(args)[j]
237
+ // to type specified by nf->args[j].type
238
+ // returns copy_flag where nth-bit is set if nth argument is converted.
239
+ static unsigned int
240
+ ndloop_cast_args(ndfunc_t *nf, VALUE args)
241
+ {
242
+ int j;
243
+ unsigned int copy_flag=0;
244
+ VALUE type, value;
245
+
246
+ for (j=0; j<nf->nin; j++) {
247
+
248
+ type = nf->ain[j].type;
249
+ if (TYPE(type)==T_SYMBOL)
250
+ continue;
251
+ value = RARRAY_AREF(args,j);
252
+ if (!ndloop_cast_required(type, value))
253
+ continue;
254
+
255
+ if (ndloop_castable_type(type)) {
256
+ RARRAY_ASET(args,j,nary_type_s_cast(type, value));
257
+ copy_flag |= 1<<j;
258
+ } else {
259
+ ndloop_cast_error(type, value);
260
+ }
261
+ }
262
+
263
+ RB_GC_GUARD(type); RB_GC_GUARD(value);
264
+ return copy_flag;
265
+ }
266
+
267
+
268
+ static void
269
+ ndloop_handle_symbol_in_ain(VALUE type, VALUE value, int at, na_md_loop_t *lp)
270
+ {
271
+ if (type==sym_reduce) {
272
+ lp->reduce = value;
273
+ }
274
+ else if (type==sym_option) {
275
+ lp->user.option = value;
276
+ }
277
+ else if (type==sym_loop_opt) {
278
+ lp->loop_opt = value;
279
+ }
280
+ else if (type==sym_init) {
281
+ lp->init_aidx = at;
282
+ }
283
+ else {
284
+ rb_bug("ndloop parse_options: unknown type");
285
+ }
286
+ }
287
+
288
// Return the larger of two ints.
static inline int
max2(int x, int y)
{
    if (x > y) {
        return x;
    }
    return y;
}
293
+
294
+ static void
295
+ ndloop_find_max_dimension(na_md_loop_t *lp, ndfunc_t *nf, VALUE args)
296
+ {
297
+ int j;
298
+ int nin=0; // number of input objects (except for symbols)
299
+ int user_nd=0; // max dimension of user function
300
+ int loop_nd=0; // max dimension of md-loop
301
+
302
+ for (j=0; j<RARRAY_LEN(args); j++) {
303
+ VALUE t = nf->ain[j].type;
304
+ VALUE v = RARRAY_AREF(args,j);
305
+ if (TYPE(t)==T_SYMBOL) {
306
+ ndloop_handle_symbol_in_ain(t, v, j, lp);
307
+ } else {
308
+ nin++;
309
+ user_nd = max2(user_nd, nf->ain[j].dim);
310
+ if (IsNArray(v))
311
+ loop_nd = max2(loop_nd, RNARRAY_NDIM(v) - nf->ain[j].dim);
312
+ }
313
+ }
314
+
315
+ lp->narg = lp->user.narg = nin + nf->nout;
316
+ lp->nin = nin;
317
+ lp->ndim = loop_nd;
318
+ lp->user.ndim = user_nd;
319
+ }
320
+
321
+ /*
322
+ user-dimension:
323
+ user_nd = MAX( nf->args[j].dim )
324
+
325
+ user-support dimension:
326
+
327
+ loop dimension:
328
+ loop_nd
329
+ */
330
+
331
/*
 * Initialize na_md_loop_t lp for applying ndfunc nf to args.
 *
 * Allocates one arena (lp->ptr) partitioned into five 8-byte-aligned
 * sections: lp->n (shape), lp->xargs, lp->user.args, lp->iter_ptr
 * (iterators) and lp->trans_map. Also builds trans_map, which, when
 * NDF_FLAT_REDUCE is set, transposes the reduce-dimensions to the last
 * positions of the loop.
 */
static void
ndloop_alloc(na_md_loop_t *lp, ndfunc_t *nf, VALUE args,
             void *opt_ptr, unsigned int copy_flag,
             void (*loop_func)(ndfunc_t*, na_md_loop_t*))
{
    int i,j;
    int narg;
    int max_nd;

    char *buf;
    size_t n1, n2, n3, n4, n5;  // sizes of the five arena sections

    long args_len;

    na_loop_iter_t *iter;

    int trans_dim;
    unsigned int f;

    args_len = RARRAY_LEN(args);

    // A mismatch here is a programming error in the caller, hence rb_bug.
    // NOTE(review): args_len is long, so "%ld" would be the exact format.
    if (args_len != nf->nin) {
        rb_bug("wrong number of arguments for ndfunc (%lu for %d)",
               args_len, nf->nin);
    }

    lp->vargs = args;
    lp->ndfunc = nf;
    lp->loop_func = loop_func;
    lp->copy_flag = copy_flag;

    lp->reduce = Qnil;
    lp->user.option = Qnil;
    lp->user.opt_ptr = opt_ptr;
    lp->user.err_type = Qfalse;
    lp->loop_opt = Qnil;
    lp->writeback = -1;
    lp->init_aidx = -1;

    lp->ptr = NULL;
    lp->user.n = NULL;

    ndloop_find_max_dimension(lp, nf, args);
    narg = lp->nin + nf->nout;
    max_nd = lp->ndim + lp->user.ndim;

    // Section sizes; n2..n4 are rounded up to 8-byte multiples so that
    // each following section stays aligned within the single xmalloc.
    n1 = sizeof(size_t)*(max_nd+1);
    n2 = sizeof(na_loop_xargs_t)*narg;
    n2 = ((n2-1)/8+1)*8;
    n3 = sizeof(na_loop_args_t)*narg;
    n3 = ((n3-1)/8+1)*8;
    n4 = sizeof(na_loop_iter_t)*narg*(max_nd+1);
    n4 = ((n4-1)/8+1)*8;
    n5 = sizeof(int)*(max_nd+1);

    lp->ptr = buf = (char*)xmalloc(n1+n2+n3+n4+n5);
    lp->n = (size_t*)buf; buf+=n1;
    lp->xargs = (na_loop_xargs_t*)buf; buf+=n2;
    lp->user.args = (na_loop_args_t*)buf; buf+=n3;
    lp->iter_ptr = iter = (na_loop_iter_t*)buf; buf+=n4;
    lp->trans_map = (int*)buf;

    // Per-argument defaults: inputs are read-only, outputs write-only.
    for (j=0; j<narg; j++) {
        LARG(lp,j).value = Qnil;
        LARG(lp,j).iter = NULL;
        LARG(lp,j).shape = NULL;
        LARG(lp,j).ndim = 0;
        lp->xargs[j].iter = &(iter[(max_nd+1)*j]);
        lp->xargs[j].bufcp = NULL;
        lp->xargs[j].flag = (j<lp->nin) ? NDL_READ : NDL_WRITE;
        lp->xargs[j].free_user_iter = 0;
    }

    // All dimensions start as size 1 with zeroed iterators.
    for (i=0; i<=max_nd; i++) {
        lp->n[i] = 1;
        for (j=0; j<narg; j++) {
            LITER(lp,i,j).pos = 0;
            LITER(lp,i,j).step = 0;
            LITER(lp,i,j).idx = NULL;
        }
    }

    // transpose reduce-dimensions to last dimensions
    // array loop
    // [*,+,*,+,*] => [*,*,*,+,+]
    // trans_map=[0,3,1,4,2] <= [0,1,2,3,4]
    if (NDF_TEST(nf,NDF_FLAT_REDUCE) && RTEST(lp->reduce)) {
        trans_dim = 0;
        // first pass: number the non-reduced dims, mark reduced ones
        for (i=0; i<max_nd; i++) {
            if (na_test_reduce(lp->reduce, i)) {
                lp->trans_map[i] = -1;
            } else {
                lp->trans_map[i] = trans_dim++;
            }
        }
        // second pass: reduced dims take the remaining (trailing) slots
        j = trans_dim;
        for (i=0; i<max_nd; i++) {
            if (lp->trans_map[i] == -1) {
                lp->trans_map[i] = j++;
            }
        }
        lp->reduce_dim = max_nd - trans_dim;
        // rebuild the reduce bitmask for the now-trailing dimensions
        f = 0;
        for (i=trans_dim; i<max_nd; i++) {
            f |= 1<<i;
        }
        lp->reduce = INT2FIX(f);
    } else {
        // no flat reduce: identity mapping, nothing to reduce
        for (i=0; i<max_nd; i++) {
            lp->trans_map[i] = i;
        }
        lp->reduce_dim = 0;
    }
}
445
+
446
+
447
+ static VALUE
448
+ ndloop_release(VALUE vlp)
449
+ {
450
+ int j;
451
+ VALUE v;
452
+ na_md_loop_t *lp = (na_md_loop_t*)(vlp);
453
+
454
+ for (j=0; j < lp->narg; j++) {
455
+ v = LARG(lp,j).value;
456
+ if (IsNArray(v)) {
457
+ na_release_lock(v);
458
+ }
459
+ }
460
+ for (j=0; j<lp->narg; j++) {
461
+ //printf("lp->xargs[%d].bufcp=%lx\n",j,(size_t)(lp->xargs[j].bufcp));
462
+ if (lp->xargs[j].bufcp) {
463
+ xfree(lp->xargs[j].bufcp->buf_iter);
464
+ if (cumo_cuda_runtime_is_device_memory(lp->xargs[j].bufcp->buf_ptr)) {
465
+ cumo_cuda_runtime_free(lp->xargs[j].bufcp->buf_ptr);
466
+ }
467
+ else {
468
+ xfree(lp->xargs[j].bufcp->buf_ptr);
469
+ }
470
+ xfree(lp->xargs[j].bufcp->n);
471
+ xfree(lp->xargs[j].bufcp);
472
+ if (lp->xargs[j].free_user_iter) {
473
+ xfree(LARG(lp,j).iter);
474
+ }
475
+ }
476
+ }
477
+ xfree(lp->ptr);
478
+ return Qnil;
479
+ }
480
+
481
+
482
+ /*
483
+ set lp->n[i] (shape of n-d iteration) here
484
+ */
485
+ static void
486
+ ndloop_check_shape(na_md_loop_t *lp, int nf_dim, narray_t *na)
487
+ {
488
+ int i, k;
489
+ size_t n;
490
+ int dim_beg;
491
+
492
+ dim_beg = lp->ndim + nf_dim - na->ndim;
493
+
494
+ for (k = na->ndim - nf_dim - 1; k>=0; k--) {
495
+ i = lp->trans_map[k + dim_beg];
496
+ n = na->shape[k];
497
+ // if n==1 then repeat this dimension
498
+ if (n != 1) {
499
+ if (lp->n[i] == 1) {
500
+ lp->n[i] = n;
501
+ } else if (lp->n[i] != n) {
502
+ // inconsistent array shape
503
+ rb_raise(nary_eShapeError,"shape1[%d](=%"SZF"u) != shape2[%d](=%"SZF"u)",
504
+ i, lp->n[i], k, n);
505
+ }
506
+ }
507
+ }
508
+ }
509
+
510
+
511
+ /*
512
+ na->shape[i] == lp->n[ dim_map[i] ]
513
+ */
514
+ static void
515
+ ndloop_set_stepidx(na_md_loop_t *lp, int j, VALUE vna, int *dim_map, int rwflag)
516
+ {
517
+ size_t n, s;
518
+ int i, k, nd;
519
+ stridx_t sdx;
520
+ narray_t *na;
521
+
522
+ LARG(lp,j).value = vna;
523
+ LARG(lp,j).elmsz = nary_element_stride(vna);
524
+ if (rwflag == NDL_READ) {
525
+ LARG(lp,j).ptr = na_get_pointer_for_read(vna);
526
+ } else
527
+ if (rwflag == NDL_WRITE) {
528
+ LARG(lp,j).ptr = na_get_pointer_for_write(vna);
529
+ } else
530
+ if (rwflag == NDL_READ_WRITE) {
531
+ LARG(lp,j).ptr = na_get_pointer_for_read_write(vna);
532
+ } else {
533
+ rb_bug("invalid value for read-write flag");
534
+ }
535
+ GetNArray(vna,na);
536
+ nd = LARG(lp,j).ndim;
537
+
538
+ switch(NA_TYPE(na)) {
539
+ case NARRAY_DATA_T:
540
+ if (NA_DATA_PTR(na)==NULL && NA_SIZE(na)>0) {
541
+ rb_bug("cannot read no-data NArray");
542
+ rb_raise(rb_eRuntimeError,"cannot read no-data NArray");
543
+ }
544
+ // through
545
+ case NARRAY_FILEMAP_T:
546
+ s = LARG(lp,j).elmsz;
547
+ for (k=na->ndim; k--;) {
548
+ n = na->shape[k];
549
+ if (n > 1 || nd > 0) {
550
+ i = dim_map[k];
551
+ //printf("n=%d k=%d i=%d\n",n,k,i);
552
+ LITER(lp,i,j).step = s;
553
+ //LITER(lp,i,j).idx = NULL;
554
+ }
555
+ s *= n;
556
+ nd--;
557
+ }
558
+ LITER(lp,0,j).pos = 0;
559
+ break;
560
+ case NARRAY_VIEW_T:
561
+ LITER(lp,0,j).pos = NA_VIEW_OFFSET(na);
562
+ for (k=0; k<na->ndim; k++) {
563
+ n = na->shape[k];
564
+ sdx = NA_VIEW_STRIDX(na)[k];
565
+ if (n > 1 || nd > 0) {
566
+ i = dim_map[k];
567
+ if (SDX_IS_INDEX(sdx)) {
568
+ LITER(lp,i,j).step = 0;
569
+ LITER(lp,i,j).idx = SDX_GET_INDEX(sdx);
570
+ } else {
571
+ LITER(lp,i,j).step = SDX_GET_STRIDE(sdx);
572
+ //LITER(lp,i,j).idx = NULL;
573
+ }
574
+ } else if (n==1) {
575
+ if (SDX_IS_INDEX(sdx)) {
576
+ SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("ndloop_set_stepidx", "any");
577
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
578
+ LITER(lp,0,j).pos += SDX_GET_INDEX(sdx)[0];
579
+ }
580
+ }
581
+ nd--;
582
+ }
583
+ break;
584
+ default:
585
+ rb_bug("invalid narray internal type");
586
+ }
587
+ }
588
+
589
+
590
+
591
+ static void
592
+ ndloop_init_args(ndfunc_t *nf, na_md_loop_t *lp, VALUE args)
593
+ {
594
+ int i, j;
595
+ VALUE v;
596
+ narray_t *na;
597
+ int nf_dim;
598
+ int dim_beg;
599
+ int *dim_map;
600
+ int max_nd = lp->ndim + lp->user.ndim;
601
+ int flag;
602
+ size_t s;
603
+
604
+ /*
605
+ na->shape[i] == lp->n[ dim_map[i] ]
606
+ */
607
+ dim_map = ALLOCA_N(int, max_nd);
608
+
609
+ // input arguments
610
+ for (j=0; j<nf->nin; j++) {
611
+ if (TYPE(nf->ain[j].type)==T_SYMBOL) {
612
+ continue;
613
+ }
614
+ v = RARRAY_AREF(args,j);
615
+ if (IsNArray(v)) {
616
+ // set LARG(lp,j) with v
617
+ GetNArray(v,na);
618
+ nf_dim = nf->ain[j].dim;
619
+ if (nf_dim > na->ndim) {
620
+ rb_raise(nary_eDimensionError,"requires >= %d-dimensioal array "
621
+ "while %d-dimensional array is given",nf_dim,na->ndim);
622
+ }
623
+ ndloop_check_shape(lp, nf_dim, na);
624
+ dim_beg = lp->ndim + nf->ain[j].dim - na->ndim;
625
+ for (i=0; i<na->ndim; i++) {
626
+ dim_map[i] = lp->trans_map[i+dim_beg];
627
+ //printf("dim_map[%d]=%d na->shape[%d]=%d\n",i,dim_map[i],i,na->shape[i]);
628
+ }
629
+ if (nf->ain[j].type==OVERWRITE) {
630
+ lp->xargs[j].flag = flag = NDL_WRITE;
631
+ } else {
632
+ lp->xargs[j].flag = flag = NDL_READ;
633
+ }
634
+ LARG(lp,j).ndim = nf_dim;
635
+ ndloop_set_stepidx(lp, j, v, dim_map, flag);
636
+ if (nf_dim > 0) {
637
+ LARG(lp,j).shape = na->shape + (na->ndim - nf_dim);
638
+ }
639
+ } else if (TYPE(v)==T_ARRAY) {
640
+ LARG(lp,j).value = v;
641
+ LARG(lp,j).elmsz = sizeof(VALUE);
642
+ LARG(lp,j).ptr = NULL;
643
+ for (i=0; i<=max_nd; i++) {
644
+ LITER(lp,i,j).step = 1;
645
+ }
646
+ }
647
+ }
648
+ // check whether # of element is zero
649
+ for (s=1,i=0; i<=max_nd; i++) {
650
+ s *= lp->n[i];
651
+ }
652
+ if (s==0) {
653
+ for (i=0; i<=max_nd; i++) {
654
+ lp->n[i] = 0;
655
+ }
656
+ }
657
+ }
658
+
659
+
660
+ static int
661
+ ndloop_check_inplace(VALUE type, int na_ndim, size_t *na_shape, VALUE v)
662
+ {
663
+ int i;
664
+ narray_t *na;
665
+
666
+ // type check
667
+ if (type != CLASS_OF(v)) {
668
+ return 0;
669
+ }
670
+ GetNArray(v,na);
671
+ // shape check
672
+ if (na->ndim != na_ndim) {
673
+ return 0;
674
+ }
675
+ for (i=0; i<na_ndim; i++) {
676
+ if (na_shape[i] != na->shape[i]) {
677
+ return 0;
678
+ }
679
+ }
680
+ // v is selected as output
681
+ return 1;
682
+ }
683
+
684
+ static VALUE
685
+ ndloop_find_inplace(ndfunc_t *nf, na_md_loop_t *lp, VALUE type,
686
+ int na_ndim, size_t *na_shape, VALUE args)
687
+ {
688
+ int j;
689
+ VALUE v;
690
+
691
+ // find inplace
692
+ for (j=0; j<nf->nin; j++) {
693
+ v = RARRAY_AREF(args,j);
694
+ if (IsNArray(v)) {
695
+ if (TEST_INPLACE(v)) {
696
+ if (ndloop_check_inplace(type,na_ndim,na_shape,v)) {
697
+ // if already copied, create outary and write-back
698
+ if (lp->copy_flag & (1<<j)) {
699
+ lp->writeback = j;
700
+ }
701
+ return v;
702
+ }
703
+ }
704
+ }
705
+ }
706
+ // find casted or copied input array
707
+ for (j=0; j<nf->nin; j++) {
708
+ if (lp->copy_flag & (1<<j)) {
709
+ v = RARRAY_AREF(args,j);
710
+ if (ndloop_check_inplace(type,na_ndim,na_shape,v)) {
711
+ return v;
712
+ }
713
+ }
714
+ }
715
+ return Qnil;
716
+ }
717
+
718
+
719
+
720
+ static VALUE
721
+ ndloop_get_arg_type(ndfunc_t *nf, VALUE args, VALUE t)
722
+ {
723
+ int i;
724
+
725
+ // if type is FIXNUM, get the type of i-th argument
726
+ if (FIXNUM_P(t)) {
727
+ i = FIX2INT(t);
728
+ if (i<0 || i>=nf->nin) {
729
+ rb_bug("invalid type: index (%d) out of # of args",i);
730
+ }
731
+ t = nf->ain[i].type;
732
+ // if i-th type is Qnil, get the type of i-th input value
733
+ if (!CASTABLE(t)) {
734
+ t = CLASS_OF(RARRAY_AREF(args,i));
735
+ }
736
+ }
737
+ return t;
738
+ }
739
+
740
+
741
/*
   Build (or reuse) the k-th output NArray and wire it into the loop.

   The output shape is assembled from the md-loop shape (skipping or
   keeping reduced dimensions per NDF_CUM / NDF_KEEP_DIM) followed by any
   user-specified trailing dimensions from nf->aout[k].shape. For the
   first output of an NDF_INPLACE function, an input array may be reused
   instead of allocating. Returns the output NArray.
 */
static VALUE
ndloop_set_output_narray(ndfunc_t *nf, na_md_loop_t *lp, int k,
                         VALUE type, VALUE args)
{
    int i, j;
    int na_ndim;
    int lp_dim;
    volatile VALUE v=Qnil;
    size_t *na_shape;
    int *dim_map;
    int flag = NDL_READ_WRITE;
    int nd;
    int max_nd = lp->ndim + nf->aout[k].dim;

    na_shape = ALLOCA_N(size_t, max_nd);
    dim_map = ALLOCA_N(int, max_nd);

    //printf("max_nd=%d lp->ndim=%d\n",max_nd,lp->ndim);

    // md-loop shape
    na_ndim = 0;
    for (i=0; i<lp->ndim; i++) {
        // na_shape[i] == lp->n[lp->trans_map[i]]
        lp_dim = lp->trans_map[i];
        //printf("i=%d lp_dim=%d\n",i,lp_dim);
        if (NDF_TEST(nf,NDF_CUM)) { // cumulate with shape kept
            na_shape[na_ndim] = lp->n[lp_dim];
        } else
        if (na_test_reduce(lp->reduce,lp_dim)) { // accumulate dimension
            if (NDF_TEST(nf,NDF_KEEP_DIM)) {
                na_shape[na_ndim] = 1; // leave it
            } else {
                continue; // delete dimension
            }
        } else {
            na_shape[na_ndim] = lp->n[lp_dim];
        }
        //printf("i=%d lp_dim=%d na_shape[%d]=%ld\n",i,lp_dim,i,na_shape[i]);
        dim_map[na_ndim++] = lp_dim;
        //dim_map[lp_dim] = na_ndim++;
    }

    // user-specified shape
    for (i=0; i<nf->aout[k].dim; i++) {
        na_shape[na_ndim] = nf->aout[k].shape[i];
        dim_map[na_ndim++] = i + lp->ndim;
    }

    // find inplace from input arrays
    if (k==0 && NDF_TEST(nf,NDF_INPLACE)) {
        v = ndloop_find_inplace(nf,lp,type,na_ndim,na_shape,args);
    }
    if (!RTEST(v)) {
        // new object; freshly allocated memory is write-only
        v = nary_new(type, na_ndim, na_shape);
        flag = NDL_WRITE;
    }

    // register output in the argument slot after the inputs
    j = lp->nin + k;
    LARG(lp,j).ndim = nd = nf->aout[k].dim;
    ndloop_set_stepidx(lp, j, v, dim_map, flag);
    if (nd > 0) {
        LARG(lp,j).shape = nf->aout[k].shape;
    }

    return v;
}
808
+
809
/*
   Create all output objects for the loop and return them as a Ruby Array.

   Each declared output type is resolved via ndloop_get_arg_type; NArray
   outputs are built with ndloop_set_output_narray, Ruby Array outputs are
   iterated as flat VALUE sequences. If an initializer input slot is
   declared (lp->init_aidx), its value is stored into the corresponding
   output before the loop runs.
 */
static VALUE
ndloop_set_output(ndfunc_t *nf, na_md_loop_t *lp, VALUE args)
{
    int i, j, k, idx;
    volatile VALUE v, t, results;
    VALUE init;

    int max_nd = lp->ndim + lp->user.ndim;

    // output results
    results = rb_ary_new2(nf->nout);

    for (k=0; k<nf->nout; k++) {
        t = nf->aout[k].type;
        t = ndloop_get_arg_type(nf,args,t);

        if (rb_obj_is_kind_of(t, rb_cClass)) {
            if (RTEST(rb_class_inherited_p(t, cNArray))) {
                // NArray
                v = ndloop_set_output_narray(nf,lp,k,t,args);
                rb_ary_push(results, v);
            }
            else if (RTEST(rb_class_inherited_p(t, rb_cArray))) {
                // Ruby Array
                // NOTE(review): unlike the NArray branch, nothing is pushed
                // onto `results` here — confirm this is intended.
                j = lp->nin + k;
                for (i=0; i<=max_nd; i++) {
                    LITER(lp,i,j).step = sizeof(VALUE);
                }
                LARG(lp,j).value = t;
                LARG(lp,j).elmsz = sizeof(VALUE);
            } else {
                rb_raise(rb_eRuntimeError,"ndloop_set_output: invalid for type");
            }
        }
    }

    // initializer: store the declared init input into its output slot
    k = lp->init_aidx;
    if (k > -1) {
        idx = nf->ain[k].dim;
        v = RARRAY_AREF(results,idx);
        init = RARRAY_AREF(args,k);
        na_store(v,init);
    }

    return results;
}
856
+
857
+
858
// Compressing dimensions.
//
// For example, compressing [2,3] shape into [6] so that we can process
// all elements with one user loop.
//
// Two adjacent dimensions (i-1, i) are merged when, for every argument,
// neither uses an index table and the outer step equals inner step *
// inner extent (i.e. the pair is effectively contiguous), and both are on
// the same side of the reduce mask. Merged-away slots are shifted to the
// front, then lp->n / lp->xargs[].iter are re-based past them.
static void
ndfunc_contract_loop(na_md_loop_t *lp)
{
    int i,j,k,success,cnt=0;
    int red0, redi;

    redi = na_test_reduce(lp->reduce,0);

    //for (i=0; i<lp->ndim; i++) {
    //    printf("lp->n[%d]=%lu\n",i,lp->n[i]);
    //}

    for (i=1; i<lp->ndim; i++) {
        red0 = redi;
        redi = na_test_reduce(lp->reduce,i);
        //printf("contract i=%d reduce_cond=%d %d\n",i,red0,redi);
        // only contract dimensions with the same reduce status
        if (red0 != redi) {
            continue;
        }
        // contiguity check over every argument
        success = 1;
        for (j=0; j<lp->narg; j++) {
            if (!(LITER(lp,i,j).idx == NULL &&
                  LITER(lp,i-1,j).idx == NULL &&
                  LITER(lp,i-1,j).step == LITER(lp,i,j).step*(ssize_t)(lp->n[i]))) {
                success = 0;
                break;
            }
        }
        if (success) {
            //printf("contract i=%d-th and %d-th, lp->n[%d]=%"SZF"d, lp->n[%d]=%"SZF"d\n",
            //       i-1,i, i,lp->n[i], i-1,lp->n[i-1]);
            // contract (i-1)-th and i-th dimension
            lp->n[i] *= lp->n[i-1];
            // shift dimensions, leaving size-1 padding at the front
            for (k=i-1; k>cnt; k--) {
                lp->n[k] = lp->n[k-1];
            }
            //printf("k=%d\n",k);
            for (; k>=0; k--) {
                lp->n[k] = 1;
            }
            for (j=0; j<lp->narg; j++) {
                for (k=i-1; k>cnt; k--) {
                    LITER(lp,k,j) = LITER(lp,k-1,j);
                }
            }
            if (redi) {
                lp->reduce_dim--;
            }
            cnt++;
        }
    }
    //printf("contract cnt=%d\n",cnt);
    if (cnt>0) {
        // drop the cnt leading padding dimensions by re-basing pointers
        for (j=0; j<lp->narg; j++) {
            LITER(lp,cnt,j).pos = LITER(lp,0,j).pos;
            lp->xargs[j].iter = &LITER(lp,cnt,j);
        }
        lp->n = &(lp->n[cnt]);
        lp->ndim -= cnt;
        //for (i=0; i<lp->ndim; i++) {printf("lp->n[%d]=%lu\n",i,lp->n[i]);}
    }
}
925
+
926
+
927
// Ndloop does loop at two places, loop_narray and user loop.
// loop_narray is an outer loop, and the user loop is an internal loop.
//
// lp->ndim: ndim to be looped at loop_narray
// lp->user.ndim: ndim to be looped at user function
//
// For example, for element-wise function, lp->user.ndim is 1, and lp->ndim -= 1.
//
// This function moves `ud` trailing dimensions from the outer loop into
// the user loop: all reduce dimensions for reductions, or one dimension
// for NDF_HAS_LOOP element-wise functions. It then points each argument's
// user-visible shape/iterator at the tail of the md-loop arrays.
static void
ndfunc_set_user_loop(ndfunc_t *nf, na_md_loop_t *lp)
{
    int j, ud=0;

    if (lp->reduce_dim > 0) {
        // Increase user.ndim by number of dimensions to reduce for reduction function.
        ud = lp->reduce_dim;
    }
    else if (lp->ndim > 0 && NDF_TEST(nf,NDF_HAS_LOOP)) {
        // Set user.ndim to 1 (default is 0) for element-wise function.
        ud = 1;
    }
    else {
        // nothing to move into the user loop
        goto skip_ud;
    }
    if (ud > lp->ndim) {
        rb_bug("Reduce-dimension is larger than loop-dimension");
    }
    // Increase user loop dimension. NOTE: lp->ndim + lp->user.ndim is the total dimension.
    lp->user.ndim += ud;
    lp->ndim -= ud;
    for (j=0; j<lp->narg; j++) {
        // a pre-set shape means the ndfunc declared explicit user
        // dimensions, which cannot be combined with ud
        if (LARG(lp,j).shape) {
            rb_bug("HAS_LOOP or reduce-dimension=%d conflicts with user-dimension",lp->reduce_dim);
        }
        LARG(lp,j).ndim += ud;
        LARG(lp,j).shape = &(lp->n[lp->ndim]);
        //printf("LARG(lp,j).ndim=%d,LARG(lp,j).shape=%lx\n",LARG(lp,j).ndim,(size_t)LARG(lp,j).shape);
    }
    //printf("lp->reduce_dim=%d lp->user.ndim=%d lp->ndim=%d\n",lp->reduce_dim,lp->user.ndim,lp->ndim);

 skip_ud:
    // user function shape is the latter part of na_md_loop shape.
    lp->user.n = &(lp->n[lp->ndim]);
    for (j=0; j<lp->narg; j++) {
        LARG(lp,j).iter = &LITER(lp,lp->ndim,j);
        //printf("in ndfunc_set_user_loop: lp->user.args[%d].iter=%lx\n",j,(size_t)(LARG(lp,j).iter));
    }
}
974
+
975
+
976
// Initialize lp->user for indexer loop.
//
// All outer-loop dimensions are handed to the user (GPU indexer) function:
// lp->user.ndim takes over lp->ndim and the outer loop degenerates to a
// single call. For NDF_FLAT_REDUCE only the input (arg 0) gets the full
// shape here; element-wise indexer loops set every argument's shape.
static void
ndfunc_set_user_indexer_loop(ndfunc_t *nf, na_md_loop_t *lp)
{
    int j;

    // hand the whole outer loop to the user function
    lp->user.ndim = lp->ndim;
    lp->ndim = 0;

    if (NDF_TEST(nf,NDF_FLAT_REDUCE)) {
        // in
        LARG(lp,0).ndim = lp->user.ndim;
        LARG(lp,0).shape = &(lp->n[lp->ndim]);
        // out is constructed at na_make_reduction_arg from in and lp->reduce

        lp->user.n = &(lp->n[lp->ndim]);
        for (j=0; j<lp->narg; j++) {
            LARG(lp,j).iter = &LITER(lp,lp->ndim,j);
        }

        // reduction info is consumed inside the user function
        lp->user.reduce_dim = lp->reduce_dim;
        lp->user.reduce = lp->reduce;
    } else { // element-wise
        for (j=0; j<lp->narg; j++) {
            LARG(lp,j).ndim = lp->user.ndim;
            LARG(lp,j).shape = &(lp->n[lp->ndim]);
        }

        lp->user.n = &(lp->n[lp->ndim]);
        for (j=0; j<lp->narg; j++) {
            LARG(lp,j).iter = &LITER(lp,lp->ndim,j);
        }

        // no reduction in the element-wise case
        lp->user.reduce_dim = 0;
        lp->user.reduce = 0;
    }
}
1013
+
1014
+
1015
// Judge whether a (contiguous) buffer copy is required or not, and malloc if it is required.
//
// CASES TO REQUIRE A BUFFER COPY:
// 1) ndloop has `idx` but does not support NDF_INDEX_LOOP.
// 2) ndloop has non-contiguous arrays but does not support NDF_STRIDE_LOOP.
//
// For each argument the user-loop iterators are scanned: f collects which
// loop features (index / non-contiguous stride) the data needs, and when
// those exceed what the ndfunc supports (loop_spec), a contiguous work
// buffer plus an na_buffer_copy_t descriptor is allocated and the
// argument is redirected to it. Finally, multi-dimensional reductions are
// flattened into a single user dimension of n_total elements.
static void
ndfunc_set_bufcp(ndfunc_t *nf, na_md_loop_t *lp)
{
    unsigned int f;
    int i, j;
    int nd, ndim;
    bool zero_step;
    ssize_t n, sz, elmsz, stride, n_total; //, last_step;
    size_t *buf_shape;
    na_loop_iter_t *buf_iter=NULL, *src_iter;

    unsigned int loop_spec = ndloop_func_loop_spec(nf, lp->user.ndim);
    //if (loop_spec==0) return;

    // total element count over the user-loop dimensions
    n_total = lp->user.n[0];
    for (i=1; i<lp->user.ndim; i++) {
        n_total *= lp->user.n[i];
    }

    //for (j=0; j<lp->nin; j++) {
    for (j=0; j<lp->narg; j++) {
        //ndim = nd = lp->user.ndim;
        ndim = nd = LARG(lp,j).ndim;
        sz = elmsz = LARG(lp,j).elmsz;
        src_iter = LARG(lp,j).iter;
        //last_step = src_iter[ndim-1].step;
        f = 0;          // bit 1: strided, bit 2: indexed
        zero_step = 1;  // stays true for broadcast-only (step 0) args
        // walk dimensions innermost-first, classifying each
        for (i=ndim; i>0; ) {
            i--;
            if (LARG(lp,j).shape) {
                n = LARG(lp,j).shape[i];
            } else {
                //printf("shape is NULL\n");
                n = lp->user.n[i];
            }
            stride = sz * n;
            //printf("{j=%d,i=%d,ndim=%d,nd=%d,idx=%lx,step=%ld,n=%ld,sz=%ld,stride=%ld}\n",j,i,ndim,nd,(size_t)src_iter[i].idx,src_iter[i].step,n,sz,stride);
            if (src_iter[i].idx) {
                f |= 2; // INDEX LOOP
                zero_step = 0;
            } else {
                if (src_iter[i].step != sz) {
                    f |= 1; // NON_CONTIGUOUS LOOP
                } else {
                    // CONTIGUOUS LOOP
                    if (i==ndim-1) { // contract if last dimension
                        ndim = i;
                        elmsz = stride;
                    }
                }
                if (src_iter[i].step != 0) {
                    zero_step = 0;
                }
            }
            sz = stride;
        }
        //printf("[j=%d f=%d loop_spec=%d zero_step=%d]\n",j,f,loop_spec,zero_step);

        if (zero_step) {
            // no buffer needed
            continue;
        }

        // should check flatten-able loop to avoid buffering


        // over loop_spec or reduce_loop is not contiguous
        if (f & loop_spec || (lp->reduce_dim > 1 && ndim > 0)) {
            //printf("(buf,nd=%d)",nd);
            // build a contiguous iterator set for the work buffer
            buf_iter = ALLOC_N(na_loop_iter_t,nd+3);
            buf_shape = ALLOC_N(size_t,nd);
            buf_iter[nd].pos = 0;
            buf_iter[nd].step = 0;
            buf_iter[nd].idx = NULL;
            sz = LARG(lp,j).elmsz;
            //last_step = sz;
            for (i=nd; i>0; ) {
                i--;
                buf_iter[i].pos = 0;
                buf_iter[i].step = sz;
                buf_iter[i].idx = NULL;
                //n = lp->user.n[i];
                n = LARG(lp,j).shape[i];
                buf_shape[i] = n;
                sz *= n;
            }
            // record copy descriptor and swap the argument onto the buffer
            LBUFCP(lp,j) = ALLOC(na_buffer_copy_t);
            LBUFCP(lp,j)->ndim = ndim;
            LBUFCP(lp,j)->elmsz = elmsz;
            LBUFCP(lp,j)->n = buf_shape;
            LBUFCP(lp,j)->src_iter = src_iter;
            LBUFCP(lp,j)->buf_iter = buf_iter;
            LARG(lp,j).iter = buf_iter;
            //printf("in ndfunc_set_bufcp(1): lp->user.args[%d].iter=%lx\n",j,(size_t)(LARG(lp,j).iter));
            LBUFCP(lp,j)->src_ptr = LARG(lp,j).ptr;
            // allocate the buffer in the same memory space as the source
            if (cumo_cuda_runtime_is_device_memory(LARG(lp,j).ptr)) {
                LARG(lp,j).ptr = LBUFCP(lp,j)->buf_ptr = cumo_cuda_runtime_malloc(sz);
            }
            else {
                LARG(lp,j).ptr = LBUFCP(lp,j)->buf_ptr = xmalloc(sz);
            }
            //printf("(LBUFCP(lp,%d)->buf_ptr=%lx)\n",j,(size_t)(LBUFCP(lp,j)->buf_ptr));
        }
    }

#if 0
    for (j=0; j<lp->narg; j++) {
        ndim = lp->user.ndim;
        src_iter = LARG(lp,j).iter;
        last_step = src_iter[ndim-1].step;
        if (lp->reduce_dim>1) {
            //printf("(reduce_dim=%d,ndim=%d,nd=%d,n=%ld,lst=%ld)\n",lp->reduce_dim,ndim,nd,n_total,last_step);
            buf_iter = ALLOC_N(na_loop_iter_t,2);
            buf_iter[0].pos = LARG(lp,j).iter[0].pos;
            buf_iter[0].step = last_step;
            buf_iter[0].idx = NULL;
            buf_iter[1].pos = 0;
            buf_iter[1].step = 0;
            buf_iter[1].idx = NULL;
            LARG(lp,j).iter = buf_iter;
            //printf("in ndfunc_set_bufcp(2): lp->user.args[%d].iter=%lx\n",j,(size_t)(LARG(lp,j).iter));
            lp->xargs[j].free_user_iter = 1;
        }
    }
#endif

    // flatten reduce dimensions
    if (lp->reduce_dim > 1) {
#if 1
        for (j=0; j<lp->narg; j++) {
            ndim = lp->user.ndim;
            LARG(lp,j).iter[0].step = LARG(lp,j).iter[ndim-1].step;
            LARG(lp,j).iter[0].idx = NULL;
        }
#endif
        lp->user.n[0] = n_total;
        lp->user.ndim = 1;
    }
}
1160
+
1161
+
1162
// Make contiguous memory for ops not supporting index or stride (step) loop.
//
// Copies every element reachable through lp->src_iter (stride/index walk)
// into the contiguous work buffer lp->buf_ptr, using an odometer counter
// over lp->ndim dimensions. Device-to-device copies go through
// cudaMemcpyAsync; any host-side endpoint falls back to memcpy.
static void
ndloop_copy_to_buffer(na_buffer_copy_t *lp)
{
    size_t *c;
    char *src, *buf;
    int i;
    int nd = lp->ndim;
    size_t elmsz = lp->elmsz;
    size_t buf_pos = 0;
    DBG(size_t j);

    //printf("\nto_buf nd=%d elmsz=%ld\n",nd,elmsz);
    DBG(printf("<to buf> ["));
    // zero-dimension: a single element copy
    if (nd==0) {
        src = lp->src_ptr + LITER_SRC(lp,0).pos;
        buf = lp->buf_ptr;
        if (cumo_cuda_runtime_is_device_memory(src) && cumo_cuda_runtime_is_device_memory(buf)) {
            DBG(printf("DtoD] ["));
            cumo_cuda_runtime_check_status(cudaMemcpyAsync(buf,src,elmsz,cudaMemcpyDeviceToDevice,0));
        } else {
            DBG(printf("HtoH] ["));
            memcpy(buf,src,elmsz);
        }
        DBG(for (j=0; j<elmsz/8; j++) {printf("%g,",((double*)(buf))[j]);});
        goto loop_end;
    }
    // initialize loop counter
    c = ALLOCA_N(size_t, nd+1);
    for (i=0; i<=nd; i++) c[i]=0;
    // loop body
    for (i=0;;) {
        // i-th dimension: accumulate positions down to the innermost level
        for (; i<nd; i++) {
            if (LITER_SRC(lp,i).idx) {
                // idx may be device memory: synchronize before the host
                // dereference below.
                SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("ndloop_copy_to_buffer", "any");
                cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
                LITER_SRC(lp,i+1).pos = LITER_SRC(lp,i).pos + LITER_SRC(lp,i).idx[c[i]];
            } else {
                LITER_SRC(lp,i+1).pos = LITER_SRC(lp,i).pos + LITER_SRC(lp,i).step*c[i];
            }
        }
        src = lp->src_ptr + LITER_SRC(lp,nd).pos;
        buf = lp->buf_ptr + buf_pos;
        if (cumo_cuda_runtime_is_device_memory(src) && cumo_cuda_runtime_is_device_memory(buf)) {
            DBG(printf("DtoD] ["));
            cumo_cuda_runtime_check_status(cudaMemcpyAsync(buf,src,elmsz,cudaMemcpyDeviceToDevice,0));
        } else {
            DBG(printf("HtoH] ["));
            memcpy(buf,src,elmsz);
        }
        DBG(for (j=0; j<elmsz/8; j++) {printf("%g,",((double*)(buf))[j]);});
        buf_pos += elmsz;
        // count up (odometer): advance innermost counter, carry outward
        for (;;) {
            if (i<=0) goto loop_end;
            i--;
            if (++c[i] < lp->n[i]) break;
            c[i] = 0;
        }
    }
 loop_end:
    ;
    DBG(printf("]\n"));
}
1228
+
1229
// Reverse of ndloop_copy_to_buffer: write the contiguous work buffer back
// through the stride/index walk into the source array.
//
// NOTE(review): unlike ndloop_copy_to_buffer, the idx table is
// dereferenced here without a preceding cudaDeviceSynchronize() — confirm
// idx can never be (pending) device memory on this path.
static void
ndloop_copy_from_buffer(na_buffer_copy_t *lp)
{
    size_t *c;
    char *src, *buf;
    int i;
    int nd = lp->ndim;
    size_t elmsz = lp->elmsz;
    size_t buf_pos = 0;
    DBG(size_t j);

    //printf("\nfrom_buf nd=%d elmsz=%ld\n",nd,elmsz);
    DBG(printf("<from buf> ["));
    // zero-dimension: a single element copy
    if (nd==0) {
        src = lp->src_ptr + LITER_SRC(lp,0).pos;
        buf = lp->buf_ptr;
        if (cumo_cuda_runtime_is_device_memory(src) && cumo_cuda_runtime_is_device_memory(buf)) {
            DBG(printf("DtoD] ["));
            cumo_cuda_runtime_check_status(cudaMemcpyAsync(src,buf,elmsz,cudaMemcpyDeviceToDevice,0));
        } else {
            DBG(printf("HtoH] ["));
            memcpy(src,buf,elmsz);
        }
        DBG(for (j=0; j<elmsz/8; j++) {printf("%g,",((double*)(src))[j]);});
        goto loop_end;
    }
    // initialize loop counter
    c = ALLOCA_N(size_t, nd+1);
    for (i=0; i<=nd; i++) c[i]=0;
    // loop body
    for (i=0;;) {
        // i-th dimension: accumulate positions down to the innermost level
        for (; i<nd; i++) {
            if (LITER_SRC(lp,i).idx) {
                LITER_SRC(lp,i+1).pos = LITER_SRC(lp,i).pos + LITER_SRC(lp,i).idx[c[i]];
            } else {
                LITER_SRC(lp,i+1).pos = LITER_SRC(lp,i).pos + LITER_SRC(lp,i).step*c[i];
            }
        }
        src = lp->src_ptr + LITER_SRC(lp,nd).pos;
        buf = lp->buf_ptr + buf_pos;
        if (cumo_cuda_runtime_is_device_memory(src) && cumo_cuda_runtime_is_device_memory(buf)) {
            DBG(printf("DtoD] ["));
            cumo_cuda_runtime_check_status(cudaMemcpyAsync(src,buf,elmsz,cudaMemcpyDeviceToDevice,0));
        } else {
            DBG(printf("HtoH] ["));
            memcpy(src,buf,elmsz);
        }
        DBG(for (j=0; j<elmsz/8; j++) {printf("%g,",((double*)(src))[j]);});
        buf_pos += elmsz;
        // count up (odometer): advance innermost counter, carry outward
        for (;;) {
            if (i<=0) goto loop_end;
            i--;
            if (++c[i] < lp->n[i]) break;
            c[i] = 0;
        }
    }
 loop_end:
    DBG(printf("]\n"));
}
1291
+
1292
+
1293
+ static void
1294
+ ndfunc_write_back(ndfunc_t *nf, na_md_loop_t *lp, VALUE orig_args, VALUE results)
1295
+ {
1296
+ VALUE src, dst;
1297
+
1298
+ if (lp->writeback >= 0) {
1299
+ dst = RARRAY_AREF(orig_args,lp->writeback);
1300
+ src = RARRAY_AREF(results,0);
1301
+ na_store(dst,src);
1302
+ RARRAY_ASET(results,0,dst);
1303
+ }
1304
+ }
1305
+
1306
+
1307
+ static VALUE
1308
+ ndloop_extract(VALUE results, ndfunc_t *nf)
1309
+ {
1310
+ // long n, i;
1311
+ // VALUE x, y;
1312
+ // narray_t *na;
1313
+
1314
+ // extract result objects
1315
+ switch(nf->nout) {
1316
+ case 0:
1317
+ return Qnil;
1318
+ case 1:
1319
+ return RARRAY_AREF(results,0);
1320
+ // x = RARRAY_AREF(results,0);
1321
+ // if (NDF_TEST(nf,NDF_EXTRACT)) {
1322
+ // if (IsNArray(x)){
1323
+ // GetNArray(x,na);
1324
+ // if (NA_NDIM(na)==0) {
1325
+ // x = rb_funcall(x, id_extract, 0);
1326
+ // }
1327
+ // }
1328
+ // }
1329
+ // return x;
1330
+ }
1331
+ // if (NDF_TEST(nf,NDF_EXTRACT)) {
1332
+ // n = RARRAY_LEN(results);
1333
+ // for (i=0; i<n; i++) {
1334
+ // x = RARRAY_AREF(results,i);
1335
+ // if (IsNArray(x)){
1336
+ // GetNArray(x,na);
1337
+ // if (NA_NDIM(na)==0) {
1338
+ // y = rb_funcall(x, id_extract, 0);
1339
+ // RARRAY_ASET(results,i,y);
1340
+ // }
1341
+ // }
1342
+ // }
1343
+ // }
1344
+ return results;
1345
+ }
1346
+
1347
+ static bool
1348
+ loop_is_using_idx(na_md_loop_t *lp)
1349
+ {
1350
+ int i, j;
1351
+ int nd = lp->ndim;
1352
+
1353
+ if (nd<0) {
1354
+ rb_bug("bug? lp->ndim = %d\n", lp->ndim);
1355
+ }
1356
+
1357
+ // i-th dimension
1358
+ for (i=0; i<nd; i++) {
1359
+ // j-th argument
1360
+ for (j=0; j<lp->narg; j++) {
1361
+ if (LITER(lp,i,j).idx) {
1362
+ return true;
1363
+ }
1364
+ }
1365
+ }
1366
+ return false;
1367
+ }
1368
+
1369
+ static void
1370
+ loop_narray(ndfunc_t *nf, na_md_loop_t *lp);
1371
+
1372
/*
   Run one complete ndloop: set up arguments and outputs, optimize the
   loop layout, execute, then write back and unwrap the results.

   Called through rb_ensure (see na_ndloop_main), so vlp is the
   na_md_loop_t* cast to VALUE. Steps, in order:
     1. duplicate args and initialize input iterators / outputs
     2. contract contiguous dimensions (CPU loop only)
     3. split dimensions between outer loop and user function
     4. set up contiguous buffer copies where the ndfunc needs them
     5. run the loop, re-raise any deferred user error
     6. write back in-place results and extract the return value
 */
static VALUE
ndloop_run(VALUE vlp)
{
    volatile VALUE args, orig_args, results;
    na_md_loop_t *lp = (na_md_loop_t*)(vlp);
    ndfunc_t *nf;

    orig_args = lp->vargs;
    nf = lp->ndfunc;

    // work on a shallow copy so write-back can still see the originals
    args = rb_obj_dup(orig_args);

    // setup ndloop iterator with arguments
    ndloop_init_args(nf, lp, args);
    results = ndloop_set_output(nf, lp, args);
    //if (na_debug_flag) {
    //    printf("-- ndloop_set_output --\n");
    //    print_ndloop(lp);
    //}

    // contract loop (compact dimensions)
    if (NDF_TEST(nf,NDF_INDEXER_LOOP) && NDF_TEST(nf,NDF_FLAT_REDUCE)) {
        // do nothing
        // TODO(sonots): support compacting dimensions in reduction indexer loop if it allows speed up.
    } else {
        if (lp->loop_func == loop_narray) {
            ndfunc_contract_loop(lp);
            if (na_debug_flag) {
                printf("-- ndfunc_contract_loop --\n");
                print_ndloop(lp);
            }
        }
    }

    // setup lp->user
    if (NDF_TEST(nf,NDF_INDEXER_LOOP)) {
        ndfunc_set_user_indexer_loop(nf, lp);
        if (na_debug_flag) {
            printf("-- ndfunc_set_user_indexer_loop --\n");
            print_ndloop(lp);
        }
    } else {
        ndfunc_set_user_loop(nf, lp);
        if (na_debug_flag) {
            printf("-- ndfunc_set_user_loop --\n");
            print_ndloop(lp);
        }
    }

    // setup buffering during loop
    if (NDF_TEST(nf,NDF_INDEXER_LOOP) && NDF_TEST(nf,NDF_FLAT_REDUCE) && !loop_is_using_idx(lp)) {
        // do nothing
    } else {
        if (lp->loop_func == loop_narray) {
            ndfunc_set_bufcp(nf, lp);
        }
        if (na_debug_flag) {
            printf("-- ndfunc_set_bufcp --\n");
            print_ndloop(lp);
        }
    }

    // loop
    (*(lp->loop_func))(nf, lp);

    // errors recorded by the user function are deferred to here
    if (RTEST(lp->user.err_type)) {
        rb_raise(lp->user.err_type, "error in NArray operation");
    }

    // write-back will be placed here
    ndfunc_write_back(nf, lp, orig_args, results);

    // extract result objects
    return ndloop_extract(results, nf);
}
1447
+
1448
+
1449
+ // ---------------------------------------------------------------------------
1450
+
1451
/*
   Outer loop driver: iterate the outer (non-user) dimensions with an
   odometer counter, and for each position copy needed arguments into
   contiguous buffers, invoke the user function, and copy writable
   buffers back. With no outer dimensions (or an indexer loop) the user
   function is invoked exactly once.
 */
static void
loop_narray(ndfunc_t *nf, na_md_loop_t *lp)
{
    size_t *c;
    int i, j;
    int nd = lp->ndim;

    if (nd<0) {
        rb_bug("bug? lp->ndim = %d\n", lp->ndim);
    }

    // degenerate case: single invocation of the user function
    if (nd==0 || NDF_TEST(nf,NDF_INDEXER_LOOP)) {
        for (j=0; j<lp->nin; j++) {
            if (lp->xargs[j].bufcp) {
                //printf("copy_to_buffer j=%d\n",j);
                ndloop_copy_to_buffer(lp->xargs[j].bufcp);
            }
        }
        (*(nf->func))(&(lp->user));
        for (j=0; j<lp->narg; j++) {
            // only writable buffered args need to be copied back
            if (lp->xargs[j].bufcp && (lp->xargs[j].flag & NDL_WRITE)) {
                //printf("copy_from_buffer j=%d\n",j);
                // copy data to work buffer
                ndloop_copy_from_buffer(lp->xargs[j].bufcp);
            }
        }
        return;
    }

    // initialize loop counter
    c = ALLOCA_N(size_t, nd+1);
    for (i=0; i<=nd; i++) c[i]=0;

    // loop body
    for (i=0;;) {
        // i-th dimension: accumulate per-argument positions inward
        for (; i<nd; i++) {
            // j-th argument
            for (j=0; j<lp->narg; j++) {
                if (LITER(lp,i,j).idx) {
                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).idx[c[i]];
                } else {
                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).step*c[i];
                }
                //printf("j=%d c[i=%d]=%lu pos=%lu\n",j,i,c[i],LITER(lp,i+1,j).pos);
            }
        }
        for (j=0; j<lp->nin; j++) {
            if (lp->xargs[j].bufcp) {
                // copy data to work buffer
                // cp lp->iter[j][nd..*] to lp->user.args[j].iter[0..*]
                //printf("copy_to_buffer j=%d\n",j);
                ndloop_copy_to_buffer(lp->xargs[j].bufcp);
            }
        }
        (*(nf->func))(&(lp->user));
        for (j=0; j<lp->narg; j++) {
            if (lp->xargs[j].bufcp && (lp->xargs[j].flag & NDL_WRITE)) {
                // copy data to work buffer
                //printf("copy_from_buffer j=%d\n",j);
                ndloop_copy_from_buffer(lp->xargs[j].bufcp);
            }
        }
        // stop immediately if the user function recorded an error
        if (RTEST(lp->user.err_type)) {return;}

        // count up (odometer): advance innermost counter, carry outward
        for (;;) {
            if (i<=0) goto loop_end;
            i--;
            if (++c[i] < lp->n[i]) break;
            c[i] = 0;
        }
    }
 loop_end:
    ;
}
1526
+
1527
+
1528
+ static VALUE
1529
+ na_ndloop_main(ndfunc_t *nf, VALUE args, void *opt_ptr)
1530
+ {
1531
+ unsigned int copy_flag;
1532
+ na_md_loop_t lp;
1533
+
1534
+ if (na_debug_flag) print_ndfunc(nf);
1535
+
1536
+ // cast arguments to NArray
1537
+ copy_flag = ndloop_cast_args(nf, args);
1538
+
1539
+ // allocate ndloop struct
1540
+ ndloop_alloc(&lp, nf, args, opt_ptr, copy_flag, loop_narray);
1541
+
1542
+ return rb_ensure(ndloop_run, (VALUE)&lp, ndloop_release, (VALUE)&lp);
1543
+ }
1544
+
1545
+
1546
/*
   Public varargs entry: collect argc VALUE arguments into a Ruby Array
   and run the ndloop without user option data. The K&R-style fallback
   definition supports pre-ANSI toolchains lacking <stdarg.h> prototypes.
 */
VALUE
#ifdef HAVE_STDARG_PROTOTYPES
na_ndloop(ndfunc_t *nf, int argc, ...)
#else
na_ndloop(nf, argc, va_alist)
  ndfunc_t *nf;
  int argc;
  va_dcl
#endif
{
    va_list ar;

    int i;
    VALUE *argv;
    volatile VALUE args;

    argv = ALLOCA_N(VALUE,argc);

    va_init_list(ar, argc);
    for (i=0; i<argc; i++) {
        argv[i] = va_arg(ar, VALUE);
    }
    va_end(ar);

    // wrap in a Ruby Array so the VALUEs are visible to the GC
    args = rb_ary_new4(argc, argv);

    return na_ndloop_main(nf, args, NULL);
}
1574
+
1575
+
1576
+ VALUE
1577
+ na_ndloop2(ndfunc_t *nf, VALUE args)
1578
+ {
1579
+ return na_ndloop_main(nf, args, NULL);
1580
+ }
1581
+
1582
/*
   Public varargs entry with an opaque user option pointer (passed through
   to the user iterator function). The K&R-style fallback definition
   supports pre-ANSI toolchains lacking <stdarg.h> prototypes.
 */
VALUE
#ifdef HAVE_STDARG_PROTOTYPES
na_ndloop3(ndfunc_t *nf, void *ptr, int argc, ...)
#else
na_ndloop3(nf, ptr, argc, va_alist)
  ndfunc_t *nf;
  void *ptr;
  int argc;
  va_dcl
#endif
{
    va_list ar;

    int i;
    VALUE *argv;
    volatile VALUE args;

    argv = ALLOCA_N(VALUE,argc);

    va_init_list(ar, argc);
    for (i=0; i<argc; i++) {
        argv[i] = va_arg(ar, VALUE);
    }
    va_end(ar);

    // wrap in a Ruby Array so the VALUEs are visible to the GC
    args = rb_ary_new4(argc, argv);

    return na_ndloop_main(nf, args, ptr);
}
1611
+
1612
+ VALUE
1613
+ na_ndloop4(ndfunc_t *nf, void *ptr, VALUE args)
1614
+ {
1615
+ return na_ndloop_main(nf, args, ptr);
1616
+ }
1617
+
1618
+ //----------------------------------------------------------------------
1619
+
1620
+ VALUE
1621
+ na_info_str(VALUE ary)
1622
+ {
1623
+ int nd, i;
1624
+ char tmp[32];
1625
+ VALUE buf;
1626
+ narray_t *na;
1627
+
1628
+ GetNArray(ary,na);
1629
+ nd = na->ndim;
1630
+
1631
+ buf = rb_str_new2(rb_class2name(CLASS_OF(ary)));
1632
+ if (NA_TYPE(na) == NARRAY_VIEW_T) {
1633
+ rb_str_cat(buf,"(view)",6);
1634
+ }
1635
+ rb_str_cat(buf,"#shape=[",8);
1636
+ if (nd>0) {
1637
+ for (i=0;;) {
1638
+ sprintf(tmp,"%"SZF"u",na->shape[i]);
1639
+ rb_str_cat2(buf,tmp);
1640
+ if (++i==nd) break;
1641
+ rb_str_cat(buf,",",1);
1642
+ }
1643
+ }
1644
+ rb_str_cat(buf,"]",1);
1645
+ return buf;
1646
+ }
1647
+
1648
+
1649
+ //----------------------------------------------------------------------
1650
+
1651
+ #define ncol cumo_na_inspect_cols
1652
+ #define nrow cumo_na_inspect_rows
1653
+ extern int ncol, nrow;
1654
+
1655
/*
   Loop body used by na_ndloop_inspect: walk the array with an odometer
   counter, render each element via the text callback, and append to the
   string buffer (lp->loop_opt) with nested brackets. Rows past
   cumo_na_inspect_rows and columns past cumo_na_inspect_cols are
   elided with "...".
 */
static void
loop_inspect(ndfunc_t *nf, na_md_loop_t *lp)
{
    int nd, i, ii;
    size_t *c;
    int col=0, row=0;
    long len;
    VALUE str;
    na_text_func_t func = (na_text_func_t)(nf->func);
    VALUE buf, opt;

    nd = lp->ndim;
    buf = lp->loop_opt;
    //opt = *(VALUE*)(lp->user.opt_ptr);
    opt = lp->user.option;

    // any zero-length dimension means an empty array
    for (i=0; i<nd; i++) {
        if (lp->n[i] == 0) {
            rb_str_cat(buf,"[]",2);
            return;
        }
    }

    rb_str_cat(buf,"\n",1);

    c = ALLOCA_N(size_t, nd+1);
    for (i=0; i<=nd; i++) c[i]=0;

    if (nd>0) {
        rb_str_cat(buf,"[",1);
    } else {
        rb_str_cat(buf,"",0);
    }

    col = nd*2;
    for (i=0;;) {
        // indentation and opening brackets for the remaining levels
        if (i<nd-1) {
            for (ii=0; ii<i; ii++) rb_str_cat(buf," ",1);
            for (; ii<nd-1; ii++) rb_str_cat(buf,"[",1);
        }
        // accumulate the element position down to the innermost level
        for (; i<nd; i++) {
            if (LITER(lp,i,0).idx) {
                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).idx[c[i]];
            } else {
                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).step*c[i];
            }
        }
        // render one element
        str = (*func)(LARG(lp,0).ptr, LITER(lp,i,0).pos, opt);

        len = RSTRING_LEN(str) + 2;
        if (ncol>0 && col+len > ncol-3) {
            // column limit reached: elide the rest of this row
            rb_str_cat(buf,"...",3);
            c[i-1] = lp->n[i-1];
        } else {
            rb_str_append(buf, str);
            col += len;
        }
        // count up (odometer), closing a bracket per exhausted dimension
        for (;;) {
            if (i==0) goto loop_end;
            i--;
            if (++c[i] < lp->n[i]) break;
            rb_str_cat(buf,"]",1);
            c[i] = 0;
        }
        //line_break:
        rb_str_cat(buf,", ",2);
        if (i<nd-1) {
            rb_str_cat(buf,"\n ",2);
            col = nd*2;
            row++;
            if (row==nrow) {
                // row limit reached: elide the rest of the array
                rb_str_cat(buf,"...",3);
                goto loop_end;
            }
        }
    }
 loop_end:
    ;
}
1734
+
1735
+
1736
+ VALUE
1737
+ na_ndloop_inspect(VALUE nary, na_text_func_t func, VALUE opt)
1738
+ {
1739
+ volatile VALUE args;
1740
+ na_md_loop_t lp;
1741
+ VALUE buf;
1742
+ ndfunc_arg_in_t ain[3] = {{Qnil,0},{sym_loop_opt},{sym_option}};
1743
+ ndfunc_t nf = { (na_iter_func_t)func, NO_LOOP, 3, 0, ain, 0 };
1744
+ //nf = ndfunc_alloc(NULL, NO_LOOP, 1, 0, Qnil);
1745
+
1746
+ buf = na_info_str(nary);
1747
+
1748
+ if (na_get_pointer(nary)==NULL) {
1749
+ return rb_str_cat(buf,"(empty)",7);
1750
+ }
1751
+
1752
+ //rb_p(args);
1753
+ //if (na_debug_flag) print_ndfunc(&nf);
1754
+
1755
+ args = rb_ary_new3(3,nary,buf,opt);
1756
+
1757
+ // cast arguments to NArray
1758
+ //ndloop_cast_args(nf, args);
1759
+
1760
+ // allocate ndloop struct
1761
+ ndloop_alloc(&lp, &nf, args, NULL, 0, loop_inspect);
1762
+
1763
+ rb_ensure(ndloop_run, (VALUE)&lp, ndloop_release, (VALUE)&lp);
1764
+
1765
+ return buf;
1766
+ }
1767
+
1768
+
1769
+ //----------------------------------------------------------------------
1770
+
1771
// Store one sub-narray +a+ into the destination starting at loop depth
// +i0+ with current counters +c+.  The source is cast to the
// destination's narray class if needed, its rank is checked against the
// remaining dimensions, then the elements are iterated with nf->func.
// LARG(lp,1).value is Qtrue while the counters lie inside a's shape and
// Qfalse once a dimension runs past the source's extent (letting the
// iterator pad the destination).
static void
loop_store_subnarray(ndfunc_t *nf, na_md_loop_t *lp, int i0, size_t *c, VALUE a)
{
    int nd = lp->ndim;
    int i, j;
    narray_t *na;
    int *dim_map;
    VALUE a_type;

    // cast source to destination's class if it differs
    a_type = CLASS_OF(LARG(lp,0).value);
    if (CLASS_OF(a) != a_type) {
        a = rb_funcall(a_type, id_cast, 1, a);
    }
    GetNArray(a,na);
    if (na->ndim != nd-i0+1) {
        rb_raise(nary_eShapeError, "mismatched dimension of sub-narray: "
                 "nd_src=%d, nd_dst=%d", na->ndim, nd-i0+1);
    }
    // map the source's dimensions onto the (possibly transposed) loop dims
    dim_map = ALLOCA_N(int, na->ndim);
    for (i=0; i<na->ndim; i++) {
        dim_map[i] = lp->trans_map[i+i0];
    }
    ndloop_set_stepidx(lp, 1, a, dim_map, NDL_READ);
    LARG(lp,1).shape = &(na->shape[na->ndim-1]);

    // loop body
    for (i=i0;;) {
        LARG(lp,1).value = Qtrue;
        for (; i<nd; i++) {
            // advance destination (j=0) and source (j=1) positions
            for (j=0; j<2; j++) {
                if (LITER(lp,i,j).idx) {
                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).idx[c[i]];
                } else {
                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).step*c[i];
                }
            }
            if (c[i] >= na->shape[i-i0]) {
                // counter is outside the source's extent on this dim
                LARG(lp,1).value = Qfalse;
            }
        }

        (*(nf->func))(&(lp->user));

        // increment counters, unwinding exhausted dimensions back to i0
        for (;;) {
            if (i<=i0) goto loop_end;
            i--; c[i]++;
            if (c[i] < lp->n[i]) break;
            c[i] = 0;
        }
    }
 loop_end:
    LARG(lp,1).ptr = NULL;   // invalidate the source pointer for the caller
}

// Iterator body for storing a (possibly nested) Ruby Array into an
// narray.  Descends the nested Array in step with the loop counters:
// an NArray found inside the Array is delegated wholesale to
// loop_store_subnarray(); a scalar is stored once and nil-padded for
// the remainder of its dimension; elements past an Array's length
// become nil.
static void
loop_store_rarray(ndfunc_t *nf, na_md_loop_t *lp)
{
    size_t *c;
    int i;
    VALUE *a;           // object currently selected at each nesting depth
    int nd = lp->ndim;

    // counter
    c = ALLOCA_N(size_t, nd+1);
    for (i=0; i<=nd; i++) c[i]=0;

    // array at each dimension; a[0] is the top-level source object
    a = ALLOCA_N(VALUE, nd+1);
    a[0] = LARG(lp,1).value;

    // loop body
    for (i=0;;) {
        for (; i<nd; i++) {
            // advance destination position along dimension i
            if (LITER(lp,i,0).idx) {
                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).idx[c[i]];
            } else {
                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).step*c[i];
            }
            if (TYPE(a[i])==T_ARRAY) {
                // descend into the nested Array; nil past its end
                if (c[i] < (size_t)RARRAY_LEN(a[i])) {
                    a[i+1] = RARRAY_AREF(a[i],c[i]);
                } else {
                    a[i+1] = Qnil;
                }
            } else if (IsNArray(a[i])) {
                // an narray nested in the Array: store it in bulk
                loop_store_subnarray(nf,lp,i,c,a[i]);
                goto loop_next;
            } else {
                // scalar: store once at index 0, nil for the rest
                if (c[i]==0) {
                    a[i+1] = a[i];
                } else {
                    a[i+1] = Qnil;
                }
            }
        }

        if (IsNArray(a[i])) {
            loop_store_subnarray(nf,lp,i,c,a[i]);
        } else {
            LARG(lp,1).value = a[i];
            (*(nf->func))(&(lp->user));
        }

    loop_next:
        // increment counters, unwinding exhausted dimensions
        for (;;) {
            if (i<=0) goto loop_end;
            i--; c[i]++;
            if (c[i] < lp->n[i]) break;
            c[i] = 0;
        }
    }
 loop_end:
    ;
}

+ VALUE
1894
+ na_ndloop_store_rarray(ndfunc_t *nf, VALUE nary, VALUE rary)
1895
+ {
1896
+ na_md_loop_t lp;
1897
+ VALUE args;
1898
+
1899
+ //rb_p(args);
1900
+ if (na_debug_flag) print_ndfunc(nf);
1901
+
1902
+ args = rb_assoc_new(nary,rary);
1903
+
1904
+ // cast arguments to NArray
1905
+ //ndloop_cast_args(nf, args);
1906
+
1907
+ // allocate ndloop struct
1908
+ ndloop_alloc(&lp, nf, args, NULL, 0, loop_store_rarray);
1909
+
1910
+ return rb_ensure(ndloop_run, (VALUE)&lp, ndloop_release, (VALUE)&lp);
1911
+ }
1912
+
1913
+
1914
+ VALUE
1915
+ na_ndloop_store_rarray2(ndfunc_t *nf, VALUE nary, VALUE rary, VALUE opt)
1916
+ {
1917
+ na_md_loop_t lp;
1918
+ VALUE args;
1919
+
1920
+ //rb_p(args);
1921
+ if (na_debug_flag) print_ndfunc(nf);
1922
+
1923
+ //args = rb_assoc_new(rary,nary);
1924
+ args = rb_ary_new3(3,nary,rary,opt);
1925
+
1926
+ // cast arguments to NArray
1927
+ //ndloop_cast_args(nf, args);
1928
+
1929
+ // allocate ndloop struct
1930
+ ndloop_alloc(&lp, nf, args, NULL, 0, loop_store_rarray);
1931
+
1932
+ return rb_ensure(ndloop_run, (VALUE)&lp, ndloop_release, (VALUE)&lp);
1933
+ }
1934
+
1935
+
1936
+ //----------------------------------------------------------------------
1937
+
1938
// Iterator body for converting an narray to a nested Ruby Array.
// Builds an Array tree mirroring the narray's shape: whenever a
// dimension's counter is at 0 a fresh Array is created and pushed onto
// its parent, and nf->func appends converted elements to the innermost
// Array exposed through LARG(lp,1).value.
static void
loop_narray_to_rarray(ndfunc_t *nf, na_md_loop_t *lp)
{
    size_t *c;
    int i;
    int nd = lp->ndim;
    VALUE *a;
    volatile VALUE a0;   // volatile: keep the Array tree GC-reachable

    // alloc counter
    c = ALLOCA_N(size_t, nd+1);
    for (i=0; i<=nd; i++) c[i]=0;

    a = ALLOCA_N(VALUE, nd+1);
    a[0] = a0 = lp->loop_opt;   // collector Array supplied by the caller

    // loop body
    for (i=0;;) {
        for (; i<nd; i++) {
            // advance element position along dimension i
            if (LITER(lp,i,0).idx) {
                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).idx[c[i]];
            } else {
                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).step*c[i];
            }
            if (c[i]==0) {
                // entering dimension i: create its row Array
                a[i+1] = rb_ary_new2(lp->n[i]);
                rb_ary_push(a[i],a[i+1]);
            }
        }

        // let the per-type iterator append the current element
        LARG(lp,1).value = a[i];
        (*(nf->func))(&(lp->user));

        // increment counters, unwinding exhausted dimensions
        for (;;) {
            if (i<=0) goto loop_end;
            i--;
            if (++c[i] < lp->n[i]) break;
            c[i] = 0;
        }
    }
 loop_end:
    ;
}

+ VALUE
1987
+ na_ndloop_cast_narray_to_rarray(ndfunc_t *nf, VALUE nary, VALUE fmt)
1988
+ {
1989
+ na_md_loop_t lp;
1990
+ VALUE args, a0;
1991
+
1992
+ //rb_p(args);
1993
+ if (na_debug_flag) print_ndfunc(nf);
1994
+
1995
+ a0 = rb_ary_new();
1996
+ args = rb_ary_new3(3,nary,a0,fmt);
1997
+
1998
+ // cast arguments to NArray
1999
+ //ndloop_cast_args(nf, args);
2000
+
2001
+ // allocate ndloop struct
2002
+ ndloop_alloc(&lp, nf, args, NULL, 0, loop_narray_to_rarray);
2003
+
2004
+ rb_ensure(ndloop_run, (VALUE)&lp, ndloop_release, (VALUE)&lp);
2005
+ return RARRAY_AREF(a0,0);
2006
+ }
2007
+
2008
+
2009
+ //----------------------------------------------------------------------
2010
+
2011
// Loop driver like the plain ndloop body, but additionally exposes the
// multi-dimensional counter array to the per-type iterator through
// lp->user.opt_ptr and adds the outer dimension count to
// lp->user.ndim, so the iterator can report element indices.
static void
loop_narray_with_index(ndfunc_t *nf, na_md_loop_t *lp)
{
    size_t *c;
    int i,j;
    int nd = lp->ndim;

    if (nd < 0) {
        rb_bug("bug? lp->ndim = %d\n", lp->ndim);
    }
    if (lp->n[0] == 0) { // empty array
        return;
    }

    // pass total ndim to iterator
    lp->user.ndim += nd;

    // alloc counter; shared with the iterator via opt_ptr
    lp->user.opt_ptr = c = ALLOCA_N(size_t, nd+1);
    for (i=0; i<=nd; i++) c[i]=0;

    // loop body
    for (i=0;;) {
        for (; i<nd; i++) {
            // advance every argument's position along dimension i
            for (j=0; j<lp->narg; j++) {
                if (LITER(lp,i,j).idx) {
                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).idx[c[i]];
                } else {
                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).step*c[i];
                }
            }
        }

        (*(nf->func))(&(lp->user));

        // increment counters, unwinding exhausted dimensions
        for (;;) {
            if (i<=0) goto loop_end;
            i--;
            if (++c[i] < lp->n[i]) break;
            c[i] = 0;
        }
    }
 loop_end:
    ;
}

// Run an ndloop over +argc+ variadic VALUE arguments using the
// with-index driver, so the iterator in +nf+ also receives the current
// element indices through lp->user.opt_ptr.  Returns the loop result.
// The K&R fallback signature supports pre-ANSI compilers without
// stdarg prototypes.
VALUE
#ifdef HAVE_STDARG_PROTOTYPES
na_ndloop_with_index(ndfunc_t *nf, int argc, ...)
#else
na_ndloop_with_index(nf, argc, va_alist)
  ndfunc_t *nf;
  int argc;
  va_dcl
#endif
{
    va_list ar;

    int i;
    VALUE *argv;
    volatile VALUE args;   // volatile: keep the argument Array GC-reachable
    na_md_loop_t lp;

    // collect the variadic arguments into a Ruby Array
    argv = ALLOCA_N(VALUE,argc);

    va_init_list(ar, argc);
    for (i=0; i<argc; i++) {
        argv[i] = va_arg(ar, VALUE);
    }
    va_end(ar);

    args = rb_ary_new4(argc, argv);

    if (na_debug_flag) print_ndfunc(nf);

    // allocate ndloop struct
    ndloop_alloc(&lp, nf, args, 0, 0, loop_narray_with_index);

    return rb_ensure(ndloop_run, (VALUE)&lp, ndloop_release, (VALUE)&lp);
}

+ void
2101
+ Init_cumo_nary_ndloop()
2102
+ {
2103
+ id_cast = rb_intern("cast");
2104
+ id_extract = rb_intern("extract");
2105
+ }