cumo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +27 -0
  3. data/.travis.yml +5 -0
  4. data/3rd_party/mkmf-cu/.gitignore +36 -0
  5. data/3rd_party/mkmf-cu/Gemfile +3 -0
  6. data/3rd_party/mkmf-cu/LICENSE +21 -0
  7. data/3rd_party/mkmf-cu/README.md +36 -0
  8. data/3rd_party/mkmf-cu/Rakefile +11 -0
  9. data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
  10. data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
  11. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
  12. data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
  13. data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
  14. data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
  15. data/CODE_OF_CONDUCT.md +46 -0
  16. data/Gemfile +8 -0
  17. data/LICENSE.txt +82 -0
  18. data/README.md +252 -0
  19. data/Rakefile +43 -0
  20. data/bench/broadcast_fp32.rb +138 -0
  21. data/bench/cumo_bench.rb +193 -0
  22. data/bench/numo_bench.rb +138 -0
  23. data/bench/reduction_fp32.rb +117 -0
  24. data/bin/console +14 -0
  25. data/bin/setup +8 -0
  26. data/cumo.gemspec +32 -0
  27. data/ext/cumo/cuda/cublas.c +278 -0
  28. data/ext/cumo/cuda/driver.c +421 -0
  29. data/ext/cumo/cuda/memory_pool.cpp +185 -0
  30. data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
  31. data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
  32. data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
  33. data/ext/cumo/cuda/nvrtc.c +207 -0
  34. data/ext/cumo/cuda/runtime.c +167 -0
  35. data/ext/cumo/cumo.c +148 -0
  36. data/ext/cumo/depend.erb +58 -0
  37. data/ext/cumo/extconf.rb +179 -0
  38. data/ext/cumo/include/cumo.h +25 -0
  39. data/ext/cumo/include/cumo/compat.h +23 -0
  40. data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
  41. data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
  42. data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
  43. data/ext/cumo/include/cumo/cuda/driver.h +22 -0
  44. data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
  45. data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
  46. data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
  47. data/ext/cumo/include/cumo/indexer.h +238 -0
  48. data/ext/cumo/include/cumo/intern.h +142 -0
  49. data/ext/cumo/include/cumo/intern_fwd.h +38 -0
  50. data/ext/cumo/include/cumo/intern_kernel.h +6 -0
  51. data/ext/cumo/include/cumo/narray.h +429 -0
  52. data/ext/cumo/include/cumo/narray_kernel.h +149 -0
  53. data/ext/cumo/include/cumo/ndloop.h +95 -0
  54. data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
  55. data/ext/cumo/include/cumo/template.h +158 -0
  56. data/ext/cumo/include/cumo/template_kernel.h +77 -0
  57. data/ext/cumo/include/cumo/types/bit.h +40 -0
  58. data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
  59. data/ext/cumo/include/cumo/types/complex.h +402 -0
  60. data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
  61. data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
  62. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
  63. data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
  64. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
  65. data/ext/cumo/include/cumo/types/dfloat.h +47 -0
  66. data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
  67. data/ext/cumo/include/cumo/types/float_def.h +34 -0
  68. data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
  69. data/ext/cumo/include/cumo/types/float_macro.h +191 -0
  70. data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
  71. data/ext/cumo/include/cumo/types/int16.h +24 -0
  72. data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
  73. data/ext/cumo/include/cumo/types/int32.h +24 -0
  74. data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
  75. data/ext/cumo/include/cumo/types/int64.h +24 -0
  76. data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
  77. data/ext/cumo/include/cumo/types/int8.h +24 -0
  78. data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
  79. data/ext/cumo/include/cumo/types/int_macro.h +67 -0
  80. data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
  81. data/ext/cumo/include/cumo/types/real_accum.h +486 -0
  82. data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
  83. data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
  84. data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
  85. data/ext/cumo/include/cumo/types/robject.h +27 -0
  86. data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
  87. data/ext/cumo/include/cumo/types/scomplex.h +46 -0
  88. data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
  89. data/ext/cumo/include/cumo/types/sfloat.h +48 -0
  90. data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
  91. data/ext/cumo/include/cumo/types/uint16.h +25 -0
  92. data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
  93. data/ext/cumo/include/cumo/types/uint32.h +25 -0
  94. data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
  95. data/ext/cumo/include/cumo/types/uint64.h +25 -0
  96. data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
  97. data/ext/cumo/include/cumo/types/uint8.h +25 -0
  98. data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
  99. data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
  100. data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
  101. data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
  102. data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
  103. data/ext/cumo/narray/SFMT-params.h +97 -0
  104. data/ext/cumo/narray/SFMT-params19937.h +46 -0
  105. data/ext/cumo/narray/SFMT.c +620 -0
  106. data/ext/cumo/narray/SFMT.h +167 -0
  107. data/ext/cumo/narray/array.c +638 -0
  108. data/ext/cumo/narray/data.c +961 -0
  109. data/ext/cumo/narray/gen/cogen.rb +56 -0
  110. data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
  111. data/ext/cumo/narray/gen/def/bit.rb +37 -0
  112. data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
  113. data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
  114. data/ext/cumo/narray/gen/def/int16.rb +36 -0
  115. data/ext/cumo/narray/gen/def/int32.rb +36 -0
  116. data/ext/cumo/narray/gen/def/int64.rb +36 -0
  117. data/ext/cumo/narray/gen/def/int8.rb +36 -0
  118. data/ext/cumo/narray/gen/def/robject.rb +37 -0
  119. data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
  120. data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
  121. data/ext/cumo/narray/gen/def/uint16.rb +36 -0
  122. data/ext/cumo/narray/gen/def/uint32.rb +36 -0
  123. data/ext/cumo/narray/gen/def/uint64.rb +36 -0
  124. data/ext/cumo/narray/gen/def/uint8.rb +36 -0
  125. data/ext/cumo/narray/gen/erbpp2.rb +346 -0
  126. data/ext/cumo/narray/gen/narray_def.rb +268 -0
  127. data/ext/cumo/narray/gen/spec.rb +425 -0
  128. data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
  129. data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
  130. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
  131. data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
  132. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
  133. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
  134. data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
  135. data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
  136. data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
  137. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
  138. data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
  139. data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
  140. data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
  141. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
  142. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
  143. data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
  144. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
  145. data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
  146. data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
  147. data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
  148. data/ext/cumo/narray/gen/tmpl/class.c +9 -0
  149. data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
  150. data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
  151. data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
  152. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
  153. data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
  154. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
  155. data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
  156. data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
  157. data/ext/cumo/narray/gen/tmpl/each.c +47 -0
  158. data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
  159. data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
  160. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
  161. data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
  162. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
  163. data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
  164. data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
  165. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
  166. data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
  167. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
  168. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
  169. data/ext/cumo/narray/gen/tmpl/format.c +62 -0
  170. data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
  171. data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
  172. data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
  173. data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
  174. data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
  175. data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
  176. data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
  177. data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
  178. data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
  179. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
  180. data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
  181. data/ext/cumo/narray/gen/tmpl/median.c +66 -0
  182. data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
  183. data/ext/cumo/narray/gen/tmpl/module.c +9 -0
  184. data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
  185. data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
  186. data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
  187. data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
  188. data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
  189. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
  190. data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
  191. data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
  192. data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
  193. data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
  194. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
  195. data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
  196. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
  197. data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
  198. data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
  199. data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
  200. data/ext/cumo/narray/gen/tmpl/store.c +41 -0
  201. data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
  202. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
  203. data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
  204. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
  205. data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
  206. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
  207. data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
  208. data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
  209. data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
  210. data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
  211. data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
  212. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
  213. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
  214. data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
  215. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
  216. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
  217. data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
  218. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
  219. data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
  220. data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
  221. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
  222. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
  223. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
  224. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
  225. data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
  226. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
  227. data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
  228. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
  229. data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
  230. data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
  231. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
  232. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
  233. data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
  234. data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
  235. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
  236. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
  237. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
  238. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
  239. data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
  240. data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
  241. data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
  242. data/ext/cumo/narray/index.c +880 -0
  243. data/ext/cumo/narray/kwargs.c +153 -0
  244. data/ext/cumo/narray/math.c +142 -0
  245. data/ext/cumo/narray/narray.c +1948 -0
  246. data/ext/cumo/narray/ndloop.c +2105 -0
  247. data/ext/cumo/narray/rand.c +45 -0
  248. data/ext/cumo/narray/step.c +474 -0
  249. data/ext/cumo/narray/struct.c +886 -0
  250. data/lib/cumo.rb +3 -0
  251. data/lib/cumo/cuda.rb +11 -0
  252. data/lib/cumo/cuda/compile_error.rb +36 -0
  253. data/lib/cumo/cuda/compiler.rb +161 -0
  254. data/lib/cumo/cuda/device.rb +47 -0
  255. data/lib/cumo/cuda/link_state.rb +31 -0
  256. data/lib/cumo/cuda/module.rb +40 -0
  257. data/lib/cumo/cuda/nvrtc_program.rb +27 -0
  258. data/lib/cumo/linalg.rb +12 -0
  259. data/lib/cumo/narray.rb +2 -0
  260. data/lib/cumo/narray/extra.rb +1278 -0
  261. data/lib/erbpp.rb +294 -0
  262. data/lib/erbpp/line_number.rb +137 -0
  263. data/lib/erbpp/narray_def.rb +381 -0
  264. data/numo-narray-version +1 -0
  265. data/run.gdb +7 -0
  266. metadata +353 -0
@@ -0,0 +1,2105 @@
1
+ #include <ruby.h>
2
+ #include "cumo/narray.h"
3
+ #include "cumo/cuda/memory_pool.h"
4
+ #include "cumo/cuda/runtime.h"
5
+
6
+ #if 0
7
+ #define DBG(x) x
8
+ #else
9
+ #define DBG(x)
10
+ #endif
11
+
12
+ #ifdef HAVE_STDARG_PROTOTYPES
13
+ #include <stdarg.h>
14
+ #define va_init_list(a,b) va_start(a,b)
15
+ #else
16
+ #include <varargs.h>
17
+ #define va_init_list(a,b) va_start(a)
18
+ #endif
19
+
20
+ typedef struct NA_BUFFER_COPY {
21
+ int ndim;
22
+ size_t elmsz;
23
+ size_t *n;
24
+ char *src_ptr;
25
+ char *buf_ptr;
26
+ na_loop_iter_t *src_iter;
27
+ na_loop_iter_t *buf_iter;
28
+ } na_buffer_copy_t;
29
+
30
+ typedef struct NA_LOOP_XARGS {
31
+ na_loop_iter_t *iter; // moved from na_loop_t
32
+ na_buffer_copy_t *bufcp; // copy data to buffer
33
+ int flag; // NDL_READ NDL_WRITE
34
+ bool free_user_iter; // alloc LARG(lp,j).iter=lp->xargs[j].iter
35
+ } na_loop_xargs_t;
36
+
37
+ typedef struct NA_MD_LOOP {
38
+ int narg;
39
+ int nin;
40
+ int ndim; // n of total dimensions looped at loop_narray. NOTE: lp->ndim + lp->user.ndim is the total dimension.
41
+ unsigned int copy_flag; // set i-th bit if i-th arg is cast
42
+ void *ptr; // memory for n
43
+ na_loop_iter_t *iter_ptr; // memory for iter
44
+ size_t *n; // n of elements for each dim (shape)
45
+ na_loop_t user; // loop in user function
46
+ na_loop_xargs_t *xargs; // extra data for each arg
47
+ int writeback; // write back result to i-th arg
48
+ int init_aidx; // index of initializer argument
49
+ int reduce_dim; // number of dimensions to reduce in reduction kernel, e.g., for an array of shape: [2,3,4],
50
+ // 3 for sum(), 1 for sum(axis: 1), 2 for sum(axis: [1,2])
51
+ int *trans_map;
52
+ VALUE vargs;
53
+ VALUE reduce; // dimension indices to reduce in reduction kernel (in bits), e.g., for an array of shape:
54
+ // [2,3,4], 111b for sum(), 010b for sum(axis: 1), 110b for sum(axis: [1,2])
55
+ VALUE loop_opt;
56
+ ndfunc_t *ndfunc;
57
+ void (*loop_func)();
58
+ } na_md_loop_t;
59
+
60
+ #define LARG(lp,iarg) ((lp)->user.args[iarg])
61
+ #define LITER(lp,idim,iarg) ((lp)->xargs[iarg].iter[idim])
62
+ #define LITER_SRC(lp,idim) ((lp)->src_iter[idim])
63
+ #define LBUFCP(lp,j) ((lp)->xargs[j].bufcp)
64
+
65
+ #define CASTABLE(t) (RTEST(t) && (t)!=OVERWRITE)
66
+
67
+ #define NDL_READ 1
68
+ #define NDL_WRITE 2
69
+ #define NDL_READ_WRITE (NDL_READ|NDL_WRITE)
70
+
71
+ static ID id_cast;
72
+ static ID id_extract;
73
+
74
+ static inline VALUE
75
+ nary_type_s_cast(VALUE type, VALUE obj)
76
+ {
77
+ return rb_funcall(type,id_cast,1,obj);
78
+ }
79
+
80
+ static void
81
+ print_ndfunc(ndfunc_t *nf) {
82
+ volatile VALUE t;
83
+ int i, k;
84
+ printf("ndfunc_t = 0x%"SZF"x {\n",(size_t)nf);
85
+ printf(" func = 0x%"SZF"x\n", (size_t)nf->func);
86
+ printf(" flag = 0x%"SZF"x\n", (size_t)nf->flag);
87
+ printf(" nin = %d\n", nf->nin);
88
+ printf(" nout = %d\n", nf->nout);
89
+ printf(" ain = 0x%"SZF"x\n", (size_t)nf->ain);
90
+ for (i=0; i<nf->nin; i++) {
91
+ t = rb_inspect(nf->ain[i].type);
92
+ printf(" ain[%d].type = %s\n", i, StringValuePtr(t));
93
+ printf(" ain[%d].dim = %d\n", i, nf->ain[i].dim);
94
+ }
95
+ printf(" aout = 0x%"SZF"x\n", (size_t)nf->aout);
96
+ for (i=0; i<nf->nout; i++) {
97
+ t = rb_inspect(nf->aout[i].type);
98
+ printf(" aout[%d].type = %s\n", i, StringValuePtr(t));
99
+ printf(" aout[%d].dim = %d\n", i, nf->aout[i].dim);
100
+ for (k=0; k<nf->aout[i].dim; k++) {
101
+ printf(" aout[%d].shape[%d] = %"SZF"u\n", i, k, nf->aout[i].shape[k]);
102
+ }
103
+ }
104
+ printf("}\n");
105
+ }
106
+
107
+
108
+ static void
109
+ print_ndloop(na_md_loop_t *lp) {
110
+ int i,j,nd;
111
+ printf("na_md_loop_t = 0x%"SZF"x {\n",(size_t)lp);
112
+ printf(" narg = %d\n", lp->narg);
113
+ printf(" nin = %d\n", lp->nin);
114
+ printf(" ndim = %d\n", lp->ndim);
115
+ printf(" copy_flag = %x\n", lp->copy_flag);
116
+ printf(" writeback = %d\n", lp->writeback);
117
+ printf(" init_aidx = %d\n", lp->init_aidx);
118
+ printf(" reduce_dim = %d\n", lp->reduce_dim);
119
+ printf(" trans_map = 0x%"SZF"x\n", (size_t)lp->trans_map);
120
+ nd = lp->ndim + lp->user.ndim;
121
+ for (i=0; i<nd; i++) {
122
+ printf(" trans_map[%d] = %d\n", i, lp->trans_map[i]);
123
+ }
124
+ printf(" n = 0x%"SZF"x\n", (size_t)lp->n);
125
+ nd = lp->ndim + lp->user.ndim;
126
+ for (i=0; i<=lp->ndim; i++) {
127
+ printf(" n[%d] = %"SZF"u\n", i, lp->n[i]);
128
+ }
129
+ printf(" user.n = 0x%"SZF"x\n", (size_t)lp->user.n);
130
+ if (lp->user.n) {
131
+ for (i=0; i<=lp->user.ndim; i++) {
132
+ printf(" user.n[%d] = %"SZF"u\n", i, lp->user.n[i]);
133
+ }
134
+ }
135
+ printf(" xargs = 0x%"SZF"x\n", (size_t)lp->xargs);
136
+ printf(" iter_ptr = 0x%"SZF"x\n", (size_t)lp->iter_ptr);
137
+ printf(" user.narg = %d\n", lp->user.narg);
138
+ printf(" user.ndim = %d\n", lp->user.ndim);
139
+ printf(" user.args = 0x%"SZF"x\n", (size_t)lp->user.args);
140
+ for (j=0; j<lp->narg; j++) {
141
+ }
142
+ printf(" user.opt_ptr = 0x%"SZF"x\n", (size_t)lp->user.opt_ptr);
143
+ if (lp->reduce==Qnil) {
144
+ printf(" reduce = nil\n");
145
+ } else {
146
+ printf(" reduce = 0x%x\n", NUM2INT(lp->reduce));
147
+ }
148
+ for (j=0; j<lp->narg; j++) {
149
+ printf("--user.args[%d]--\n", j);
150
+ printf(" user.args[%d].ptr = 0x%"SZF"x\n", j, (size_t)LARG(lp,j).ptr);
151
+ printf(" user.args[%d].elmsz = %"SZF"d\n", j, LARG(lp,j).elmsz);
152
+ printf(" user.args[%d].value = 0x%"PRI_VALUE_PREFIX"x\n", j, LARG(lp,j).value);
153
+ printf(" user.args[%d].ndim = %d\n", j, LARG(lp,j).ndim);
154
+ printf(" user.args[%d].shape = 0x%"SZF"x\n", j, (size_t)LARG(lp,j).shape);
155
+ if (LARG(lp,j).shape) {
156
+ for (i=0; i<LARG(lp,j).ndim; i++) {
157
+ printf(" user.args[%d].shape[%d] = %"SZF"d\n", j, i, LARG(lp,j).shape[i]);
158
+ }
159
+ }
160
+ printf(" user.args[%d].iter = 0x%"SZF"x\n", j,(size_t)lp->user.args[j].iter);
161
+ if (lp->user.args[j].iter) {
162
+ for (i=0; i<lp->user.ndim; i++) {
163
+ printf(" &user.args[%d].iter[%d] = 0x%"SZF"x\n", j,i, (size_t)&lp->user.args[j].iter[i]);
164
+ printf(" user.args[%d].iter[%d].pos = %"SZF"u\n", j,i, lp->user.args[j].iter[i].pos);
165
+ printf(" user.args[%d].iter[%d].step = %"SZF"u\n", j,i, lp->user.args[j].iter[i].step);
166
+ printf(" user.args[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)lp->user.args[j].iter[i].idx);
167
+ }
168
+ }
169
+ //
170
+ printf(" xargs[%d].flag = %d\n", j, lp->xargs[j].flag);
171
+ printf(" xargs[%d].free_user_iter = %d\n", j, lp->xargs[j].free_user_iter);
172
+ for (i=0; i<=nd; i++) {
173
+ printf(" &xargs[%d].iter[%d] = 0x%"SZF"x\n", j,i, (size_t)&LITER(lp,i,j));
174
+ printf(" xargs[%d].iter[%d].pos = %"SZF"u\n", j,i, LITER(lp,i,j).pos);
175
+ printf(" xargs[%d].iter[%d].step = %"SZF"u\n", j,i, LITER(lp,i,j).step);
176
+ printf(" xargs[%d].iter[%d].idx = 0x%"SZF"x\n", j,i, (size_t)LITER(lp,i,j).idx);
177
+ }
178
+ printf(" xargs[%d].bufcp = 0x%"SZF"x\n", j, (size_t)lp->xargs[j].bufcp);
179
+ if (lp->xargs[j].bufcp) {
180
+ printf(" xargs[%d].bufcp->ndim = %d\n", j, lp->xargs[j].bufcp->ndim);
181
+ printf(" xargs[%d].bufcp->elmsz = %"SZF"d\n", j, lp->xargs[j].bufcp->elmsz);
182
+ printf(" xargs[%d].bufcp->n = 0x%"SZF"x\n", j, (size_t)lp->xargs[j].bufcp->n);
183
+ printf(" xargs[%d].bufcp->src_ptr = 0x%"SZF"x\n", j, (size_t)lp->xargs[j].bufcp->src_ptr);
184
+ printf(" xargs[%d].bufcp->buf_ptr = 0x%"SZF"x\n", j, (size_t)lp->xargs[j].bufcp->buf_ptr);
185
+ printf(" xargs[%d].bufcp->src_iter = 0x%"SZF"x\n", j, (size_t)lp->xargs[j].bufcp->src_iter);
186
+ printf(" xargs[%d].bufcp->buf_iter = 0x%"SZF"x\n", j, (size_t)lp->xargs[j].bufcp->buf_iter);
187
+ }
188
+ }
189
+ printf("}\n");
190
+ }
191
+
192
+
193
+ // returns 0x01 if NDF_HAS_LOOP, but not supporting NDF_STRIDE_LOOP
194
+ // returns 0x02 if NDF_HAS_LOOP, but not supporting NDF_INDEX_LOOP
195
+ static unsigned int
196
+ ndloop_func_loop_spec(ndfunc_t *nf, int user_ndim)
197
+ {
198
+ unsigned int f=0;
199
+ // If user function supports LOOP
200
+ if (user_ndim > 0 || NDF_TEST(nf,NDF_HAS_LOOP)) {
201
+ if (!NDF_TEST(nf,NDF_STRIDE_LOOP)) {
202
+ f |= 1;
203
+ }
204
+ if (!NDF_TEST(nf,NDF_INDEX_LOOP)) {
205
+ f |= 2;
206
+ }
207
+ }
208
+ return f;
209
+ }
210
+
211
+
212
+
213
+
214
+ static int
215
+ ndloop_cast_required(VALUE type, VALUE value)
216
+ {
217
+ return CASTABLE(type) && type != CLASS_OF(value);
218
+ }
219
+
220
+ static int
221
+ ndloop_castable_type(VALUE type)
222
+ {
223
+ return rb_obj_is_kind_of(type, rb_cClass) && RTEST(rb_class_inherited_p(type, cNArray));
224
+ }
225
+
226
+ static void
227
+ ndloop_cast_error(VALUE type, VALUE value)
228
+ {
229
+ VALUE x = rb_inspect(type);
230
+ char* s = StringValueCStr(x);
231
+ rb_bug("fail cast from %s to %s", rb_obj_classname(value),s);
232
+ rb_raise(rb_eTypeError,"fail cast from %s to %s",
233
+ rb_obj_classname(value), s);
234
+ }
235
+
236
+ // convert input arguments given by RARRAY_PTR(args)[j]
237
+ // to type specified by nf->args[j].type
238
+ // returns copy_flag where nth-bit is set if nth argument is converted.
239
+ static unsigned int
240
+ ndloop_cast_args(ndfunc_t *nf, VALUE args)
241
+ {
242
+ int j;
243
+ unsigned int copy_flag=0;
244
+ VALUE type, value;
245
+
246
+ for (j=0; j<nf->nin; j++) {
247
+
248
+ type = nf->ain[j].type;
249
+ if (TYPE(type)==T_SYMBOL)
250
+ continue;
251
+ value = RARRAY_AREF(args,j);
252
+ if (!ndloop_cast_required(type, value))
253
+ continue;
254
+
255
+ if (ndloop_castable_type(type)) {
256
+ RARRAY_ASET(args,j,nary_type_s_cast(type, value));
257
+ copy_flag |= 1<<j;
258
+ } else {
259
+ ndloop_cast_error(type, value);
260
+ }
261
+ }
262
+
263
+ RB_GC_GUARD(type); RB_GC_GUARD(value);
264
+ return copy_flag;
265
+ }
266
+
267
+
268
+ static void
269
+ ndloop_handle_symbol_in_ain(VALUE type, VALUE value, int at, na_md_loop_t *lp)
270
+ {
271
+ if (type==sym_reduce) {
272
+ lp->reduce = value;
273
+ }
274
+ else if (type==sym_option) {
275
+ lp->user.option = value;
276
+ }
277
+ else if (type==sym_loop_opt) {
278
+ lp->loop_opt = value;
279
+ }
280
+ else if (type==sym_init) {
281
+ lp->init_aidx = at;
282
+ }
283
+ else {
284
+ rb_bug("ndloop parse_options: unknown type");
285
+ }
286
+ }
287
+
288
+ static inline int
289
+ max2(int x, int y)
290
+ {
291
+ return x > y ? x : y;
292
+ }
293
+
294
+ static void
295
+ ndloop_find_max_dimension(na_md_loop_t *lp, ndfunc_t *nf, VALUE args)
296
+ {
297
+ int j;
298
+ int nin=0; // number of input objects (except for symbols)
299
+ int user_nd=0; // max dimension of user function
300
+ int loop_nd=0; // max dimension of md-loop
301
+
302
+ for (j=0; j<RARRAY_LEN(args); j++) {
303
+ VALUE t = nf->ain[j].type;
304
+ VALUE v = RARRAY_AREF(args,j);
305
+ if (TYPE(t)==T_SYMBOL) {
306
+ ndloop_handle_symbol_in_ain(t, v, j, lp);
307
+ } else {
308
+ nin++;
309
+ user_nd = max2(user_nd, nf->ain[j].dim);
310
+ if (IsNArray(v))
311
+ loop_nd = max2(loop_nd, RNARRAY_NDIM(v) - nf->ain[j].dim);
312
+ }
313
+ }
314
+
315
+ lp->narg = lp->user.narg = nin + nf->nout;
316
+ lp->nin = nin;
317
+ lp->ndim = loop_nd;
318
+ lp->user.ndim = user_nd;
319
+ }
320
+
321
+ /*
322
+ user-dimension:
323
+ user_nd = MAX( nf->args[j].dim )
324
+
325
+ user-support dimension:
326
+
327
+ loop dimension:
328
+ loop_nd
329
+ */
330
+
331
+ static void
332
+ ndloop_alloc(na_md_loop_t *lp, ndfunc_t *nf, VALUE args,
333
+ void *opt_ptr, unsigned int copy_flag,
334
+ void (*loop_func)(ndfunc_t*, na_md_loop_t*))
335
+ {
336
+ int i,j;
337
+ int narg;
338
+ int max_nd;
339
+
340
+ char *buf;
341
+ size_t n1, n2, n3, n4, n5;
342
+
343
+ long args_len;
344
+
345
+ na_loop_iter_t *iter;
346
+
347
+ int trans_dim;
348
+ unsigned int f;
349
+
350
+ args_len = RARRAY_LEN(args);
351
+
352
+ if (args_len != nf->nin) {
353
+ rb_bug("wrong number of arguments for ndfunc (%lu for %d)",
354
+ args_len, nf->nin);
355
+ }
356
+
357
+ lp->vargs = args;
358
+ lp->ndfunc = nf;
359
+ lp->loop_func = loop_func;
360
+ lp->copy_flag = copy_flag;
361
+
362
+ lp->reduce = Qnil;
363
+ lp->user.option = Qnil;
364
+ lp->user.opt_ptr = opt_ptr;
365
+ lp->user.err_type = Qfalse;
366
+ lp->loop_opt = Qnil;
367
+ lp->writeback = -1;
368
+ lp->init_aidx = -1;
369
+
370
+ lp->ptr = NULL;
371
+ lp->user.n = NULL;
372
+
373
+ ndloop_find_max_dimension(lp, nf, args);
374
+ narg = lp->nin + nf->nout;
375
+ max_nd = lp->ndim + lp->user.ndim;
376
+
377
+ n1 = sizeof(size_t)*(max_nd+1);
378
+ n2 = sizeof(na_loop_xargs_t)*narg;
379
+ n2 = ((n2-1)/8+1)*8;
380
+ n3 = sizeof(na_loop_args_t)*narg;
381
+ n3 = ((n3-1)/8+1)*8;
382
+ n4 = sizeof(na_loop_iter_t)*narg*(max_nd+1);
383
+ n4 = ((n4-1)/8+1)*8;
384
+ n5 = sizeof(int)*(max_nd+1);
385
+
386
+ lp->ptr = buf = (char*)xmalloc(n1+n2+n3+n4+n5);
387
+ lp->n = (size_t*)buf; buf+=n1;
388
+ lp->xargs = (na_loop_xargs_t*)buf; buf+=n2;
389
+ lp->user.args = (na_loop_args_t*)buf; buf+=n3;
390
+ lp->iter_ptr = iter = (na_loop_iter_t*)buf; buf+=n4;
391
+ lp->trans_map = (int*)buf;
392
+
393
+ for (j=0; j<narg; j++) {
394
+ LARG(lp,j).value = Qnil;
395
+ LARG(lp,j).iter = NULL;
396
+ LARG(lp,j).shape = NULL;
397
+ LARG(lp,j).ndim = 0;
398
+ lp->xargs[j].iter = &(iter[(max_nd+1)*j]);
399
+ lp->xargs[j].bufcp = NULL;
400
+ lp->xargs[j].flag = (j<lp->nin) ? NDL_READ : NDL_WRITE;
401
+ lp->xargs[j].free_user_iter = 0;
402
+ }
403
+
404
+ for (i=0; i<=max_nd; i++) {
405
+ lp->n[i] = 1;
406
+ for (j=0; j<narg; j++) {
407
+ LITER(lp,i,j).pos = 0;
408
+ LITER(lp,i,j).step = 0;
409
+ LITER(lp,i,j).idx = NULL;
410
+ }
411
+ }
412
+
413
+ // transpose reduce-dimensions to last dimensions
414
+ // array loop
415
+ // [*,+,*,+,*] => [*,*,*,+,+]
416
+ // trans_map=[0,3,1,4,2] <= [0,1,2,3,4]
417
+ if (NDF_TEST(nf,NDF_FLAT_REDUCE) && RTEST(lp->reduce)) {
418
+ trans_dim = 0;
419
+ for (i=0; i<max_nd; i++) {
420
+ if (na_test_reduce(lp->reduce, i)) {
421
+ lp->trans_map[i] = -1;
422
+ } else {
423
+ lp->trans_map[i] = trans_dim++;
424
+ }
425
+ }
426
+ j = trans_dim;
427
+ for (i=0; i<max_nd; i++) {
428
+ if (lp->trans_map[i] == -1) {
429
+ lp->trans_map[i] = j++;
430
+ }
431
+ }
432
+ lp->reduce_dim = max_nd - trans_dim;
433
+ f = 0;
434
+ for (i=trans_dim; i<max_nd; i++) {
435
+ f |= 1<<i;
436
+ }
437
+ lp->reduce = INT2FIX(f);
438
+ } else {
439
+ for (i=0; i<max_nd; i++) {
440
+ lp->trans_map[i] = i;
441
+ }
442
+ lp->reduce_dim = 0;
443
+ }
444
+ }
445
+
446
+
447
+ static VALUE
448
+ ndloop_release(VALUE vlp)
449
+ {
450
+ int j;
451
+ VALUE v;
452
+ na_md_loop_t *lp = (na_md_loop_t*)(vlp);
453
+
454
+ for (j=0; j < lp->narg; j++) {
455
+ v = LARG(lp,j).value;
456
+ if (IsNArray(v)) {
457
+ na_release_lock(v);
458
+ }
459
+ }
460
+ for (j=0; j<lp->narg; j++) {
461
+ //printf("lp->xargs[%d].bufcp=%lx\n",j,(size_t)(lp->xargs[j].bufcp));
462
+ if (lp->xargs[j].bufcp) {
463
+ xfree(lp->xargs[j].bufcp->buf_iter);
464
+ if (cumo_cuda_runtime_is_device_memory(lp->xargs[j].bufcp->buf_ptr)) {
465
+ cumo_cuda_runtime_free(lp->xargs[j].bufcp->buf_ptr);
466
+ }
467
+ else {
468
+ xfree(lp->xargs[j].bufcp->buf_ptr);
469
+ }
470
+ xfree(lp->xargs[j].bufcp->n);
471
+ xfree(lp->xargs[j].bufcp);
472
+ if (lp->xargs[j].free_user_iter) {
473
+ xfree(LARG(lp,j).iter);
474
+ }
475
+ }
476
+ }
477
+ xfree(lp->ptr);
478
+ return Qnil;
479
+ }
480
+
481
+
482
+ /*
483
+ set lp->n[i] (shape of n-d iteration) here
484
+ */
485
+ static void
486
+ ndloop_check_shape(na_md_loop_t *lp, int nf_dim, narray_t *na)
487
+ {
488
+ int i, k;
489
+ size_t n;
490
+ int dim_beg;
491
+
492
+ dim_beg = lp->ndim + nf_dim - na->ndim;
493
+
494
+ for (k = na->ndim - nf_dim - 1; k>=0; k--) {
495
+ i = lp->trans_map[k + dim_beg];
496
+ n = na->shape[k];
497
+ // if n==1 then repeat this dimension
498
+ if (n != 1) {
499
+ if (lp->n[i] == 1) {
500
+ lp->n[i] = n;
501
+ } else if (lp->n[i] != n) {
502
+ // inconsistent array shape
503
+ rb_raise(nary_eShapeError,"shape1[%d](=%"SZF"u) != shape2[%d](=%"SZF"u)",
504
+ i, lp->n[i], k, n);
505
+ }
506
+ }
507
+ }
508
+ }
509
+
510
+
511
+ /*
512
+ na->shape[i] == lp->n[ dim_map[i] ]
513
+ */
514
+ static void
515
+ ndloop_set_stepidx(na_md_loop_t *lp, int j, VALUE vna, int *dim_map, int rwflag)
516
+ {
517
+ size_t n, s;
518
+ int i, k, nd;
519
+ stridx_t sdx;
520
+ narray_t *na;
521
+
522
+ LARG(lp,j).value = vna;
523
+ LARG(lp,j).elmsz = nary_element_stride(vna);
524
+ if (rwflag == NDL_READ) {
525
+ LARG(lp,j).ptr = na_get_pointer_for_read(vna);
526
+ } else
527
+ if (rwflag == NDL_WRITE) {
528
+ LARG(lp,j).ptr = na_get_pointer_for_write(vna);
529
+ } else
530
+ if (rwflag == NDL_READ_WRITE) {
531
+ LARG(lp,j).ptr = na_get_pointer_for_read_write(vna);
532
+ } else {
533
+ rb_bug("invalid value for read-write flag");
534
+ }
535
+ GetNArray(vna,na);
536
+ nd = LARG(lp,j).ndim;
537
+
538
+ switch(NA_TYPE(na)) {
539
+ case NARRAY_DATA_T:
540
+ if (NA_DATA_PTR(na)==NULL && NA_SIZE(na)>0) {
541
+ rb_bug("cannot read no-data NArray");
542
+ rb_raise(rb_eRuntimeError,"cannot read no-data NArray");
543
+ }
544
+ // through
545
+ case NARRAY_FILEMAP_T:
546
+ s = LARG(lp,j).elmsz;
547
+ for (k=na->ndim; k--;) {
548
+ n = na->shape[k];
549
+ if (n > 1 || nd > 0) {
550
+ i = dim_map[k];
551
+ //printf("n=%d k=%d i=%d\n",n,k,i);
552
+ LITER(lp,i,j).step = s;
553
+ //LITER(lp,i,j).idx = NULL;
554
+ }
555
+ s *= n;
556
+ nd--;
557
+ }
558
+ LITER(lp,0,j).pos = 0;
559
+ break;
560
+ case NARRAY_VIEW_T:
561
+ LITER(lp,0,j).pos = NA_VIEW_OFFSET(na);
562
+ for (k=0; k<na->ndim; k++) {
563
+ n = na->shape[k];
564
+ sdx = NA_VIEW_STRIDX(na)[k];
565
+ if (n > 1 || nd > 0) {
566
+ i = dim_map[k];
567
+ if (SDX_IS_INDEX(sdx)) {
568
+ LITER(lp,i,j).step = 0;
569
+ LITER(lp,i,j).idx = SDX_GET_INDEX(sdx);
570
+ } else {
571
+ LITER(lp,i,j).step = SDX_GET_STRIDE(sdx);
572
+ //LITER(lp,i,j).idx = NULL;
573
+ }
574
+ } else if (n==1) {
575
+ if (SDX_IS_INDEX(sdx)) {
576
+ SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("ndloop_set_stepidx", "any");
577
+ cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
578
+ LITER(lp,0,j).pos += SDX_GET_INDEX(sdx)[0];
579
+ }
580
+ }
581
+ nd--;
582
+ }
583
+ break;
584
+ default:
585
+ rb_bug("invalid narray internal type");
586
+ }
587
+ }
588
+
589
+
590
+
591
+ static void
592
+ ndloop_init_args(ndfunc_t *nf, na_md_loop_t *lp, VALUE args)
593
+ {
594
+ int i, j;
595
+ VALUE v;
596
+ narray_t *na;
597
+ int nf_dim;
598
+ int dim_beg;
599
+ int *dim_map;
600
+ int max_nd = lp->ndim + lp->user.ndim;
601
+ int flag;
602
+ size_t s;
603
+
604
+ /*
605
+ na->shape[i] == lp->n[ dim_map[i] ]
606
+ */
607
+ dim_map = ALLOCA_N(int, max_nd);
608
+
609
+ // input arguments
610
+ for (j=0; j<nf->nin; j++) {
611
+ if (TYPE(nf->ain[j].type)==T_SYMBOL) {
612
+ continue;
613
+ }
614
+ v = RARRAY_AREF(args,j);
615
+ if (IsNArray(v)) {
616
+ // set LARG(lp,j) with v
617
+ GetNArray(v,na);
618
+ nf_dim = nf->ain[j].dim;
619
+ if (nf_dim > na->ndim) {
620
+ rb_raise(nary_eDimensionError,"requires >= %d-dimensioal array "
621
+ "while %d-dimensional array is given",nf_dim,na->ndim);
622
+ }
623
+ ndloop_check_shape(lp, nf_dim, na);
624
+ dim_beg = lp->ndim + nf->ain[j].dim - na->ndim;
625
+ for (i=0; i<na->ndim; i++) {
626
+ dim_map[i] = lp->trans_map[i+dim_beg];
627
+ //printf("dim_map[%d]=%d na->shape[%d]=%d\n",i,dim_map[i],i,na->shape[i]);
628
+ }
629
+ if (nf->ain[j].type==OVERWRITE) {
630
+ lp->xargs[j].flag = flag = NDL_WRITE;
631
+ } else {
632
+ lp->xargs[j].flag = flag = NDL_READ;
633
+ }
634
+ LARG(lp,j).ndim = nf_dim;
635
+ ndloop_set_stepidx(lp, j, v, dim_map, flag);
636
+ if (nf_dim > 0) {
637
+ LARG(lp,j).shape = na->shape + (na->ndim - nf_dim);
638
+ }
639
+ } else if (TYPE(v)==T_ARRAY) {
640
+ LARG(lp,j).value = v;
641
+ LARG(lp,j).elmsz = sizeof(VALUE);
642
+ LARG(lp,j).ptr = NULL;
643
+ for (i=0; i<=max_nd; i++) {
644
+ LITER(lp,i,j).step = 1;
645
+ }
646
+ }
647
+ }
648
+ // check whether # of element is zero
649
+ for (s=1,i=0; i<=max_nd; i++) {
650
+ s *= lp->n[i];
651
+ }
652
+ if (s==0) {
653
+ for (i=0; i<=max_nd; i++) {
654
+ lp->n[i] = 0;
655
+ }
656
+ }
657
+ }
658
+
659
+
660
+ static int
661
+ ndloop_check_inplace(VALUE type, int na_ndim, size_t *na_shape, VALUE v)
662
+ {
663
+ int i;
664
+ narray_t *na;
665
+
666
+ // type check
667
+ if (type != CLASS_OF(v)) {
668
+ return 0;
669
+ }
670
+ GetNArray(v,na);
671
+ // shape check
672
+ if (na->ndim != na_ndim) {
673
+ return 0;
674
+ }
675
+ for (i=0; i<na_ndim; i++) {
676
+ if (na_shape[i] != na->shape[i]) {
677
+ return 0;
678
+ }
679
+ }
680
+ // v is selected as output
681
+ return 1;
682
+ }
683
+
684
/*
  Search the input arguments for an narray that can serve as the output
  buffer (in-place operation).

  Pass 1: inputs explicitly flagged for in-place use (TEST_INPLACE).
  If the chosen input was already copied during argument casting
  (copy_flag bit set), mark it for write-back so the result is stored
  back into the caller's original object.
  Pass 2: any input that was cast/copied anyway (its buffer is private,
  so it is safe to overwrite).

  Returns the selected narray, or Qnil if none qualifies.
*/
static VALUE
ndloop_find_inplace(ndfunc_t *nf, na_md_loop_t *lp, VALUE type,
                    int na_ndim, size_t *na_shape, VALUE args)
{
    int j;
    VALUE v;

    // find inplace
    for (j=0; j<nf->nin; j++) {
        v = RARRAY_AREF(args,j);
        if (IsNArray(v)) {
            if (TEST_INPLACE(v)) {
                if (ndloop_check_inplace(type,na_ndim,na_shape,v)) {
                    // if already copied, create outary and write-back
                    if (lp->copy_flag & (1<<j)) {
                        lp->writeback = j;
                    }
                    return v;
                }
            }
        }
    }
    // find casted or copied input array
    for (j=0; j<nf->nin; j++) {
        if (lp->copy_flag & (1<<j)) {
            v = RARRAY_AREF(args,j);
            if (ndloop_check_inplace(type,na_ndim,na_shape,v)) {
                return v;
            }
        }
    }
    return Qnil;
}
717
+
718
+
719
+
720
/*
  Resolve an output type specifier t.

  If t is a Fixnum, it is an index into the input arguments: the type
  of the i-th input spec is used instead; and if that spec's type is
  not castable (e.g. Qnil), fall back to the runtime class of the
  i-th input value.  Otherwise t is returned unchanged.
*/
static VALUE
ndloop_get_arg_type(ndfunc_t *nf, VALUE args, VALUE t)
{
    int i;

    // if type is FIXNUM, get the type of i-th argument
    if (FIXNUM_P(t)) {
        i = FIX2INT(t);
        if (i<0 || i>=nf->nin) {
            rb_bug("invalid type: index (%d) out of # of args",i);
        }
        t = nf->ain[i].type;
        // if i-th type is Qnil, get the type of i-th input value
        if (!CASTABLE(t)) {
            t = CLASS_OF(RARRAY_AREF(args,i));
        }
    }
    return t;
}
739
+
740
+
741
/*
  Build (or select in-place) the k-th output narray.

  The output shape is assembled from the md-loop dimensions —
  accumulated dimensions are dropped (or kept as 1 under
  NDF_KEEP_DIM; kept as-is under NDF_CUM) — followed by the
  user-specified trailing dims from nf->aout[k].  dim_map records,
  for each output dim, which lp->n slot it corresponds to.

  For the first output under NDF_INPLACE, an input array may be reused;
  otherwise a fresh narray is allocated and opened for NDL_WRITE.
  Finally the iterator table for the output argument slot
  (j = lp->nin + k) is set up via ndloop_set_stepidx.
*/
static VALUE
ndloop_set_output_narray(ndfunc_t *nf, na_md_loop_t *lp, int k,
                         VALUE type, VALUE args)
{
    int i, j;
    int na_ndim;
    int lp_dim;
    volatile VALUE v=Qnil;
    size_t *na_shape;
    int *dim_map;
    int flag = NDL_READ_WRITE;
    int nd;
    int max_nd = lp->ndim + nf->aout[k].dim;

    na_shape = ALLOCA_N(size_t, max_nd);
    dim_map = ALLOCA_N(int, max_nd);

    // md-loop shape
    na_ndim = 0;
    for (i=0; i<lp->ndim; i++) {
        // na_shape[i] == lp->n[lp->trans_map[i]]
        lp_dim = lp->trans_map[i];
        if (NDF_TEST(nf,NDF_CUM)) { // cumulate with shape kept
            na_shape[na_ndim] = lp->n[lp_dim];
        } else
        if (na_test_reduce(lp->reduce,lp_dim)) { // accumulate dimension
            if (NDF_TEST(nf,NDF_KEEP_DIM)) {
                na_shape[na_ndim] = 1; // leave it
            } else {
                continue; // delete dimension
            }
        } else {
            na_shape[na_ndim] = lp->n[lp_dim];
        }
        dim_map[na_ndim++] = lp_dim;
    }

    // user-specified shape
    for (i=0; i<nf->aout[k].dim; i++) {
        na_shape[na_ndim] = nf->aout[k].shape[i];
        dim_map[na_ndim++] = i + lp->ndim;
    }

    // find inplace from input arrays
    if (k==0 && NDF_TEST(nf,NDF_INPLACE)) {
        v = ndloop_find_inplace(nf,lp,type,na_ndim,na_shape,args);
    }
    if (!RTEST(v)) {
        // new object
        v = nary_new(type, na_ndim, na_shape);
        flag = NDL_WRITE;
    }

    j = lp->nin + k;
    LARG(lp,j).ndim = nd = nf->aout[k].dim;
    ndloop_set_stepidx(lp, j, v, dim_map, flag);
    if (nd > 0) {
        LARG(lp,j).shape = nf->aout[k].shape;
    }

    return v;
}
808
+
809
/*
  Create all output objects and collect them in a Ruby Array.

  For each output spec: resolve its type, then either allocate/select
  an NArray output (via ndloop_set_output_narray) or set up a
  Ruby-Array output whose elements are VALUEs (step = sizeof(VALUE)
  on every dimension).  Any other type is an error.

  If an initializer argument was recorded (lp->init_aidx >= 0), its
  value is stored into the corresponding result before looping.
*/
static VALUE
ndloop_set_output(ndfunc_t *nf, na_md_loop_t *lp, VALUE args)
{
    int i, j, k, idx;
    volatile VALUE v, t, results;
    VALUE init;

    int max_nd = lp->ndim + lp->user.ndim;

    // output results
    results = rb_ary_new2(nf->nout);

    for (k=0; k<nf->nout; k++) {
        t = nf->aout[k].type;
        t = ndloop_get_arg_type(nf,args,t);

        if (rb_obj_is_kind_of(t, rb_cClass)) {
            if (RTEST(rb_class_inherited_p(t, cNArray))) {
                // NArray
                v = ndloop_set_output_narray(nf,lp,k,t,args);
                rb_ary_push(results, v);
            }
            else if (RTEST(rb_class_inherited_p(t, rb_cArray))) {
                // Ruby Array
                j = lp->nin + k;
                for (i=0; i<=max_nd; i++) {
                    LITER(lp,i,j).step = sizeof(VALUE);
                }
                LARG(lp,j).value = t;
                LARG(lp,j).elmsz = sizeof(VALUE);
            } else {
                rb_raise(rb_eRuntimeError,"ndloop_set_output: invalid for type");
            }
        }
    }

    // initializer
    k = lp->init_aidx;
    if (k > -1) {
        idx = nf->ain[k].dim;
        v = RARRAY_AREF(results,idx);
        init = RARRAY_AREF(args,k);
        na_store(v,init);
    }

    return results;
}
856
+
857
+
858
// Compressing dimensions.
//
// For example, compressing [2,3] shape into [6] so that we can process
// all elements with one user loop.
//
// Two adjacent dimensions (i-1, i) are merged when, for every argument,
// both use plain strides (no index arrays) and the (i-1)-th stride is
// exactly step[i] * n[i] — i.e. the pair is contiguous.  They must also
// agree on whether they are reduced dimensions.  Merged-away slots are
// shifted toward the front and filled with size 1; at the end the head
// of lp->n / the iterator tables is advanced past the `cnt` dummy slots.
static void
ndfunc_contract_loop(na_md_loop_t *lp)
{
    int i,j,k,success,cnt=0;
    int red0, redi;

    redi = na_test_reduce(lp->reduce,0);

    for (i=1; i<lp->ndim; i++) {
        red0 = redi;
        redi = na_test_reduce(lp->reduce,i);
        // only merge dims with the same reduce status
        if (red0 != redi) {
            continue;
        }
        success = 1;
        for (j=0; j<lp->narg; j++) {
            if (!(LITER(lp,i,j).idx == NULL &&
                  LITER(lp,i-1,j).idx == NULL &&
                  LITER(lp,i-1,j).step == LITER(lp,i,j).step*(ssize_t)(lp->n[i]))) {
                success = 0;
                break;
            }
        }
        if (success) {
            // contract (i-1)-th and i-th dimension
            lp->n[i] *= lp->n[i-1];
            // shift dimensions
            for (k=i-1; k>cnt; k--) {
                lp->n[k] = lp->n[k-1];
            }
            for (; k>=0; k--) {
                lp->n[k] = 1;   // fill vacated leading slots with size 1
            }
            for (j=0; j<lp->narg; j++) {
                for (k=i-1; k>cnt; k--) {
                    LITER(lp,k,j) = LITER(lp,k-1,j);
                }
            }
            if (redi) {
                lp->reduce_dim--;
            }
            cnt++;
        }
    }
    if (cnt>0) {
        // drop the cnt dummy leading dimensions
        for (j=0; j<lp->narg; j++) {
            LITER(lp,cnt,j).pos = LITER(lp,0,j).pos;
            lp->xargs[j].iter = &LITER(lp,cnt,j);
        }
        lp->n = &(lp->n[cnt]);
        lp->ndim -= cnt;
    }
}
925
+
926
+
927
// Ndloop does loop at two places, loop_narray and user loop.
// loop_narray is an outer loop, and the user loop is an internal loop.
//
// lp->ndim: ndim to be looped at loop_narray
// lp->user.ndim: ndim to be looped at user function
//
// For example, for element-wise function, lp->user.ndim is 1, and lp->ndim -= 1.
static void
ndfunc_set_user_loop(ndfunc_t *nf, na_md_loop_t *lp)
{
    int j, ud=0;

    if (lp->reduce_dim > 0) {
        // Increase user.ndim by number of dimensions to reduce for reduction function.
        ud = lp->reduce_dim;
    }
    else if (lp->ndim > 0 && NDF_TEST(nf,NDF_HAS_LOOP)) {
        // Set user.ndim to 1 (default is 0) for element-wise function.
        ud = 1;
    }
    else {
        goto skip_ud;
    }
    if (ud > lp->ndim) {
        rb_bug("Reduce-dimension is larger than loop-dimension");
    }
    // Increase user loop dimension. NOTE: lp->ndim + lp->user.ndim is the total dimension.
    lp->user.ndim += ud;
    lp->ndim -= ud;
    for (j=0; j<lp->narg; j++) {
        // a non-NULL shape here means the arg already claimed user dims
        if (LARG(lp,j).shape) {
            rb_bug("HAS_LOOP or reduce-dimension=%d conflicts with user-dimension",lp->reduce_dim);
        }
        LARG(lp,j).ndim += ud;
        LARG(lp,j).shape = &(lp->n[lp->ndim]);
    }

 skip_ud:
    // user function shape is the latter part of na_md_loop shape.
    lp->user.n = &(lp->n[lp->ndim]);
    for (j=0; j<lp->narg; j++) {
        LARG(lp,j).iter = &LITER(lp,lp->ndim,j);
    }
}
974
+
975
+
976
// Initialize lp->user for indexer loop.
//
// The whole md-loop is handed to the user function (lp->user.ndim takes
// all of lp->ndim; the outer loop collapses to 0 dims).  For flat
// reductions only arg 0 (the input) gets its ndim/shape here; otherwise
// every argument is set up element-wise and no reduction info is passed.
static void
ndfunc_set_user_indexer_loop(ndfunc_t *nf, na_md_loop_t *lp)
{
    int j;

    lp->user.ndim = lp->ndim;
    lp->ndim = 0;

    if (NDF_TEST(nf,NDF_FLAT_REDUCE)) {
        // in
        LARG(lp,0).ndim = lp->user.ndim;
        LARG(lp,0).shape = &(lp->n[lp->ndim]);
        // out is constructed at na_make_reduction_arg from in and lp->reduce

        lp->user.n = &(lp->n[lp->ndim]);
        for (j=0; j<lp->narg; j++) {
            LARG(lp,j).iter = &LITER(lp,lp->ndim,j);
        }

        lp->user.reduce_dim = lp->reduce_dim;
        lp->user.reduce = lp->reduce;
    } else { // element-wise
        for (j=0; j<lp->narg; j++) {
            LARG(lp,j).ndim = lp->user.ndim;
            LARG(lp,j).shape = &(lp->n[lp->ndim]);
        }

        lp->user.n = &(lp->n[lp->ndim]);
        for (j=0; j<lp->narg; j++) {
            LARG(lp,j).iter = &LITER(lp,lp->ndim,j);
        }

        lp->user.reduce_dim = 0;
        lp->user.reduce = 0;
    }
}
1013
+
1014
+
1015
// Judge whether a (contiguous) buffer copy is required or not, and malloc if it is required.
//
// CASES TO REQUIRE A BUFFER COPY:
// 1) ndloop has `idx` but does not support NDF_INDEX_LOOP.
// 2) ndloop has non-contiguous arrays but does not support NDF_STRIDE_LOOP.
//
// For each argument, the user-loop dims are scanned innermost-first to
// classify the access pattern (bit 2 = index loop, bit 1 = strided).
// Trailing contiguous dims are contracted into a larger element size.
// If the pattern conflicts with what the user function supports
// (loop_spec), a contiguous work buffer is allocated — on the device
// when the source pointer is device memory, on the host otherwise —
// and LBUFCP(lp,j) records how to shuttle data in/out of it.
// Finally, when reducing more than one dim, the user loop is flattened
// to a single dimension of n_total elements.
static void
ndfunc_set_bufcp(ndfunc_t *nf, na_md_loop_t *lp)
{
    unsigned int f;
    int i, j;
    int nd, ndim;
    bool zero_step;
    ssize_t n, sz, elmsz, stride, n_total; //, last_step;
    size_t *buf_shape;
    na_loop_iter_t *buf_iter=NULL, *src_iter;

    unsigned int loop_spec = ndloop_func_loop_spec(nf, lp->user.ndim);

    // total number of elements covered by the user loop
    n_total = lp->user.n[0];
    for (i=1; i<lp->user.ndim; i++) {
        n_total *= lp->user.n[i];
    }

    for (j=0; j<lp->narg; j++) {
        ndim = nd = LARG(lp,j).ndim;
        sz = elmsz = LARG(lp,j).elmsz;
        src_iter = LARG(lp,j).iter;
        f = 0;
        zero_step = 1;
        for (i=ndim; i>0; ) {
            i--;
            if (LARG(lp,j).shape) {
                n = LARG(lp,j).shape[i];
            } else {
                n = lp->user.n[i];
            }
            stride = sz * n;
            if (src_iter[i].idx) {
                f |= 2; // INDEX LOOP
                zero_step = 0;
            } else {
                if (src_iter[i].step != sz) {
                    f |= 1; // NON_CONTIGUOUS LOOP
                } else {
                    // CONTIGUOUS LOOP
                    if (i==ndim-1) { // contract if last dimension
                        ndim = i;
                        elmsz = stride;
                    }
                }
                if (src_iter[i].step != 0) {
                    zero_step = 0;
                }
            }
            sz = stride;
        }

        if (zero_step) {
            // no buffer needed
            continue;
        }

        // should check flatten-able loop to avoid buffering

        // over loop_spec or reduce_loop is not contiguous
        if (f & loop_spec || (lp->reduce_dim > 1 && ndim > 0)) {
            buf_iter = ALLOC_N(na_loop_iter_t,nd+3);
            buf_shape = ALLOC_N(size_t,nd);
            buf_iter[nd].pos = 0;
            buf_iter[nd].step = 0;
            buf_iter[nd].idx = NULL;
            sz = LARG(lp,j).elmsz;
            // buffer is dense: rebuild strides innermost-first
            for (i=nd; i>0; ) {
                i--;
                buf_iter[i].pos = 0;
                buf_iter[i].step = sz;
                buf_iter[i].idx = NULL;
                n = LARG(lp,j).shape[i];
                buf_shape[i] = n;
                sz *= n;
            }
            LBUFCP(lp,j) = ALLOC(na_buffer_copy_t);
            LBUFCP(lp,j)->ndim = ndim;
            LBUFCP(lp,j)->elmsz = elmsz;
            LBUFCP(lp,j)->n = buf_shape;
            LBUFCP(lp,j)->src_iter = src_iter;
            LBUFCP(lp,j)->buf_iter = buf_iter;
            LARG(lp,j).iter = buf_iter;
            LBUFCP(lp,j)->src_ptr = LARG(lp,j).ptr;
            // allocate the buffer in the same memory space as the source
            if (cumo_cuda_runtime_is_device_memory(LARG(lp,j).ptr)) {
                LARG(lp,j).ptr = LBUFCP(lp,j)->buf_ptr = cumo_cuda_runtime_malloc(sz);
            }
            else {
                LARG(lp,j).ptr = LBUFCP(lp,j)->buf_ptr = xmalloc(sz);
            }
        }
    }

#if 0
    for (j=0; j<lp->narg; j++) {
        ndim = lp->user.ndim;
        src_iter = LARG(lp,j).iter;
        last_step = src_iter[ndim-1].step;
        if (lp->reduce_dim>1) {
            buf_iter = ALLOC_N(na_loop_iter_t,2);
            buf_iter[0].pos = LARG(lp,j).iter[0].pos;
            buf_iter[0].step = last_step;
            buf_iter[0].idx = NULL;
            buf_iter[1].pos = 0;
            buf_iter[1].step = 0;
            buf_iter[1].idx = NULL;
            LARG(lp,j).iter = buf_iter;
            lp->xargs[j].free_user_iter = 1;
        }
    }
#endif

    // flatten reduce dimensions
    if (lp->reduce_dim > 1) {
#if 1
        for (j=0; j<lp->narg; j++) {
            ndim = lp->user.ndim;
            LARG(lp,j).iter[0].step = LARG(lp,j).iter[ndim-1].step;
            LARG(lp,j).iter[0].idx = NULL;
        }
#endif
        lp->user.n[0] = n_total;
        lp->user.ndim = 1;
    }
}
1160
+
1161
+
1162
// Make contiguous memory for ops not supporting index or stride (step) loop
//
// Walks the (possibly indexed/strided) source with an odometer counter
// and packs elmsz-sized chunks densely into buf_ptr.  Device-to-device
// copies use cudaMemcpyAsync on the default stream; host copies use
// memcpy.  Index arrays may live on the device, hence the synchronize
// before dereferencing idx.
static void
ndloop_copy_to_buffer(na_buffer_copy_t *lp)
{
    size_t *c;
    char *src, *buf;
    int i;
    int nd = lp->ndim;
    size_t elmsz = lp->elmsz;
    size_t buf_pos = 0;
    DBG(size_t j);

    DBG(printf("<to buf> ["));
    // zero-dimension: a single elmsz-sized copy
    if (nd==0) {
        src = lp->src_ptr + LITER_SRC(lp,0).pos;
        buf = lp->buf_ptr;
        if (cumo_cuda_runtime_is_device_memory(src) && cumo_cuda_runtime_is_device_memory(buf)) {
            DBG(printf("DtoD] ["));
            cumo_cuda_runtime_check_status(cudaMemcpyAsync(buf,src,elmsz,cudaMemcpyDeviceToDevice,0));
        } else {
            DBG(printf("HtoH] ["));
            memcpy(buf,src,elmsz);
        }
        DBG(for (j=0; j<elmsz/8; j++) {printf("%g,",((double*)(buf))[j]);});
        goto loop_end;
    }
    // initialize loop counter
    c = ALLOCA_N(size_t, nd+1);
    for (i=0; i<=nd; i++) c[i]=0;
    // loop body
    for (i=0;;) {
        // i-th dimension
        for (; i<nd; i++) {
            if (LITER_SRC(lp,i).idx) {
                SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("ndloop_copy_to_buffer", "any");
                cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
                LITER_SRC(lp,i+1).pos = LITER_SRC(lp,i).pos + LITER_SRC(lp,i).idx[c[i]];
            } else {
                LITER_SRC(lp,i+1).pos = LITER_SRC(lp,i).pos + LITER_SRC(lp,i).step*c[i];
            }
        }
        src = lp->src_ptr + LITER_SRC(lp,nd).pos;
        buf = lp->buf_ptr + buf_pos;
        if (cumo_cuda_runtime_is_device_memory(src) && cumo_cuda_runtime_is_device_memory(buf)) {
            DBG(printf("DtoD] ["));
            cumo_cuda_runtime_check_status(cudaMemcpyAsync(buf,src,elmsz,cudaMemcpyDeviceToDevice,0));
        } else {
            DBG(printf("HtoH] ["));
            memcpy(buf,src,elmsz);
        }
        DBG(for (j=0; j<elmsz/8; j++) {printf("%g,",((double*)(buf))[j]);});
        buf_pos += elmsz;
        // count up (odometer)
        for (;;) {
            if (i<=0) goto loop_end;
            i--;
            if (++c[i] < lp->n[i]) break;
            c[i] = 0;
        }
    }
 loop_end:
    ;
    DBG(printf("]\n"));
}
1228
+
1229
// Inverse of ndloop_copy_to_buffer: scatter the dense work buffer back
// into the (possibly indexed/strided) destination described by src_iter.
//
// NOTE(review): unlike ndloop_copy_to_buffer, this path dereferences
// LITER_SRC(lp,i).idx WITHOUT a preceding cudaDeviceSynchronize() —
// confirm whether the idx array can be device memory here as well.
static void
ndloop_copy_from_buffer(na_buffer_copy_t *lp)
{
    size_t *c;
    char *src, *buf;
    int i;
    int nd = lp->ndim;
    size_t elmsz = lp->elmsz;
    size_t buf_pos = 0;
    DBG(size_t j);

    DBG(printf("<from buf> ["));
    // zero-dimension: a single elmsz-sized copy
    if (nd==0) {
        src = lp->src_ptr + LITER_SRC(lp,0).pos;
        buf = lp->buf_ptr;
        if (cumo_cuda_runtime_is_device_memory(src) && cumo_cuda_runtime_is_device_memory(buf)) {
            DBG(printf("DtoD] ["));
            cumo_cuda_runtime_check_status(cudaMemcpyAsync(src,buf,elmsz,cudaMemcpyDeviceToDevice,0));
        } else {
            DBG(printf("HtoH] ["));
            memcpy(src,buf,elmsz);
        }
        DBG(for (j=0; j<elmsz/8; j++) {printf("%g,",((double*)(src))[j]);});
        goto loop_end;
    }
    // initialize loop counter
    c = ALLOCA_N(size_t, nd+1);
    for (i=0; i<=nd; i++) c[i]=0;
    // loop body
    for (i=0;;) {
        // i-th dimension
        for (; i<nd; i++) {
            if (LITER_SRC(lp,i).idx) {
                LITER_SRC(lp,i+1).pos = LITER_SRC(lp,i).pos + LITER_SRC(lp,i).idx[c[i]];
            } else {
                LITER_SRC(lp,i+1).pos = LITER_SRC(lp,i).pos + LITER_SRC(lp,i).step*c[i];
            }
        }
        src = lp->src_ptr + LITER_SRC(lp,nd).pos;
        buf = lp->buf_ptr + buf_pos;
        if (cumo_cuda_runtime_is_device_memory(src) && cumo_cuda_runtime_is_device_memory(buf)) {
            DBG(printf("DtoD] ["));
            cumo_cuda_runtime_check_status(cudaMemcpyAsync(src,buf,elmsz,cudaMemcpyDeviceToDevice,0));
        } else {
            DBG(printf("HtoH] ["));
            memcpy(src,buf,elmsz);
        }
        DBG(for (j=0; j<elmsz/8; j++) {printf("%g,",((double*)(src))[j]);});
        buf_pos += elmsz;
        // count up (odometer)
        for (;;) {
            if (i<=0) goto loop_end;
            i--;
            if (++c[i] < lp->n[i]) break;
            c[i] = 0;
        }
    }
 loop_end:
    DBG(printf("]\n"));
}
1291
+
1292
+
1293
+ static void
1294
+ ndfunc_write_back(ndfunc_t *nf, na_md_loop_t *lp, VALUE orig_args, VALUE results)
1295
+ {
1296
+ VALUE src, dst;
1297
+
1298
+ if (lp->writeback >= 0) {
1299
+ dst = RARRAY_AREF(orig_args,lp->writeback);
1300
+ src = RARRAY_AREF(results,0);
1301
+ na_store(dst,src);
1302
+ RARRAY_ASET(results,0,dst);
1303
+ }
1304
+ }
1305
+
1306
+
1307
+ static VALUE
1308
+ ndloop_extract(VALUE results, ndfunc_t *nf)
1309
+ {
1310
+ // long n, i;
1311
+ // VALUE x, y;
1312
+ // narray_t *na;
1313
+
1314
+ // extract result objects
1315
+ switch(nf->nout) {
1316
+ case 0:
1317
+ return Qnil;
1318
+ case 1:
1319
+ return RARRAY_AREF(results,0);
1320
+ // x = RARRAY_AREF(results,0);
1321
+ // if (NDF_TEST(nf,NDF_EXTRACT)) {
1322
+ // if (IsNArray(x)){
1323
+ // GetNArray(x,na);
1324
+ // if (NA_NDIM(na)==0) {
1325
+ // x = rb_funcall(x, id_extract, 0);
1326
+ // }
1327
+ // }
1328
+ // }
1329
+ // return x;
1330
+ }
1331
+ // if (NDF_TEST(nf,NDF_EXTRACT)) {
1332
+ // n = RARRAY_LEN(results);
1333
+ // for (i=0; i<n; i++) {
1334
+ // x = RARRAY_AREF(results,i);
1335
+ // if (IsNArray(x)){
1336
+ // GetNArray(x,na);
1337
+ // if (NA_NDIM(na)==0) {
1338
+ // y = rb_funcall(x, id_extract, 0);
1339
+ // RARRAY_ASET(results,i,y);
1340
+ // }
1341
+ // }
1342
+ // }
1343
+ // }
1344
+ return results;
1345
+ }
1346
+
1347
+ static bool
1348
+ loop_is_using_idx(na_md_loop_t *lp)
1349
+ {
1350
+ int i, j;
1351
+ int nd = lp->ndim;
1352
+
1353
+ if (nd<0) {
1354
+ rb_bug("bug? lp->ndim = %d\n", lp->ndim);
1355
+ }
1356
+
1357
+ // i-th dimension
1358
+ for (i=0; i<nd; i++) {
1359
+ // j-th argument
1360
+ for (j=0; j<lp->narg; j++) {
1361
+ if (LITER(lp,i,j).idx) {
1362
+ return true;
1363
+ }
1364
+ }
1365
+ }
1366
+ return false;
1367
+ }
1368
+
1369
+ static void
1370
+ loop_narray(ndfunc_t *nf, na_md_loop_t *lp);
1371
+
1372
/*
  Main driver executed under rb_ensure: prepares inputs and outputs,
  optionally contracts dimensions and sets up buffering, runs the loop
  function, then handles errors, write-back, and result extraction.
*/
static VALUE
ndloop_run(VALUE vlp)
{
    volatile VALUE args, orig_args, results;
    na_md_loop_t *lp = (na_md_loop_t*)(vlp);
    ndfunc_t *nf;

    orig_args = lp->vargs;
    nf = lp->ndfunc;

    args = rb_obj_dup(orig_args);

    // setup ndloop iterator with arguments
    ndloop_init_args(nf, lp, args);
    results = ndloop_set_output(nf, lp, args);

    // contract loop (compact dimensions)
    if (NDF_TEST(nf,NDF_INDEXER_LOOP) && NDF_TEST(nf,NDF_FLAT_REDUCE)) {
        // do nothing
        // TODO(sonots): support compacting dimensions in reduction indexer loop if it allows speed up.
    } else {
        if (lp->loop_func == loop_narray) {
            ndfunc_contract_loop(lp);
            if (na_debug_flag) {
                printf("-- ndfunc_contract_loop --\n");
                print_ndloop(lp);
            }
        }
    }

    // setup lp->user
    if (NDF_TEST(nf,NDF_INDEXER_LOOP)) {
        ndfunc_set_user_indexer_loop(nf, lp);
        if (na_debug_flag) {
            printf("-- ndfunc_set_user_indexer_loop --\n");
            print_ndloop(lp);
        }
    } else {
        ndfunc_set_user_loop(nf, lp);
        if (na_debug_flag) {
            printf("-- ndfunc_set_user_loop --\n");
            print_ndloop(lp);
        }
    }

    // setup buffering during loop
    if (NDF_TEST(nf,NDF_INDEXER_LOOP) && NDF_TEST(nf,NDF_FLAT_REDUCE) && !loop_is_using_idx(lp)) {
        // do nothing
    } else {
        if (lp->loop_func == loop_narray) {
            ndfunc_set_bufcp(nf, lp);
        }
        if (na_debug_flag) {
            printf("-- ndfunc_set_bufcp --\n");
            print_ndloop(lp);
        }
    }

    // loop
    (*(lp->loop_func))(nf, lp);

    if (RTEST(lp->user.err_type)) {
        rb_raise(lp->user.err_type, "error in NArray operation");
    }

    // write-back will be placed here
    ndfunc_write_back(nf, lp, orig_args, results);

    // extract result objects
    return ndloop_extract(results, nf);
}
1447
+
1448
+
1449
+ // ---------------------------------------------------------------------------
1450
+
1451
/*
  Outer (narray) loop: iterates an odometer counter over lp->ndim
  dimensions, computing per-argument positions from step/idx, and calls
  the user function once per outermost iteration.  Buffered arguments
  are packed before and scattered back after each call (write args
  only).  With nd==0 or an indexer loop, the user function is invoked
  exactly once.
*/
static void
loop_narray(ndfunc_t *nf, na_md_loop_t *lp)
{
    size_t *c;
    int i, j;
    int nd = lp->ndim;

    if (nd<0) {
        rb_bug("bug? lp->ndim = %d\n", lp->ndim);
    }

    if (nd==0 || NDF_TEST(nf,NDF_INDEXER_LOOP)) {
        // single invocation: no outer iteration needed
        for (j=0; j<lp->nin; j++) {
            if (lp->xargs[j].bufcp) {
                ndloop_copy_to_buffer(lp->xargs[j].bufcp);
            }
        }
        (*(nf->func))(&(lp->user));
        for (j=0; j<lp->narg; j++) {
            if (lp->xargs[j].bufcp && (lp->xargs[j].flag & NDL_WRITE)) {
                ndloop_copy_from_buffer(lp->xargs[j].bufcp);
            }
        }
        return;
    }

    // initialize loop counter
    c = ALLOCA_N(size_t, nd+1);
    for (i=0; i<=nd; i++) c[i]=0;

    // loop body
    for (i=0;;) {
        // i-th dimension
        for (; i<nd; i++) {
            // j-th argument: derive position of dim i+1 from dim i
            for (j=0; j<lp->narg; j++) {
                if (LITER(lp,i,j).idx) {
                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).idx[c[i]];
                } else {
                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).step*c[i];
                }
            }
        }
        for (j=0; j<lp->nin; j++) {
            if (lp->xargs[j].bufcp) {
                // copy data to work buffer
                ndloop_copy_to_buffer(lp->xargs[j].bufcp);
            }
        }
        (*(nf->func))(&(lp->user));
        for (j=0; j<lp->narg; j++) {
            if (lp->xargs[j].bufcp && (lp->xargs[j].flag & NDL_WRITE)) {
                // copy data back from work buffer
                ndloop_copy_from_buffer(lp->xargs[j].bufcp);
            }
        }
        if (RTEST(lp->user.err_type)) {return;}

        // count up (odometer)
        for (;;) {
            if (i<=0) goto loop_end;
            i--;
            if (++c[i] < lp->n[i]) break;
            c[i] = 0;
        }
    }
 loop_end:
    ;
}
1526
+
1527
+
1528
/*
  Common entry: cast args to NArray, allocate the md-loop struct on
  the stack, and run it under rb_ensure so ndloop_release always frees
  resources even if the loop raises.
*/
static VALUE
na_ndloop_main(ndfunc_t *nf, VALUE args, void *opt_ptr)
{
    unsigned int copy_flag;
    na_md_loop_t lp;

    if (na_debug_flag) print_ndfunc(nf);

    // cast arguments to NArray
    copy_flag = ndloop_cast_args(nf, args);

    // allocate ndloop struct
    ndloop_alloc(&lp, nf, args, opt_ptr, copy_flag, loop_narray);

    return rb_ensure(ndloop_run, (VALUE)&lp, ndloop_release, (VALUE)&lp);
}
1544
+
1545
+
1546
/*
  Public varargs entry point: collect argc VALUEs into a Ruby Array
  and dispatch to na_ndloop_main with no opaque option pointer.
  (The K&R branch supports pre-ANSI varargs via va_dcl.)
*/
VALUE
#ifdef HAVE_STDARG_PROTOTYPES
na_ndloop(ndfunc_t *nf, int argc, ...)
#else
na_ndloop(nf, argc, va_alist)
  ndfunc_t *nf;
  int argc;
  va_dcl
#endif
{
    va_list ar;

    int i;
    VALUE *argv;
    volatile VALUE args;

    argv = ALLOCA_N(VALUE,argc);

    va_init_list(ar, argc);
    for (i=0; i<argc; i++) {
        argv[i] = va_arg(ar, VALUE);
    }
    va_end(ar);

    args = rb_ary_new4(argc, argv);

    return na_ndloop_main(nf, args, NULL);
}
1574
+
1575
+
1576
/* Like na_ndloop but the arguments are already collected in a Ruby Array. */
VALUE
na_ndloop2(ndfunc_t *nf, VALUE args)
{
    return na_ndloop_main(nf, args, NULL);
}
1581
+
1582
/*
  Varargs entry point with an opaque option pointer that is passed
  through to the user iterator function via the loop struct.
*/
VALUE
#ifdef HAVE_STDARG_PROTOTYPES
na_ndloop3(ndfunc_t *nf, void *ptr, int argc, ...)
#else
na_ndloop3(nf, ptr, argc, va_alist)
  ndfunc_t *nf;
  void *ptr;
  int argc;
  va_dcl
#endif
{
    va_list ar;

    int i;
    VALUE *argv;
    volatile VALUE args;

    argv = ALLOCA_N(VALUE,argc);

    va_init_list(ar, argc);
    for (i=0; i<argc; i++) {
        argv[i] = va_arg(ar, VALUE);
    }
    va_end(ar);

    args = rb_ary_new4(argc, argv);

    return na_ndloop_main(nf, args, ptr);
}
1611
+
1612
/* Like na_ndloop3 but the arguments are already collected in a Ruby Array. */
VALUE
na_ndloop4(ndfunc_t *nf, void *ptr, VALUE args)
{
    return na_ndloop_main(nf, args, ptr);
}
1617
+
1618
+ //----------------------------------------------------------------------
1619
+
1620
+ VALUE
1621
+ na_info_str(VALUE ary)
1622
+ {
1623
+ int nd, i;
1624
+ char tmp[32];
1625
+ VALUE buf;
1626
+ narray_t *na;
1627
+
1628
+ GetNArray(ary,na);
1629
+ nd = na->ndim;
1630
+
1631
+ buf = rb_str_new2(rb_class2name(CLASS_OF(ary)));
1632
+ if (NA_TYPE(na) == NARRAY_VIEW_T) {
1633
+ rb_str_cat(buf,"(view)",6);
1634
+ }
1635
+ rb_str_cat(buf,"#shape=[",8);
1636
+ if (nd>0) {
1637
+ for (i=0;;) {
1638
+ sprintf(tmp,"%"SZF"u",na->shape[i]);
1639
+ rb_str_cat2(buf,tmp);
1640
+ if (++i==nd) break;
1641
+ rb_str_cat(buf,",",1);
1642
+ }
1643
+ }
1644
+ rb_str_cat(buf,"]",1);
1645
+ return buf;
1646
+ }
1647
+
1648
+
1649
+ //----------------------------------------------------------------------
1650
+
1651
+ #define ncol cumo_na_inspect_cols
1652
+ #define nrow cumo_na_inspect_rows
1653
+ extern int ncol, nrow;
1654
+
1655
/*
  Loop function used by inspect: walks the first argument with an
  odometer counter and appends each element's text representation
  (produced by the supplied na_text_func_t) to the string buffer in
  lp->loop_opt, with nested "[...]" brackets.  Output is truncated
  horizontally at `ncol` columns ("...") and vertically at `nrow`
  rows.
*/
static void
loop_inspect(ndfunc_t *nf, na_md_loop_t *lp)
{
    int nd, i, ii;
    size_t *c;
    int col=0, row=0;
    long len;
    VALUE str;
    na_text_func_t func = (na_text_func_t)(nf->func);
    VALUE buf, opt;

    nd = lp->ndim;
    buf = lp->loop_opt;
    opt = lp->user.option;

    // any zero-size dimension: the whole array prints as "[]"
    for (i=0; i<nd; i++) {
        if (lp->n[i] == 0) {
            rb_str_cat(buf,"[]",2);
            return;
        }
    }

    rb_str_cat(buf,"\n",1);

    c = ALLOCA_N(size_t, nd+1);
    for (i=0; i<=nd; i++) c[i]=0;

    if (nd>0) {
        rb_str_cat(buf,"[",1);
    } else {
        rb_str_cat(buf,"",0);
    }

    col = nd*2;
    for (i=0;;) {
        // open brackets for newly-entered inner dimensions
        if (i<nd-1) {
            for (ii=0; ii<i; ii++) rb_str_cat(buf," ",1);
            for (; ii<nd-1; ii++) rb_str_cat(buf,"[",1);
        }
        // advance positions down to the innermost dimension
        for (; i<nd; i++) {
            if (LITER(lp,i,0).idx) {
                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).idx[c[i]];
            } else {
                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).step*c[i];
            }
        }
        str = (*func)(LARG(lp,0).ptr, LITER(lp,i,0).pos, opt);

        len = RSTRING_LEN(str) + 2;
        if (ncol>0 && col+len > ncol-3) {
            // row too wide: elide the rest of this innermost run
            rb_str_cat(buf,"...",3);
            c[i-1] = lp->n[i-1];
        } else {
            rb_str_append(buf, str);
            col += len;
        }
        // count up; close a bracket for each exhausted dimension
        for (;;) {
            if (i==0) goto loop_end;
            i--;
            if (++c[i] < lp->n[i]) break;
            rb_str_cat(buf,"]",1);
            c[i] = 0;
        }
        rb_str_cat(buf,", ",2);
        if (i<nd-1) {
            rb_str_cat(buf,"\n ",2);
            col = nd*2;
            row++;
            if (row==nrow) {
                rb_str_cat(buf,"...",3);
                goto loop_end;
            }
        }
    }
 loop_end:
    ;
}
1734
+
1735
+
1736
+ VALUE
1737
+ na_ndloop_inspect(VALUE nary, na_text_func_t func, VALUE opt)
1738
+ {
1739
+ volatile VALUE args;
1740
+ na_md_loop_t lp;
1741
+ VALUE buf;
1742
+ ndfunc_arg_in_t ain[3] = {{Qnil,0},{sym_loop_opt},{sym_option}};
1743
+ ndfunc_t nf = { (na_iter_func_t)func, NO_LOOP, 3, 0, ain, 0 };
1744
+ //nf = ndfunc_alloc(NULL, NO_LOOP, 1, 0, Qnil);
1745
+
1746
+ buf = na_info_str(nary);
1747
+
1748
+ if (na_get_pointer(nary)==NULL) {
1749
+ return rb_str_cat(buf,"(empty)",7);
1750
+ }
1751
+
1752
+ //rb_p(args);
1753
+ //if (na_debug_flag) print_ndfunc(&nf);
1754
+
1755
+ args = rb_ary_new3(3,nary,buf,opt);
1756
+
1757
+ // cast arguments to NArray
1758
+ //ndloop_cast_args(nf, args);
1759
+
1760
+ // allocate ndloop struct
1761
+ ndloop_alloc(&lp, &nf, args, NULL, 0, loop_inspect);
1762
+
1763
+ rb_ensure(ndloop_run, (VALUE)&lp, ndloop_release, (VALUE)&lp);
1764
+
1765
+ return buf;
1766
+ }
1767
+
1768
+
1769
+ //----------------------------------------------------------------------
1770
+
1771
// Store a sub-NArray `a` (found at depth i0 inside a nested Ruby Array
// being stored) into the destination narray. Casts `a` to the
// destination's class if needed, validates its rank against the
// remaining destination dimensions, maps its dimensions onto the
// destination via trans_map, then iterates the remaining dims calling
// nf->func per element. LARG(lp,1).value is set Qtrue while inside the
// source's bounds and Qfalse when the destination index exceeds the
// source shape (so the iterator can fill/skip).
static void
loop_store_subnarray(ndfunc_t *nf, na_md_loop_t *lp, int i0, size_t *c, VALUE a)
{
    int nd = lp->ndim;
    int i, j;
    narray_t *na;
    int *dim_map;
    VALUE a_type;

    // Cast the source to the destination narray's class if they differ.
    a_type = CLASS_OF(LARG(lp,0).value);
    if (CLASS_OF(a) != a_type) {
        a = rb_funcall(a_type, id_cast, 1, a);
    }
    GetNArray(a,na);
    // The sub-narray must cover exactly the remaining dimensions.
    if (na->ndim != nd-i0+1) {
        rb_raise(nary_eShapeError, "mismatched dimension of sub-narray: "
                 "nd_src=%d, nd_dst=%d", na->ndim, nd-i0+1);
    }
    // Map each source dimension to the corresponding (transposed)
    // destination dimension.
    dim_map = ALLOCA_N(int, na->ndim);
    for (i=0; i<na->ndim; i++) {
        dim_map[i] = lp->trans_map[i+i0];
    }
    ndloop_set_stepidx(lp, 1, a, dim_map, NDL_READ);
    LARG(lp,1).shape = &(na->shape[na->ndim-1]);

    // loop body: iterate dimensions i0..nd-1 of the destination
    for (i=i0;;) {
        LARG(lp,1).value = Qtrue;
        for (; i<nd; i++) {
            // advance both destination (j=0) and source (j=1) positions
            for (j=0; j<2; j++) {
                if (LITER(lp,i,j).idx) {
                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).idx[c[i]];
                } else {
                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).step*c[i];
                }
            }
            // outside the source's extent in this dimension
            if (c[i] >= na->shape[i-i0]) {
                LARG(lp,1).value = Qfalse;
            }
        }

        (*(nf->func))(&(lp->user));

        // increment counters with carry, stopping at depth i0
        for (;;) {
            if (i<=i0) goto loop_end;
            i--; c[i]++;
            if (c[i] < lp->n[i]) break;
            c[i] = 0;
        }
    }
 loop_end:
    // detach the temporary source argument
    LARG(lp,1).ptr = NULL;
}
1825
+
1826
+
1827
// ndloop callback that stores a (possibly nested) Ruby Array into an
// narray. Walks the destination dimensions; at each depth descends into
// the matching element of the Ruby Array (Qnil once the source is
// exhausted, so the iterator can decide how to fill the remainder).
// When an NArray is encountered inside the nesting, delegates the
// remaining dimensions to loop_store_subnarray.
static void
loop_store_rarray(ndfunc_t *nf, na_md_loop_t *lp)
{
    size_t *c;
    int i;
    VALUE *a;
    int nd = lp->ndim;

    // counter
    c = ALLOCA_N(size_t, nd+1);
    for (i=0; i<=nd; i++) c[i]=0;

    // a[i] = source object visible at nesting depth i; a[0] is the
    // top-level Ruby object being stored
    a = ALLOCA_N(VALUE, nd+1);
    a[0] = LARG(lp,1).value;

    // loop body
    for (i=0;;) {
        for (; i<nd; i++) {
            // advance the destination position (index vector or stride)
            if (LITER(lp,i,0).idx) {
                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).idx[c[i]];
            } else {
                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).step*c[i];
            }
            if (TYPE(a[i])==T_ARRAY) {
                // descend into the c[i]-th element, Qnil past the end
                if (c[i] < (size_t)RARRAY_LEN(a[i])) {
                    a[i+1] = RARRAY_AREF(a[i],c[i]);
                } else {
                    a[i+1] = Qnil;
                }
            } else if (IsNArray(a[i])) {
                // sub-narray source: hand the remaining dims over
                loop_store_subnarray(nf,lp,i,c,a[i]);
                goto loop_next;
            } else {
                // scalar source: consumed once (c[i]==0), then Qnil
                if (c[i]==0) {
                    a[i+1] = a[i];
                } else {
                    a[i+1] = Qnil;
                }
            }
        }

        // innermost level: store one element (or a trailing sub-narray)
        if (IsNArray(a[i])) {
            loop_store_subnarray(nf,lp,i,c,a[i]);
        } else {
            LARG(lp,1).value = a[i];
            (*(nf->func))(&(lp->user));
        }

    loop_next:
        // increment the innermost counter, carrying outward
        for (;;) {
            if (i<=0) goto loop_end;
            i--; c[i]++;
            if (c[i] < lp->n[i]) break;
            c[i] = 0;
        }
    }
 loop_end:
    ;
}
1892
+
1893
+ VALUE
1894
+ na_ndloop_store_rarray(ndfunc_t *nf, VALUE nary, VALUE rary)
1895
+ {
1896
+ na_md_loop_t lp;
1897
+ VALUE args;
1898
+
1899
+ //rb_p(args);
1900
+ if (na_debug_flag) print_ndfunc(nf);
1901
+
1902
+ args = rb_assoc_new(nary,rary);
1903
+
1904
+ // cast arguments to NArray
1905
+ //ndloop_cast_args(nf, args);
1906
+
1907
+ // allocate ndloop struct
1908
+ ndloop_alloc(&lp, nf, args, NULL, 0, loop_store_rarray);
1909
+
1910
+ return rb_ensure(ndloop_run, (VALUE)&lp, ndloop_release, (VALUE)&lp);
1911
+ }
1912
+
1913
+
1914
+ VALUE
1915
+ na_ndloop_store_rarray2(ndfunc_t *nf, VALUE nary, VALUE rary, VALUE opt)
1916
+ {
1917
+ na_md_loop_t lp;
1918
+ VALUE args;
1919
+
1920
+ //rb_p(args);
1921
+ if (na_debug_flag) print_ndfunc(nf);
1922
+
1923
+ //args = rb_assoc_new(rary,nary);
1924
+ args = rb_ary_new3(3,nary,rary,opt);
1925
+
1926
+ // cast arguments to NArray
1927
+ //ndloop_cast_args(nf, args);
1928
+
1929
+ // allocate ndloop struct
1930
+ ndloop_alloc(&lp, nf, args, NULL, 0, loop_store_rarray);
1931
+
1932
+ return rb_ensure(ndloop_run, (VALUE)&lp, ndloop_release, (VALUE)&lp);
1933
+ }
1934
+
1935
+
1936
+ //----------------------------------------------------------------------
1937
+
1938
// ndloop callback that converts an narray into a nested Ruby Array.
// On first entry to each dimension (c[i]==0) a fresh Ruby Array is
// created and pushed onto its parent; the iterator nf->func appends
// the formatted elements to the innermost array handed to it via
// LARG(lp,1).value.
static void
loop_narray_to_rarray(ndfunc_t *nf, na_md_loop_t *lp)
{
    size_t *c;
    int i;
    int nd = lp->ndim;
    VALUE *a;
    volatile VALUE a0;  // keeps the root container visible to the GC

    // alloc counter
    c = ALLOCA_N(size_t, nd+1);
    for (i=0; i<=nd; i++) c[i]=0;

    // a[i] = Ruby Array being filled at depth i; a[0] is the
    // caller-provided root container (lp->loop_opt)
    a = ALLOCA_N(VALUE, nd+1);
    a[0] = a0 = lp->loop_opt;

    // loop body
    for (i=0;;) {
        for (; i<nd; i++) {
            // advance the source position (index vector or stride)
            if (LITER(lp,i,0).idx) {
                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).idx[c[i]];
            } else {
                LITER(lp,i+1,0).pos = LITER(lp,i,0).pos + LITER(lp,i,0).step*c[i];
            }
            if (c[i]==0) {
                // entering this dimension: create and attach a new array
                a[i+1] = rb_ary_new2(lp->n[i]);
                rb_ary_push(a[i],a[i+1]);
            }
        }

        // let the iterator append the current element to a[i]
        LARG(lp,1).value = a[i];
        (*(nf->func))(&(lp->user));

        // increment counters with carry
        for (;;) {
            if (i<=0) goto loop_end;
            i--;
            if (++c[i] < lp->n[i]) break;
            c[i] = 0;
        }
    }
 loop_end:
    ;
}
1985
+
1986
+ VALUE
1987
+ na_ndloop_cast_narray_to_rarray(ndfunc_t *nf, VALUE nary, VALUE fmt)
1988
+ {
1989
+ na_md_loop_t lp;
1990
+ VALUE args, a0;
1991
+
1992
+ //rb_p(args);
1993
+ if (na_debug_flag) print_ndfunc(nf);
1994
+
1995
+ a0 = rb_ary_new();
1996
+ args = rb_ary_new3(3,nary,a0,fmt);
1997
+
1998
+ // cast arguments to NArray
1999
+ //ndloop_cast_args(nf, args);
2000
+
2001
+ // allocate ndloop struct
2002
+ ndloop_alloc(&lp, nf, args, NULL, 0, loop_narray_to_rarray);
2003
+
2004
+ rb_ensure(ndloop_run, (VALUE)&lp, ndloop_release, (VALUE)&lp);
2005
+ return RARRAY_AREF(a0,0);
2006
+ }
2007
+
2008
+
2009
+ //----------------------------------------------------------------------
2010
+
2011
// ndloop callback that runs the iterator while exposing the current
// multidimensional index: the counter array c[] is published through
// lp->user.opt_ptr so the user iterator can read the coordinates of
// the element it is processing.
static void
loop_narray_with_index(ndfunc_t *nf, na_md_loop_t *lp)
{
    size_t *c;
    int i,j;
    int nd = lp->ndim;

    if (nd < 0) {
        rb_bug("bug? lp->ndim = %d\n", lp->ndim);
    }
    if (lp->n[0] == 0) { // empty array
        return;
    }

    // pass total ndim to iterator
    lp->user.ndim += nd;

    // alloc counter; shared with the iterator via opt_ptr
    lp->user.opt_ptr = c = ALLOCA_N(size_t, nd+1);
    for (i=0; i<=nd; i++) c[i]=0;

    // loop body
    for (i=0;;) {
        for (; i<nd; i++) {
            // advance every argument's position in dimension i
            for (j=0; j<lp->narg; j++) {
                if (LITER(lp,i,j).idx) {
                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).idx[c[i]];
                } else {
                    LITER(lp,i+1,j).pos = LITER(lp,i,j).pos + LITER(lp,i,j).step*c[i];
                }
            }
        }

        (*(nf->func))(&(lp->user));

        // increment counters with carry
        for (;;) {
            if (i<=0) goto loop_end;
            i--;
            if (++c[i] < lp->n[i]) break;
            c[i] = 0;
        }
    }
 loop_end:
    ;
}
2058
+
2059
+
2060
+ VALUE
2061
+ #ifdef HAVE_STDARG_PROTOTYPES
2062
+ na_ndloop_with_index(ndfunc_t *nf, int argc, ...)
2063
+ #else
2064
+ na_ndloop_with_index(nf, argc, va_alist)
2065
+ ndfunc_t *nf;
2066
+ int argc;
2067
+ va_dcl
2068
+ #endif
2069
+ {
2070
+ va_list ar;
2071
+
2072
+ int i;
2073
+ VALUE *argv;
2074
+ volatile VALUE args;
2075
+ na_md_loop_t lp;
2076
+
2077
+ argv = ALLOCA_N(VALUE,argc);
2078
+
2079
+ va_init_list(ar, argc);
2080
+ for (i=0; i<argc; i++) {
2081
+ argv[i] = va_arg(ar, VALUE);
2082
+ }
2083
+ va_end(ar);
2084
+
2085
+ args = rb_ary_new4(argc, argv);
2086
+
2087
+ //return na_ndloop_main(nf, args, NULL);
2088
+ if (na_debug_flag) print_ndfunc(nf);
2089
+
2090
+ // cast arguments to NArray
2091
+ //copy_flag = ndloop_cast_args(nf, args);
2092
+
2093
+ // allocate ndloop struct
2094
+ ndloop_alloc(&lp, nf, args, 0, 0, loop_narray_with_index);
2095
+
2096
+ return rb_ensure(ndloop_run, (VALUE)&lp, ndloop_release, (VALUE)&lp);
2097
+ }
2098
+
2099
+
2100
+ void
2101
+ Init_cumo_nary_ndloop()
2102
+ {
2103
+ id_cast = rb_intern("cast");
2104
+ id_extract = rb_intern("extract");
2105
+ }