cumo 0.1.0 → 0.1.1

Files changed (158)
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/3rd_party/LICENSE.txt +60 -0
  4. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +13 -1
  5. data/LICENSE.txt +1 -62
  6. data/README.md +33 -29
  7. data/bench/cumo_bench.rb +47 -25
  8. data/bench/numo_bench.rb +27 -25
  9. data/docs/src-tree.md +16 -0
  10. data/ext/cumo/cuda/cublas.c +69 -219
  11. data/ext/cumo/cuda/memory_pool_impl.hpp +1 -0
  12. data/ext/cumo/cuda/runtime.c +2 -14
  13. data/ext/cumo/cumo.c +16 -16
  14. data/ext/cumo/include/cumo.h +2 -2
  15. data/ext/cumo/include/cumo/cuda/cublas.h +6 -129
  16. data/ext/cumo/include/cumo/cuda/runtime.h +16 -0
  17. data/ext/cumo/include/cumo/indexer.h +46 -63
  18. data/ext/cumo/include/cumo/intern.h +58 -112
  19. data/ext/cumo/include/cumo/narray.h +214 -185
  20. data/ext/cumo/include/cumo/narray_kernel.h +66 -37
  21. data/ext/cumo/include/cumo/ndloop.h +42 -42
  22. data/ext/cumo/include/cumo/reduce_kernel.h +55 -71
  23. data/ext/cumo/include/cumo/template.h +56 -51
  24. data/ext/cumo/include/cumo/template_kernel.h +31 -31
  25. data/ext/cumo/include/cumo/types/bit.h +3 -3
  26. data/ext/cumo/include/cumo/types/bit_kernel.h +2 -2
  27. data/ext/cumo/include/cumo/types/complex.h +126 -126
  28. data/ext/cumo/include/cumo/types/complex_kernel.h +126 -126
  29. data/ext/cumo/include/cumo/types/complex_macro.h +28 -28
  30. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +20 -20
  31. data/ext/cumo/include/cumo/types/dcomplex.h +5 -5
  32. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +1 -1
  33. data/ext/cumo/include/cumo/types/int_macro.h +1 -1
  34. data/ext/cumo/include/cumo/types/int_macro_kernel.h +1 -1
  35. data/ext/cumo/include/cumo/types/robj_macro.h +30 -30
  36. data/ext/cumo/include/cumo/types/scomplex.h +5 -5
  37. data/ext/cumo/include/cumo/types/scomplex_kernel.h +1 -1
  38. data/ext/cumo/narray/array.c +143 -143
  39. data/ext/cumo/narray/data.c +184 -184
  40. data/ext/cumo/narray/gen/cogen.rb +5 -2
  41. data/ext/cumo/narray/gen/cogen_kernel.rb +5 -2
  42. data/ext/cumo/narray/gen/def/dcomplex.rb +1 -1
  43. data/ext/cumo/narray/gen/def/scomplex.rb +1 -1
  44. data/ext/cumo/narray/gen/erbln.rb +132 -0
  45. data/ext/cumo/narray/gen/erbpp2.rb +18 -13
  46. data/ext/cumo/narray/gen/narray_def.rb +3 -3
  47. data/ext/cumo/narray/gen/spec.rb +2 -2
  48. data/ext/cumo/narray/gen/tmpl/accum.c +15 -15
  49. data/ext/cumo/narray/gen/tmpl/accum_binary.c +22 -22
  50. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +3 -3
  51. data/ext/cumo/narray/gen/tmpl/accum_index.c +30 -30
  52. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +2 -2
  53. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +3 -3
  54. data/ext/cumo/narray/gen/tmpl/alloc_func.c +14 -14
  55. data/ext/cumo/narray/gen/tmpl/allocate.c +11 -11
  56. data/ext/cumo/narray/gen/tmpl/aref.c +2 -2
  57. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +4 -4
  58. data/ext/cumo/narray/gen/tmpl/aset.c +2 -2
  59. data/ext/cumo/narray/gen/tmpl/binary.c +28 -28
  60. data/ext/cumo/narray/gen/tmpl/binary2.c +18 -18
  61. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +3 -3
  62. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +6 -6
  63. data/ext/cumo/narray/gen/tmpl/binary_s.c +13 -13
  64. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +3 -3
  65. data/ext/cumo/narray/gen/tmpl/bincount.c +23 -23
  66. data/ext/cumo/narray/gen/tmpl/cast.c +7 -7
  67. data/ext/cumo/narray/gen/tmpl/cast_array.c +3 -3
  68. data/ext/cumo/narray/gen/tmpl/clip.c +38 -38
  69. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +2 -2
  70. data/ext/cumo/narray/gen/tmpl/cond_binary.c +19 -19
  71. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +7 -7
  72. data/ext/cumo/narray/gen/tmpl/cond_unary.c +15 -15
  73. data/ext/cumo/narray/gen/tmpl/cum.c +15 -15
  74. data/ext/cumo/narray/gen/tmpl/each.c +9 -9
  75. data/ext/cumo/narray/gen/tmpl/each_with_index.c +9 -9
  76. data/ext/cumo/narray/gen/tmpl/ewcomp.c +15 -15
  77. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +3 -3
  78. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +5 -5
  79. data/ext/cumo/narray/gen/tmpl/extract_data.c +12 -12
  80. data/ext/cumo/narray/gen/tmpl/eye.c +9 -9
  81. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +3 -3
  82. data/ext/cumo/narray/gen/tmpl/fill.c +9 -9
  83. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +6 -6
  84. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +1 -1
  85. data/ext/cumo/narray/gen/tmpl/format.c +11 -11
  86. data/ext/cumo/narray/gen/tmpl/format_to_a.c +8 -8
  87. data/ext/cumo/narray/gen/tmpl/frexp.c +13 -13
  88. data/ext/cumo/narray/gen/tmpl/gemm.c +252 -108
  89. data/ext/cumo/narray/gen/tmpl/inspect.c +1 -1
  90. data/ext/cumo/narray/gen/tmpl/lib.c +2 -2
  91. data/ext/cumo/narray/gen/tmpl/logseq.c +7 -7
  92. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +6 -6
  93. data/ext/cumo/narray/gen/tmpl/map_with_index.c +17 -17
  94. data/ext/cumo/narray/gen/tmpl/median.c +10 -10
  95. data/ext/cumo/narray/gen/tmpl/minmax.c +10 -10
  96. data/ext/cumo/narray/gen/tmpl/new_dim0.c +3 -3
  97. data/ext/cumo/narray/gen/tmpl/poly.c +6 -6
  98. data/ext/cumo/narray/gen/tmpl/pow.c +28 -28
  99. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +6 -6
  100. data/ext/cumo/narray/gen/tmpl/rand.c +10 -10
  101. data/ext/cumo/narray/gen/tmpl/rand_norm.c +7 -7
  102. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +6 -6
  103. data/ext/cumo/narray/gen/tmpl/seq.c +7 -7
  104. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +6 -6
  105. data/ext/cumo/narray/gen/tmpl/set2.c +20 -20
  106. data/ext/cumo/narray/gen/tmpl/sort.c +11 -11
  107. data/ext/cumo/narray/gen/tmpl/sort_index.c +18 -18
  108. data/ext/cumo/narray/gen/tmpl/store.c +6 -6
  109. data/ext/cumo/narray/gen/tmpl/store_array.c +19 -19
  110. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +12 -12
  111. data/ext/cumo/narray/gen/tmpl/store_bit.c +23 -23
  112. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +28 -28
  113. data/ext/cumo/narray/gen/tmpl/store_from.c +16 -16
  114. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +12 -12
  115. data/ext/cumo/narray/gen/tmpl/to_a.c +10 -10
  116. data/ext/cumo/narray/gen/tmpl/unary.c +25 -25
  117. data/ext/cumo/narray/gen/tmpl/unary2.c +17 -17
  118. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +15 -15
  119. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +13 -13
  120. data/ext/cumo/narray/gen/tmpl/unary_s.c +17 -17
  121. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +12 -12
  122. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +9 -9
  123. data/ext/cumo/narray/gen/tmpl_bit/aref.c +2 -2
  124. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +5 -5
  125. data/ext/cumo/narray/gen/tmpl_bit/aset.c +2 -2
  126. data/ext/cumo/narray/gen/tmpl_bit/binary.c +29 -29
  127. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +14 -14
  128. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +21 -21
  129. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +28 -28
  130. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +29 -29
  131. data/ext/cumo/narray/gen/tmpl_bit/each.c +10 -10
  132. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +10 -10
  133. data/ext/cumo/narray/gen/tmpl_bit/extract.c +8 -8
  134. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +8 -8
  135. data/ext/cumo/narray/gen/tmpl_bit/fill.c +17 -17
  136. data/ext/cumo/narray/gen/tmpl_bit/format.c +14 -14
  137. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +11 -11
  138. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +3 -3
  139. data/ext/cumo/narray/gen/tmpl_bit/mask.c +33 -33
  140. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +19 -19
  141. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +22 -22
  142. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +18 -18
  143. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +12 -12
  144. data/ext/cumo/narray/gen/tmpl_bit/unary.c +24 -24
  145. data/ext/cumo/narray/gen/tmpl_bit/where.c +16 -16
  146. data/ext/cumo/narray/gen/tmpl_bit/where2.c +20 -20
  147. data/ext/cumo/narray/index.c +213 -213
  148. data/ext/cumo/narray/math.c +27 -27
  149. data/ext/cumo/narray/narray.c +484 -484
  150. data/ext/cumo/narray/ndloop.c +259 -258
  151. data/ext/cumo/narray/rand.c +3 -3
  152. data/ext/cumo/narray/step.c +70 -70
  153. data/ext/cumo/narray/struct.c +139 -139
  154. metadata +6 -7
  155. data/ext/cumo/include/cumo/intern_fwd.h +0 -38
  156. data/lib/erbpp.rb +0 -294
  157. data/lib/erbpp/line_number.rb +0 -137
  158. data/lib/erbpp/narray_def.rb +0 -381
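
The six diffs reproduced below (items 127-132 in the list above) are typical of the whole release: every public C identifier gains a cumo_/CUMO_ prefix (na_loop_t becomes cumo_na_loop_t, BIT_DIGIT becomes CUMO_BIT_DIGIT, INIT_COUNTER becomes CUMO_INIT_COUNTER, and so on), presumably so Cumo's extension API can coexist with Numo's in the same process. A minimal sketch of an iterator in the new naming; the function name is hypothetical, the cumo_*/CUMO_* names are taken from the diffs below:

/* Sketch only: a hypothetical element-wise iterator written against the
 * renamed 0.1.1 API (requires cumo's narray C headers). */
static void
my_iter(cumo_na_loop_t *const lp)     /* 0.1.0: na_loop_t */
{
    size_t i;
    char *p1;
    ssize_t s1;

    CUMO_INIT_COUNTER(lp, i);         /* 0.1.0: INIT_COUNTER */
    CUMO_INIT_PTR(lp, 0, p1, s1);     /* 0.1.0: INIT_PTR */
    for (; i--;) {
        /* element-wise work on *p1 */
        p1 += s1;
    }
}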
data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +14 -14

@@ -1,24 +1,24 @@
 #undef int_t
 #define int_t uint64_t
 
-void <%="cumo_#{c_iter}_index_kernel_launch"%>(size_t p1, char *p2, BIT_DIGIT *a1, size_t *idx1, uint64_t n);
-void <%="cumo_#{c_iter}_stride_kernel_launch"%>(size_t p1, char *p2, BIT_DIGIT *a1, ssize_t s1, uint64_t n);
-void <%="cumo_#{c_iter}_index_stride_kernel_launch"%>(size_t p1, char *p2, BIT_DIGIT *a1, size_t *idx1, ssize_t s2, uint64_t n);
-void <%="cumo_#{c_iter}_stride_stride_kernel_launch"%>(size_t p1, char *p2, BIT_DIGIT *a1, ssize_t s1, ssize_t s2, uint64_t n);
+void <%="cumo_#{c_iter}_index_kernel_launch"%>(size_t p1, char *p2, CUMO_BIT_DIGIT *a1, size_t *idx1, uint64_t n);
+void <%="cumo_#{c_iter}_stride_kernel_launch"%>(size_t p1, char *p2, CUMO_BIT_DIGIT *a1, ssize_t s1, uint64_t n);
+void <%="cumo_#{c_iter}_index_stride_kernel_launch"%>(size_t p1, char *p2, CUMO_BIT_DIGIT *a1, size_t *idx1, ssize_t s2, uint64_t n);
+void <%="cumo_#{c_iter}_stride_stride_kernel_launch"%>(size_t p1, char *p2, CUMO_BIT_DIGIT *a1, ssize_t s1, ssize_t s2, uint64_t n);
 
 static void
-<%=c_iter%>(na_loop_t *const lp)
+<%=c_iter%>(cumo_na_loop_t *const lp)
 {
     size_t i;
-    BIT_DIGIT *a1;
+    CUMO_BIT_DIGIT *a1;
     size_t p1;
     char *p2;
     ssize_t s1, s2;
     size_t *idx1;
 
-    INIT_COUNTER(lp, i);
-    INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
-    INIT_PTR(lp, 1, p2, s2);
+    CUMO_INIT_COUNTER(lp, i);
+    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
+    CUMO_INIT_PTR(lp, 1, p2, s2);
 
     if (s2==0) {
         if (idx1) {
@@ -53,12 +53,12 @@ static VALUE
         return <%=c_func(-1)%>_cpu(argc, argv, self);
     } else {
         VALUE v, reduce;
-        ndfunc_arg_in_t ain[3] = {{cT,0},{sym_reduce,0},{sym_init,0}};
-        ndfunc_arg_out_t aout[1] = {{cumo_cUInt64,0}};
-        ndfunc_t ndf = { <%=c_iter%>, FULL_LOOP_NIP, 3, 1, ain, aout };
+        cumo_ndfunc_arg_in_t ain[3] = {{cT,0},{cumo_sym_reduce,0},{cumo_sym_init,0}};
+        cumo_ndfunc_arg_out_t aout[1] = {{cumo_cUInt64,0}};
+        cumo_ndfunc_t ndf = { <%=c_iter%>, CUMO_FULL_LOOP_NIP, 3, 1, ain, aout };
 
-        reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
-        v = na_ndloop(&ndf, 3, self, reduce, INT2FIX(0));
+        reduce = cumo_na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
+        v = cumo_na_ndloop(&ndf, 3, self, reduce, INT2FIX(0));
         return v;
     }
 }
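
For readers who do not know Numo/Cumo's ndloop machinery, the registration block in the hunk above is where an operation describes itself to the loop driver. The same lines again, with editorial comments (the annotations are mine, not the gem's):

cumo_ndfunc_arg_in_t  ain[3]  = {{cT,0},{cumo_sym_reduce,0},{cumo_sym_init,0}};
    /* inputs: one ndarray of this type (cT), the reduce-axes argument,
       and the reduction's initial value */
cumo_ndfunc_arg_out_t aout[1] = {{cumo_cUInt64,0}};   /* output: a UInt64 */
cumo_ndfunc_t ndf = { <%=c_iter%>, CUMO_FULL_LOOP_NIP, 3, 1, ain, aout };
    /* iterator function, loop flags, 3 inputs, 1 output */

reduce = cumo_na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
    /* parse the caller's axis arguments into a reduce specification */
v = cumo_na_ndloop(&ndf, 3, self, reduce, INT2FIX(0));
    /* drive the iterator over the array; 0 is the initial count */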
data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +21 -21

@@ -2,28 +2,28 @@
 #define int_t int64_t
 
 static void
-<%=c_iter%>(na_loop_t *const lp)
+<%=c_iter%>(cumo_na_loop_t *const lp)
 {
     size_t i;
-    BIT_DIGIT *a1;
+    CUMO_BIT_DIGIT *a1;
     size_t p1;
     char *p2;
     ssize_t s1, s2;
     size_t *idx1;
-    BIT_DIGIT x=0;
+    CUMO_BIT_DIGIT x=0;
     int_t y;
 
-    SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
+    CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
     cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
 
-    INIT_COUNTER(lp, i);
-    INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
-    INIT_PTR(lp, 1, p2, s2);
+    CUMO_INIT_COUNTER(lp, i);
+    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
+    CUMO_INIT_PTR(lp, 1, p2, s2);
     if (s2==0) {
-        GET_DATA(p2, int_t, y);
+        CUMO_GET_DATA(p2, int_t, y);
         if (idx1) {
             for (; i--;) {
-                LOAD_BIT(a1, p1+*idx1, x);
+                CUMO_LOAD_BIT(a1, p1+*idx1, x);
                 idx1++;
                 if (m_<%=name%>(x)) {
                     y++;
@@ -31,7 +31,7 @@ static void
             }
         } else {
             for (; i--;) {
-                LOAD_BIT(a1, p1, x);
+                CUMO_LOAD_BIT(a1, p1, x);
                 p1 += s1;
                 if (m_<%=name%>(x)) {
                     y++;
@@ -42,23 +42,23 @@ static void
     } else {
         if (idx1) {
             for (; i--;) {
-                LOAD_BIT(a1, p1+*idx1, x);
+                CUMO_LOAD_BIT(a1, p1+*idx1, x);
                 idx1++;
                 if (m_<%=name%>(x)) {
-                    GET_DATA(p2, int_t, y);
+                    CUMO_GET_DATA(p2, int_t, y);
                     y++;
-                    SET_DATA(p2, int_t, y);
+                    CUMO_SET_DATA(p2, int_t, y);
                 }
                 p2+=s2;
             }
         } else {
             for (; i--;) {
-                LOAD_BIT(a1, p1, x);
+                CUMO_LOAD_BIT(a1, p1, x);
                 p1+=s1;
                 if (m_<%=name%>(x)) {
-                    GET_DATA(p2, int_t, y);
+                    CUMO_GET_DATA(p2, int_t, y);
                     y++;
-                    SET_DATA(p2, int_t, y);
+                    CUMO_SET_DATA(p2, int_t, y);
                 }
                 p2+=s2;
             }
@@ -78,11 +78,11 @@ static VALUE
 <%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
 {
     VALUE v, reduce;
-    ndfunc_arg_in_t ain[3] = {{cT,0},{sym_reduce,0},{sym_init,0}};
-    ndfunc_arg_out_t aout[1] = {{cumo_cInt64,0}};
-    ndfunc_t ndf = { <%=c_iter%>, FULL_LOOP_NIP, 3, 1, ain, aout };
+    cumo_ndfunc_arg_in_t ain[3] = {{cT,0},{cumo_sym_reduce,0},{cumo_sym_init,0}};
+    cumo_ndfunc_arg_out_t aout[1] = {{cumo_cInt64,0}};
+    cumo_ndfunc_t ndf = { <%=c_iter%>, CUMO_FULL_LOOP_NIP, 3, 1, ain, aout };
 
-    reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
-    v = na_ndloop(&ndf, 3, self, reduce, INT2FIX(0));
+    reduce = cumo_na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
+    v = cumo_na_ndloop(&ndf, 3, self, reduce, INT2FIX(0));
     return rb_funcall(v,rb_intern("extract_cpu"),0);
 }
data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +28 -28

@@ -1,76 +1,76 @@
 #undef int_t
 #define int_t unsigned long long int
 
-__global__ void <%="cumo_#{c_iter}_index_kernel"%>(size_t p1, char* p2, BIT_DIGIT *a1, size_t *idx1, uint64_t n)
+__global__ void <%="cumo_#{c_iter}_index_kernel"%>(size_t p1, char* p2, CUMO_BIT_DIGIT *a1, size_t *idx1, uint64_t n)
 {
     for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
-        BIT_DIGIT x=0;
-        LOAD_BIT(a1, p1 + idx1[i], x);
+        CUMO_BIT_DIGIT x=0;
+        CUMO_LOAD_BIT(a1, p1 + idx1[i], x);
         if (m_<%=name%>(x)) {
             atomicAdd((int_t*)p2, (int_t)1);
         }
     }
 }
 
-__global__ void <%="cumo_#{c_iter}_stride_kernel"%>(size_t p1, char* p2, BIT_DIGIT *a1, ssize_t s1, uint64_t n)
+__global__ void <%="cumo_#{c_iter}_stride_kernel"%>(size_t p1, char* p2, CUMO_BIT_DIGIT *a1, ssize_t s1, uint64_t n)
 {
     for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
-        BIT_DIGIT x=0;
-        LOAD_BIT(a1, p1 + i * s1, x);
+        CUMO_BIT_DIGIT x=0;
+        CUMO_LOAD_BIT(a1, p1 + i * s1, x);
         if (m_<%=name%>(x)) {
             atomicAdd((int_t*)p2, (int_t)1);
         }
     }
 }
 
-__global__ void <%="cumo_#{c_iter}_index_stride_kernel"%>(size_t p1, char* p2, BIT_DIGIT *a1, size_t *idx1, ssize_t s2, uint64_t n)
+__global__ void <%="cumo_#{c_iter}_index_stride_kernel"%>(size_t p1, char* p2, CUMO_BIT_DIGIT *a1, size_t *idx1, ssize_t s2, uint64_t n)
 {
     for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
-        BIT_DIGIT x=0;
-        LOAD_BIT(a1, p1 + idx1[i], x);
+        CUMO_BIT_DIGIT x=0;
+        CUMO_LOAD_BIT(a1, p1 + idx1[i], x);
         if (m_<%=name%>(x)) {
             atomicAdd((int_t*)(p2 + i * s2), (int_t)1);
         }
     }
 }
 
-__global__ void <%="cumo_#{c_iter}_stride_stride_kernel"%>(size_t p1, char* p2, BIT_DIGIT *a1, ssize_t s1, ssize_t s2, uint64_t n)
+__global__ void <%="cumo_#{c_iter}_stride_stride_kernel"%>(size_t p1, char* p2, CUMO_BIT_DIGIT *a1, ssize_t s1, ssize_t s2, uint64_t n)
 {
     for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
-        BIT_DIGIT x=0;
-        LOAD_BIT(a1, p1 + i * s1, x);
+        CUMO_BIT_DIGIT x=0;
+        CUMO_LOAD_BIT(a1, p1 + i * s1, x);
         if (m_<%=name%>(x)) {
             atomicAdd((int_t*)(p2 + i * s2), (int_t)1);
         }
     }
 }
 
-void <%="cumo_#{c_iter}_index_kernel_launch"%>(size_t p1, char *p2, BIT_DIGIT *a1, size_t *idx1, uint64_t n)
+void <%="cumo_#{c_iter}_index_kernel_launch"%>(size_t p1, char *p2, CUMO_BIT_DIGIT *a1, size_t *idx1, uint64_t n)
 {
-    size_t gridDim = get_gridDim(n);
-    size_t blockDim = get_blockDim(n);
-    <%="cumo_#{c_iter}_index_kernel"%><<<gridDim, blockDim>>>(p1,p2,a1,idx1,n);
+    size_t grid_dim = cumo_get_grid_dim(n);
+    size_t block_dim = cumo_get_block_dim(n);
+    <%="cumo_#{c_iter}_index_kernel"%><<<grid_dim, block_dim>>>(p1,p2,a1,idx1,n);
 }
 
-void <%="cumo_#{c_iter}_stride_kernel_launch"%>(size_t p1, char *p2, BIT_DIGIT *a1, ssize_t s1, uint64_t n)
+void <%="cumo_#{c_iter}_stride_kernel_launch"%>(size_t p1, char *p2, CUMO_BIT_DIGIT *a1, ssize_t s1, uint64_t n)
 {
-    size_t gridDim = get_gridDim(n);
-    size_t blockDim = get_blockDim(n);
-    <%="cumo_#{c_iter}_stride_kernel"%><<<gridDim, blockDim>>>(p1,p2,a1,s1,n);
+    size_t grid_dim = cumo_get_grid_dim(n);
+    size_t block_dim = cumo_get_block_dim(n);
+    <%="cumo_#{c_iter}_stride_kernel"%><<<grid_dim, block_dim>>>(p1,p2,a1,s1,n);
 }
 
-void <%="cumo_#{c_iter}_index_stride_kernel_launch"%>(size_t p1, char *p2, BIT_DIGIT *a1, size_t *idx1, ssize_t s2, uint64_t n)
+void <%="cumo_#{c_iter}_index_stride_kernel_launch"%>(size_t p1, char *p2, CUMO_BIT_DIGIT *a1, size_t *idx1, ssize_t s2, uint64_t n)
 {
-    size_t gridDim = get_gridDim(n);
-    size_t blockDim = get_blockDim(n);
-    <%="cumo_#{c_iter}_index_stride_kernel"%><<<gridDim, blockDim>>>(p1,p2,a1,idx1,s2,n);
+    size_t grid_dim = cumo_get_grid_dim(n);
+    size_t block_dim = cumo_get_block_dim(n);
+    <%="cumo_#{c_iter}_index_stride_kernel"%><<<grid_dim, block_dim>>>(p1,p2,a1,idx1,s2,n);
 }
 
-void <%="cumo_#{c_iter}_stride_stride_kernel_launch"%>(size_t p1, char *p2, BIT_DIGIT *a1, ssize_t s1, ssize_t s2, uint64_t n)
+void <%="cumo_#{c_iter}_stride_stride_kernel_launch"%>(size_t p1, char *p2, CUMO_BIT_DIGIT *a1, ssize_t s1, ssize_t s2, uint64_t n)
 {
-    size_t gridDim = get_gridDim(n);
-    size_t blockDim = get_blockDim(n);
-    <%="cumo_#{c_iter}_stride_stride_kernel"%><<<gridDim, blockDim>>>(p1,p2,a1,s1,s2,n);
+    size_t grid_dim = cumo_get_grid_dim(n);
+    size_t block_dim = cumo_get_block_dim(n);
+    <%="cumo_#{c_iter}_stride_stride_kernel"%><<<grid_dim, block_dim>>>(p1,p2,a1,s1,s2,n);
 }
 
 #undef int_t
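
All four kernels above use the grid-stride loop idiom: each thread starts at its global index and strides by the total number of launched threads, so correctness never depends on the grid covering n exactly; the launch helpers (renamed here from get_gridDim/get_blockDim to cumo_get_grid_dim/cumo_get_block_dim) only affect performance. A self-contained CUDA sketch of the same pattern, independent of the gem's ERB templates (kernel name and types are hypothetical):

#include <stdint.h>

/* Count nonzero bytes with a grid-stride loop plus a 64-bit atomicAdd,
 * the same shape as the bit_count kernels above. Not part of cumo. */
__global__ void count_nonzero(const uint8_t *a, unsigned long long *count,
                              uint64_t n)
{
    for (uint64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += blockDim.x * gridDim.x) {     /* stride = whole grid */
        if (a[i]) {
            atomicAdd(count, 1ULL);         /* contended but correct */
        }
    }
}

/* Any positive launch configuration works, e.g.:
 *   count_nonzero<<<grid_dim, block_dim>>>(d_a, d_count, n); */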
data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +29 -29

@@ -1,28 +1,28 @@
 static void
-<%=c_iter%>(na_loop_t *const lp)
+<%=c_iter%>(cumo_na_loop_t *const lp)
 {
     size_t i;
-    BIT_DIGIT *a1, *a2;
+    CUMO_BIT_DIGIT *a1, *a2;
     size_t p1, p2;
     ssize_t s1, s2;
     size_t *idx1, *idx2;
-    BIT_DIGIT x=0, y=0;
+    CUMO_BIT_DIGIT x=0, y=0;
 
     // TODO(sonots): CUDA kernelize
-    SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
+    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
     cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
 
-    INIT_COUNTER(lp, i);
-    INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
-    INIT_PTR_BIT_IDX(lp, 1, a2, p2, s2, idx2);
+    CUMO_INIT_COUNTER(lp, i);
+    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
+    CUMO_INIT_PTR_BIT_IDX(lp, 1, a2, p2, s2, idx2);
     if (idx2) {
         if (idx1) {
             for (; i--;) {
-                LOAD_BIT(a2, p2+*idx2, y);
+                CUMO_LOAD_BIT(a2, p2+*idx2, y);
                 if (y == <%=init_bit%>) {
-                    LOAD_BIT(a1, p1+*idx1, x);
+                    CUMO_LOAD_BIT(a1, p1+*idx1, x);
                     if (x != <%=init_bit%>) {
-                        STORE_BIT(a2, p2+*idx2, x);
+                        CUMO_STORE_BIT(a2, p2+*idx2, x);
                     }
                 }
                 idx1++;
@@ -30,11 +30,11 @@ static void
             }
         } else {
             for (; i--;) {
-                LOAD_BIT(a2, p2+*idx2, y);
+                CUMO_LOAD_BIT(a2, p2+*idx2, y);
                 if (y == <%=init_bit%>) {
-                    LOAD_BIT(a1, p1, x);
+                    CUMO_LOAD_BIT(a1, p1, x);
                     if (x != <%=init_bit%>) {
-                        STORE_BIT(a2, p2+*idx2, x);
+                        CUMO_STORE_BIT(a2, p2+*idx2, x);
                     }
                 }
                 p1 += s1;
@@ -44,11 +44,11 @@ static void
     } else if (s2) {
         if (idx1) {
             for (; i--;) {
-                LOAD_BIT(a2, p2, y);
+                CUMO_LOAD_BIT(a2, p2, y);
                 if (y == <%=init_bit%>) {
-                    LOAD_BIT(a1, p1+*idx1, x);
+                    CUMO_LOAD_BIT(a1, p1+*idx1, x);
                     if (x != <%=init_bit%>) {
-                        STORE_BIT(a2, p2, x);
+                        CUMO_STORE_BIT(a2, p2, x);
                     }
                 }
                 idx1++;
@@ -56,11 +56,11 @@ static void
             }
         } else {
            for (; i--;) {
-                LOAD_BIT(a2, p2, y);
+                CUMO_LOAD_BIT(a2, p2, y);
                 if (y == <%=init_bit%>) {
-                    LOAD_BIT(a1, p1, x);
+                    CUMO_LOAD_BIT(a1, p1, x);
                     if (x != <%=init_bit%>) {
-                        STORE_BIT(a2, p2, x);
+                        CUMO_STORE_BIT(a2, p2, x);
                     }
                 }
                 p1 += s1;
@@ -68,24 +68,24 @@ static void
            }
        }
    } else {
-        LOAD_BIT(a2, p2, x);
+        CUMO_LOAD_BIT(a2, p2, x);
        if (x != <%=init_bit%>) {
            return;
        }
        if (idx1) {
            for (; i--;) {
-                LOAD_BIT(a1, p1+*idx1, y);
+                CUMO_LOAD_BIT(a1, p1+*idx1, y);
                if (y != <%=init_bit%>) {
-                    STORE_BIT(a2, p2, y);
+                    CUMO_STORE_BIT(a2, p2, y);
                    return;
                }
                idx1++;
            }
        } else {
            for (; i--;) {
-                LOAD_BIT(a1, p1, y);
+                CUMO_LOAD_BIT(a1, p1, y);
                if (y != <%=init_bit%>) {
-                    STORE_BIT(a2, p2, y);
+                    CUMO_STORE_BIT(a2, p2, y);
                    return;
                }
                p1 += s1;
@@ -111,12 +111,12 @@ static VALUE
 <%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
 {
     VALUE v, reduce;
-    ndfunc_arg_in_t ain[3] = {{cT,0},{sym_reduce,0},{sym_init,0}};
-    ndfunc_arg_out_t aout[1] = {{cumo_cBit,0}};
-    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP_NIP, 3,1, ain,aout};
+    cumo_ndfunc_arg_in_t ain[3] = {{cT,0},{cumo_sym_reduce,0},{cumo_sym_init,0}};
+    cumo_ndfunc_arg_out_t aout[1] = {{cumo_cBit,0}};
+    cumo_ndfunc_t ndf = {<%=c_iter%>, CUMO_FULL_LOOP_NIP, 3,1, ain,aout};
 
-    reduce = na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
-    v = na_ndloop(&ndf, 3, self, reduce, INT2FIX(<%=init_bit%>));
+    reduce = cumo_na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
+    v = cumo_na_ndloop(&ndf, 3, self, reduce, INT2FIX(<%=init_bit%>));
     if (argc > 0) {
         return v;
     }
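
A reading note for the bit templates in this and the following hunks: Bit ndarrays pack elements into CUMO_BIT_DIGIT words, so p1/p2 here are bit offsets rather than byte pointers, and CUMO_LOAD_BIT/CUMO_STORE_BIT do the word-and-bit arithmetic. A rough sketch of what a LOAD_BIT-style macro computes, assuming a 32-bit digit (the real definitions live in cumo's headers):

#include <stddef.h>
#include <stdint.h>

typedef uint32_t bit_digit_t;  /* stand-in for CUMO_BIT_DIGIT; width assumed */
#define NB 32                  /* bits per digit under that assumption */

/* val = bit of a[] at bit position pos (0 or 1) */
#define LOAD_BIT_SKETCH(a, pos, val) do {              \
    size_t dig_ = (pos) / NB;   /* which word */       \
    int    bit_ = (pos) % NB;   /* which bit of it */  \
    (val) = ((a)[dig_] >> bit_) & 1u;                  \
} while (0)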
data/ext/cumo/narray/gen/tmpl_bit/each.c +10 -10

@@ -1,28 +1,28 @@
 static void
-<%=c_iter%>(na_loop_t *const lp)
+<%=c_iter%>(cumo_na_loop_t *const lp)
 {
     size_t i;
-    BIT_DIGIT *a1, x=0;
+    CUMO_BIT_DIGIT *a1, x=0;
     size_t p1;
     ssize_t s1;
     size_t *idx1;
     VALUE y;
 
-    INIT_COUNTER(lp, i);
-    INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
+    CUMO_INIT_COUNTER(lp, i);
+    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
 
-    SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
+    CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
     cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
 
     if (idx1) {
         for (; i--;) {
-            LOAD_BIT(a1, p1+*idx1, x); idx1++;
+            CUMO_LOAD_BIT(a1, p1+*idx1, x); idx1++;
             y = m_data_to_num(x);
             rb_yield(y);
         }
     } else {
         for (; i--;) {
-            LOAD_BIT(a1, p1, x); p1+=s1;
+            CUMO_LOAD_BIT(a1, p1, x); p1+=s1;
             y = m_data_to_num(x);
             rb_yield(y);
         }
@@ -40,9 +40,9 @@ static void
 static VALUE
 <%=c_func(0)%>(VALUE self)
 {
-    ndfunc_arg_in_t ain[1] = {{Qnil,0}};
-    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP_NIP, 1,0, ain,0};
+    cumo_ndfunc_arg_in_t ain[1] = {{Qnil,0}};
+    cumo_ndfunc_t ndf = {<%=c_iter%>, CUMO_FULL_LOOP_NIP, 1,0, ain,0};
 
-    na_ndloop(&ndf, 1, self);
+    cumo_na_ndloop(&ndf, 1, self);
     return self;
 }
data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +10 -10

@@ -12,10 +12,10 @@ yield_each_with_index(dtype x, size_t *c, VALUE *a, int nd, int md)
 
 
 static void
-<%=c_iter%>(na_loop_t *const lp)
+<%=c_iter%>(cumo_na_loop_t *const lp)
 {
     size_t i;
-    BIT_DIGIT *a1, x=0;
+    CUMO_BIT_DIGIT *a1, x=0;
     size_t p1;
     ssize_t s1;
     size_t *idx1;
@@ -29,22 +29,22 @@ static void
     md = lp->ndim + 1;
     a = ALLOCA_N(VALUE,md);
 
-    INIT_COUNTER(lp, i);
-    INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
+    CUMO_INIT_COUNTER(lp, i);
+    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
     c[nd] = 0;
 
-    SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
+    CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
     cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
 
     if (idx1) {
         for (; i--;) {
-            LOAD_BIT(a1, p1+*idx1, x); idx1++;
+            CUMO_LOAD_BIT(a1, p1+*idx1, x); idx1++;
             yield_each_with_index(x,c,a,nd,md);
             c[nd]++;
         }
     } else {
         for (; i--;) {
-            LOAD_BIT(a1, p1, x); p1+=s1;
+            CUMO_LOAD_BIT(a1, p1, x); p1+=s1;
             yield_each_with_index(x,c,a,nd,md);
             c[nd]++;
         }
@@ -62,9 +62,9 @@ static void
 static VALUE
 <%=c_func(0)%>(VALUE self)
 {
-    ndfunc_arg_in_t ain[1] = {{Qnil,0}};
-    ndfunc_t ndf = {<%=c_iter%>, FULL_LOOP_NIP, 1,0, ain,0};
+    cumo_ndfunc_arg_in_t ain[1] = {{Qnil,0}};
+    cumo_ndfunc_t ndf = {<%=c_iter%>, CUMO_FULL_LOOP_NIP, 1,0, ain,0};
 
-    na_ndloop_with_index(&ndf, 1, self);
+    cumo_na_ndloop_with_index(&ndf, 1, self);
     return self;
 }