cumo 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (158) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/3rd_party/LICENSE.txt +60 -0
  4. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +13 -1
  5. data/LICENSE.txt +1 -62
  6. data/README.md +33 -29
  7. data/bench/cumo_bench.rb +47 -25
  8. data/bench/numo_bench.rb +27 -25
  9. data/docs/src-tree.md +16 -0
  10. data/ext/cumo/cuda/cublas.c +69 -219
  11. data/ext/cumo/cuda/memory_pool_impl.hpp +1 -0
  12. data/ext/cumo/cuda/runtime.c +2 -14
  13. data/ext/cumo/cumo.c +16 -16
  14. data/ext/cumo/include/cumo.h +2 -2
  15. data/ext/cumo/include/cumo/cuda/cublas.h +6 -129
  16. data/ext/cumo/include/cumo/cuda/runtime.h +16 -0
  17. data/ext/cumo/include/cumo/indexer.h +46 -63
  18. data/ext/cumo/include/cumo/intern.h +58 -112
  19. data/ext/cumo/include/cumo/narray.h +214 -185
  20. data/ext/cumo/include/cumo/narray_kernel.h +66 -37
  21. data/ext/cumo/include/cumo/ndloop.h +42 -42
  22. data/ext/cumo/include/cumo/reduce_kernel.h +55 -71
  23. data/ext/cumo/include/cumo/template.h +56 -51
  24. data/ext/cumo/include/cumo/template_kernel.h +31 -31
  25. data/ext/cumo/include/cumo/types/bit.h +3 -3
  26. data/ext/cumo/include/cumo/types/bit_kernel.h +2 -2
  27. data/ext/cumo/include/cumo/types/complex.h +126 -126
  28. data/ext/cumo/include/cumo/types/complex_kernel.h +126 -126
  29. data/ext/cumo/include/cumo/types/complex_macro.h +28 -28
  30. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +20 -20
  31. data/ext/cumo/include/cumo/types/dcomplex.h +5 -5
  32. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +1 -1
  33. data/ext/cumo/include/cumo/types/int_macro.h +1 -1
  34. data/ext/cumo/include/cumo/types/int_macro_kernel.h +1 -1
  35. data/ext/cumo/include/cumo/types/robj_macro.h +30 -30
  36. data/ext/cumo/include/cumo/types/scomplex.h +5 -5
  37. data/ext/cumo/include/cumo/types/scomplex_kernel.h +1 -1
  38. data/ext/cumo/narray/array.c +143 -143
  39. data/ext/cumo/narray/data.c +184 -184
  40. data/ext/cumo/narray/gen/cogen.rb +5 -2
  41. data/ext/cumo/narray/gen/cogen_kernel.rb +5 -2
  42. data/ext/cumo/narray/gen/def/dcomplex.rb +1 -1
  43. data/ext/cumo/narray/gen/def/scomplex.rb +1 -1
  44. data/ext/cumo/narray/gen/erbln.rb +132 -0
  45. data/ext/cumo/narray/gen/erbpp2.rb +18 -13
  46. data/ext/cumo/narray/gen/narray_def.rb +3 -3
  47. data/ext/cumo/narray/gen/spec.rb +2 -2
  48. data/ext/cumo/narray/gen/tmpl/accum.c +15 -15
  49. data/ext/cumo/narray/gen/tmpl/accum_binary.c +22 -22
  50. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +3 -3
  51. data/ext/cumo/narray/gen/tmpl/accum_index.c +30 -30
  52. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +2 -2
  53. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +3 -3
  54. data/ext/cumo/narray/gen/tmpl/alloc_func.c +14 -14
  55. data/ext/cumo/narray/gen/tmpl/allocate.c +11 -11
  56. data/ext/cumo/narray/gen/tmpl/aref.c +2 -2
  57. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +4 -4
  58. data/ext/cumo/narray/gen/tmpl/aset.c +2 -2
  59. data/ext/cumo/narray/gen/tmpl/binary.c +28 -28
  60. data/ext/cumo/narray/gen/tmpl/binary2.c +18 -18
  61. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +3 -3
  62. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +6 -6
  63. data/ext/cumo/narray/gen/tmpl/binary_s.c +13 -13
  64. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +3 -3
  65. data/ext/cumo/narray/gen/tmpl/bincount.c +23 -23
  66. data/ext/cumo/narray/gen/tmpl/cast.c +7 -7
  67. data/ext/cumo/narray/gen/tmpl/cast_array.c +3 -3
  68. data/ext/cumo/narray/gen/tmpl/clip.c +38 -38
  69. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +2 -2
  70. data/ext/cumo/narray/gen/tmpl/cond_binary.c +19 -19
  71. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +7 -7
  72. data/ext/cumo/narray/gen/tmpl/cond_unary.c +15 -15
  73. data/ext/cumo/narray/gen/tmpl/cum.c +15 -15
  74. data/ext/cumo/narray/gen/tmpl/each.c +9 -9
  75. data/ext/cumo/narray/gen/tmpl/each_with_index.c +9 -9
  76. data/ext/cumo/narray/gen/tmpl/ewcomp.c +15 -15
  77. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +3 -3
  78. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +5 -5
  79. data/ext/cumo/narray/gen/tmpl/extract_data.c +12 -12
  80. data/ext/cumo/narray/gen/tmpl/eye.c +9 -9
  81. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +3 -3
  82. data/ext/cumo/narray/gen/tmpl/fill.c +9 -9
  83. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +6 -6
  84. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +1 -1
  85. data/ext/cumo/narray/gen/tmpl/format.c +11 -11
  86. data/ext/cumo/narray/gen/tmpl/format_to_a.c +8 -8
  87. data/ext/cumo/narray/gen/tmpl/frexp.c +13 -13
  88. data/ext/cumo/narray/gen/tmpl/gemm.c +252 -108
  89. data/ext/cumo/narray/gen/tmpl/inspect.c +1 -1
  90. data/ext/cumo/narray/gen/tmpl/lib.c +2 -2
  91. data/ext/cumo/narray/gen/tmpl/logseq.c +7 -7
  92. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +6 -6
  93. data/ext/cumo/narray/gen/tmpl/map_with_index.c +17 -17
  94. data/ext/cumo/narray/gen/tmpl/median.c +10 -10
  95. data/ext/cumo/narray/gen/tmpl/minmax.c +10 -10
  96. data/ext/cumo/narray/gen/tmpl/new_dim0.c +3 -3
  97. data/ext/cumo/narray/gen/tmpl/poly.c +6 -6
  98. data/ext/cumo/narray/gen/tmpl/pow.c +28 -28
  99. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +6 -6
  100. data/ext/cumo/narray/gen/tmpl/rand.c +10 -10
  101. data/ext/cumo/narray/gen/tmpl/rand_norm.c +7 -7
  102. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +6 -6
  103. data/ext/cumo/narray/gen/tmpl/seq.c +7 -7
  104. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +6 -6
  105. data/ext/cumo/narray/gen/tmpl/set2.c +20 -20
  106. data/ext/cumo/narray/gen/tmpl/sort.c +11 -11
  107. data/ext/cumo/narray/gen/tmpl/sort_index.c +18 -18
  108. data/ext/cumo/narray/gen/tmpl/store.c +6 -6
  109. data/ext/cumo/narray/gen/tmpl/store_array.c +19 -19
  110. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +12 -12
  111. data/ext/cumo/narray/gen/tmpl/store_bit.c +23 -23
  112. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +28 -28
  113. data/ext/cumo/narray/gen/tmpl/store_from.c +16 -16
  114. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +12 -12
  115. data/ext/cumo/narray/gen/tmpl/to_a.c +10 -10
  116. data/ext/cumo/narray/gen/tmpl/unary.c +25 -25
  117. data/ext/cumo/narray/gen/tmpl/unary2.c +17 -17
  118. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +15 -15
  119. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +13 -13
  120. data/ext/cumo/narray/gen/tmpl/unary_s.c +17 -17
  121. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +12 -12
  122. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +9 -9
  123. data/ext/cumo/narray/gen/tmpl_bit/aref.c +2 -2
  124. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +5 -5
  125. data/ext/cumo/narray/gen/tmpl_bit/aset.c +2 -2
  126. data/ext/cumo/narray/gen/tmpl_bit/binary.c +29 -29
  127. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +14 -14
  128. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +21 -21
  129. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +28 -28
  130. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +29 -29
  131. data/ext/cumo/narray/gen/tmpl_bit/each.c +10 -10
  132. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +10 -10
  133. data/ext/cumo/narray/gen/tmpl_bit/extract.c +8 -8
  134. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +8 -8
  135. data/ext/cumo/narray/gen/tmpl_bit/fill.c +17 -17
  136. data/ext/cumo/narray/gen/tmpl_bit/format.c +14 -14
  137. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +11 -11
  138. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +3 -3
  139. data/ext/cumo/narray/gen/tmpl_bit/mask.c +33 -33
  140. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +19 -19
  141. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +22 -22
  142. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +18 -18
  143. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +12 -12
  144. data/ext/cumo/narray/gen/tmpl_bit/unary.c +24 -24
  145. data/ext/cumo/narray/gen/tmpl_bit/where.c +16 -16
  146. data/ext/cumo/narray/gen/tmpl_bit/where2.c +20 -20
  147. data/ext/cumo/narray/index.c +213 -213
  148. data/ext/cumo/narray/math.c +27 -27
  149. data/ext/cumo/narray/narray.c +484 -484
  150. data/ext/cumo/narray/ndloop.c +259 -258
  151. data/ext/cumo/narray/rand.c +3 -3
  152. data/ext/cumo/narray/step.c +70 -70
  153. data/ext/cumo/narray/struct.c +139 -139
  154. metadata +6 -7
  155. data/ext/cumo/include/cumo/intern_fwd.h +0 -38
  156. data/lib/erbpp.rb +0 -294
  157. data/lib/erbpp/line_number.rb +0 -137
  158. data/lib/erbpp/narray_def.rb +0 -381
@@ -5,7 +5,7 @@ void <%="cumo_#{c_iter}_stride_kernel_launch"%>(char *ptr, ssize_t s0, ssize_t s
5
5
  #include <cuda_runtime.h>
6
6
 
7
7
  static void
8
- <%=c_iter%>(na_loop_t *const lp)
8
+ <%=c_iter%>(cumo_na_loop_t *const lp)
9
9
  {
10
10
  size_t n0, n1;
11
11
  ssize_t s0, s1;
@@ -22,13 +22,13 @@ static void
22
22
  n1 = lp->args[0].shape[1];
23
23
  s0 = lp->args[0].iter[0].step;
24
24
  s1 = lp->args[0].iter[1].step;
25
- p0 = NDL_PTR(lp,0);
25
+ p0 = CUMO_NDL_PTR(lp,0);
26
26
 
27
27
  <% if type_name == 'robject' %>
28
28
  {
29
29
  size_t i0, i1;
30
30
  char *p1;
31
- SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
31
+ CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
32
32
  for (i0=0; i0 < n0; i0++) {
33
33
  p1 = p0;
34
34
  for (i1=0; i1 < n1; i1++) {
@@ -55,13 +55,13 @@ static void
55
55
  static VALUE
56
56
  <%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
57
57
  {
58
- ndfunc_arg_in_t ain[1] = {{OVERWRITE,2}};
59
- ndfunc_t ndf = {<%=c_iter%>, NO_LOOP, 1,0, ain,0};
58
+ cumo_ndfunc_arg_in_t ain[1] = {{CUMO_OVERWRITE,2}};
59
+ cumo_ndfunc_t ndf = {<%=c_iter%>, CUMO_NO_LOOP, 1,0, ain,0};
60
60
  ssize_t kofs;
61
61
  dtype data;
62
62
  char *g;
63
63
  int nd;
64
- narray_t *na;
64
+ cumo_narray_t *na;
65
65
 
66
66
  // check arguments
67
67
  if (argc > 2) {
@@ -77,10 +77,10 @@ static VALUE
77
77
  kofs = 0;
78
78
  }
79
79
 
80
- GetNArray(self,na);
80
+ CumoGetNArray(self,na);
81
81
  nd = na->ndim;
82
82
  if (nd < 2) {
83
- rb_raise(nary_eDimensionError,"less than 2-d array");
83
+ rb_raise(cumo_na_eDimensionError,"less than 2-d array");
84
84
  }
85
85
 
86
86
  // Diagonal offset from the main diagonal.
@@ -100,6 +100,6 @@ static VALUE
100
100
  *(ssize_t*)g = kofs;
101
101
  *(dtype*)(g+sizeof(ssize_t)) = data;
102
102
 
103
- na_ndloop3(&ndf, g, 1, self);
103
+ cumo_na_ndloop3(&ndf, g, 1, self);
104
104
  return self;
105
105
  }
@@ -11,9 +11,9 @@ __global__ void <%="cumo_#{c_iter}_stride_kernel"%>(char*ptr, ssize_t s0, ssize_
11
11
  void <%="cumo_#{c_iter}_stride_kernel_launch"%>(char *ptr, ssize_t s0, ssize_t s1, ssize_t kofs, dtype data, uint64_t n0, uint64_t n1)
12
12
  {
13
13
  uint64_t n = n0 * n1;
14
- size_t gridDim = get_gridDim(n);
15
- size_t blockDim = get_blockDim(n);
16
- <%="cumo_#{c_iter}_stride_kernel"%><<<gridDim, blockDim>>>(ptr,s0,s1,kofs,data,n0,n1,n);
14
+ size_t grid_dim = cumo_get_grid_dim(n);
15
+ size_t block_dim = cumo_get_block_dim(n);
16
+ <%="cumo_#{c_iter}_stride_kernel"%><<<grid_dim, block_dim>>>(ptr,s0,s1,kofs,data,n0,n1,n);
17
17
  }
18
18
  <% end %>
19
19
 
@@ -4,7 +4,7 @@ void <%="cumo_#{c_iter}_stride_kernel_launch"%>(char *ptr, ssize_t step, dtype v
4
4
  <% end %>
5
5
 
6
6
  static void
7
- <%=c_iter%>(na_loop_t *const lp)
7
+ <%=c_iter%>(cumo_na_loop_t *const lp)
8
8
  {
9
9
  size_t i;
10
10
  char *p1;
@@ -12,18 +12,18 @@ static void
12
12
  size_t *idx1;
13
13
  VALUE x = lp->option;
14
14
  dtype y;
15
- INIT_COUNTER(lp, i);
16
- INIT_PTR_IDX(lp, 0, p1, s1, idx1);
15
+ CUMO_INIT_COUNTER(lp, i);
16
+ CUMO_INIT_PTR_IDX(lp, 0, p1, s1, idx1);
17
17
  y = m_num_to_data(x);
18
18
  <% if type_name == 'robject' %>
19
- SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
19
+ CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
20
20
  if (idx1) {
21
21
  for (; i--;) {
22
- SET_DATA_INDEX(p1,idx1,dtype,y);
22
+ CUMO_SET_DATA_INDEX(p1,idx1,dtype,y);
23
23
  }
24
24
  } else {
25
25
  for (; i--;) {
26
- SET_DATA_STRIDE(p1,s1,dtype,y);
26
+ CUMO_SET_DATA_STRIDE(p1,s1,dtype,y);
27
27
  }
28
28
  }
29
29
  <% else %>
@@ -44,9 +44,9 @@ static void
44
44
  static VALUE
45
45
  <%=c_func(1)%>(VALUE self, VALUE val)
46
46
  {
47
- ndfunc_arg_in_t ain[2] = {{OVERWRITE,0},{sym_option}};
48
- ndfunc_t ndf = { <%=c_iter%>, FULL_LOOP, 2, 0, ain, 0 };
47
+ cumo_ndfunc_arg_in_t ain[2] = {{CUMO_OVERWRITE,0},{cumo_sym_option}};
48
+ cumo_ndfunc_t ndf = { <%=c_iter%>, CUMO_FULL_LOOP, 2, 0, ain, 0 };
49
49
 
50
- na_ndloop(&ndf, 2, self, val);
50
+ cumo_na_ndloop(&ndf, 2, self, val);
51
51
  return self;
52
52
  }
@@ -15,15 +15,15 @@ __global__ void <%="cumo_#{c_iter}_stride_kernel"%>(char*ptr, ssize_t step, dtyp
15
15
 
16
16
  void <%="cumo_#{c_iter}_index_kernel_launch"%>(char *ptr, size_t *idx, dtype val, uint64_t n)
17
17
  {
18
- size_t gridDim = get_gridDim(n);
19
- size_t blockDim = get_blockDim(n);
20
- <%="cumo_#{c_iter}_index_kernel"%><<<gridDim, blockDim>>>(ptr,idx,val,n);
18
+ size_t grid_dim = cumo_get_grid_dim(n);
19
+ size_t block_dim = cumo_get_block_dim(n);
20
+ <%="cumo_#{c_iter}_index_kernel"%><<<grid_dim, block_dim>>>(ptr,idx,val,n);
21
21
  }
22
22
 
23
23
  void <%="cumo_#{c_iter}_stride_kernel_launch"%>(char *ptr, ssize_t step, dtype val, uint64_t n)
24
24
  {
25
- size_t gridDim = get_gridDim(n);
26
- size_t blockDim = get_blockDim(n);
27
- <%="cumo_#{c_iter}_stride_kernel"%><<<gridDim, blockDim>>>(ptr,step,val,n);
25
+ size_t grid_dim = cumo_get_grid_dim(n);
26
+ size_t block_dim = cumo_get_block_dim(n);
27
+ <%="cumo_#{c_iter}_stride_kernel"%><<<grid_dim, block_dim>>>(ptr,step,val,n);
28
28
  }
29
29
  <% end %>
@@ -1,4 +1,4 @@
1
- <% f = File.join(File.dirname(__FILE__), 'real_accum_kernel.cu'); ERB.new(File.read(f)).tap {|erb| erb.filename = f }.result(binding) %>
1
+ <%= load_erb('real_accum').result(binding) %>
2
2
 
3
3
  #if defined(__cplusplus)
4
4
  #if 0
@@ -13,7 +13,7 @@ format_<%=type_name%>(VALUE fmt, dtype* x)
13
13
  }
14
14
 
15
15
  static void
16
- <%=c_iter%>(na_loop_t *const lp)
16
+ <%=c_iter%>(cumo_na_loop_t *const lp)
17
17
  {
18
18
  size_t i;
19
19
  char *p1, *p2;
@@ -22,21 +22,21 @@ static void
22
22
  dtype *x;
23
23
  VALUE y;
24
24
  VALUE fmt = lp->option;
25
- INIT_COUNTER(lp, i);
26
- INIT_PTR_IDX(lp, 0, p1, s1, idx1);
27
- INIT_PTR(lp, 1, p2, s2);
28
- //SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
25
+ CUMO_INIT_COUNTER(lp, i);
26
+ CUMO_INIT_PTR_IDX(lp, 0, p1, s1, idx1);
27
+ CUMO_INIT_PTR(lp, 1, p2, s2);
28
+ //CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
29
29
  if (idx1) {
30
30
  for (; i--;) {
31
31
  x = (dtype*)(p1+*idx1); idx1++;
32
32
  y = format_<%=type_name%>(fmt, x);
33
- SET_DATA_STRIDE(p2, s2, VALUE, y);
33
+ CUMO_SET_DATA_STRIDE(p2, s2, VALUE, y);
34
34
  }
35
35
  } else {
36
36
  for (; i--;) {
37
37
  x = (dtype*)p1; p1+=s1;
38
38
  y = format_<%=type_name%>(fmt, x);
39
- SET_DATA_STRIDE(p2, s2, VALUE, y);
39
+ CUMO_SET_DATA_STRIDE(p2, s2, VALUE, y);
40
40
  }
41
41
  }
42
42
  }
@@ -52,11 +52,11 @@ static VALUE
52
52
  {
53
53
  VALUE fmt=Qnil;
54
54
 
55
- ndfunc_arg_in_t ain[2] = {{Qnil,0},{sym_option}};
56
- ndfunc_arg_out_t aout[1] = {{cumo_cRObject,0}};
57
- ndfunc_t ndf = { <%=c_iter%>, FULL_LOOP_NIP, 2, 1, ain, aout };
55
+ cumo_ndfunc_arg_in_t ain[2] = {{Qnil,0},{cumo_sym_option}};
56
+ cumo_ndfunc_arg_out_t aout[1] = {{cumo_cRObject,0}};
57
+ cumo_ndfunc_t ndf = { <%=c_iter%>, CUMO_FULL_LOOP_NIP, 2, 1, ain, aout };
58
58
 
59
59
  rb_scan_args(argc, argv, "01", &fmt);
60
60
  cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
61
- return na_ndloop(&ndf, 2, self, fmt);
61
+ return cumo_na_ndloop(&ndf, 2, self, fmt);
62
62
  }
@@ -1,5 +1,5 @@
1
1
  static void
2
- <%=c_iter%>(na_loop_t *const lp)
2
+ <%=c_iter%>(cumo_na_loop_t *const lp)
3
3
  {
4
4
  size_t i;
5
5
  char *p1;
@@ -9,11 +9,11 @@ static void
9
9
  VALUE y;
10
10
  volatile VALUE a;
11
11
  VALUE fmt = lp->option;
12
- INIT_COUNTER(lp, i);
13
- INIT_PTR_IDX(lp, 0, p1, s1, idx1);
12
+ CUMO_INIT_COUNTER(lp, i);
13
+ CUMO_INIT_PTR_IDX(lp, 0, p1, s1, idx1);
14
14
  a = rb_ary_new2(i);
15
15
  rb_ary_push(lp->args[1].value, a);
16
- //SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
16
+ //CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
17
17
  if (idx1) {
18
18
  for (; i--;) {
19
19
  x = (dtype*)(p1 + *idx1); idx1++;
@@ -39,11 +39,11 @@ static VALUE
39
39
  <%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
40
40
  {
41
41
  VALUE fmt=Qnil;
42
- ndfunc_arg_in_t ain[3] = {{Qnil,0},{sym_loop_opt},{sym_option}};
43
- ndfunc_arg_out_t aout[1] = {{rb_cArray,0}}; // dummy?
44
- ndfunc_t ndf = { <%=c_iter%>, FULL_LOOP_NIP, 3, 1, ain, aout };
42
+ cumo_ndfunc_arg_in_t ain[3] = {{Qnil,0},{cumo_sym_loop_opt},{cumo_sym_option}};
43
+ cumo_ndfunc_arg_out_t aout[1] = {{rb_cArray,0}}; // dummy?
44
+ cumo_ndfunc_t ndf = { <%=c_iter%>, CUMO_FULL_LOOP_NIP, 3, 1, ain, aout };
45
45
 
46
46
  rb_scan_args(argc, argv, "01", &fmt);
47
47
  cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
48
- return na_ndloop_cast_narray_to_rarray(&ndf, self, fmt);
48
+ return cumo_na_ndloop_cast_narray_to_rarray(&ndf, self, fmt);
49
49
  }
@@ -1,21 +1,21 @@
1
1
  static void
2
- <%=c_iter%>(na_loop_t *const lp)
2
+ <%=c_iter%>(cumo_na_loop_t *const lp)
3
3
  {
4
4
  size_t i;
5
5
  char *p1, *p2, *p3;
6
6
  ssize_t s1, s2, s3;
7
7
  dtype x;
8
8
  int y;
9
- INIT_COUNTER(lp, i);
10
- INIT_PTR(lp, 0, p1, s1);
11
- INIT_PTR(lp, 1, p2, s2);
12
- INIT_PTR(lp, 2, p3, s3);
13
- SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
9
+ CUMO_INIT_COUNTER(lp, i);
10
+ CUMO_INIT_PTR(lp, 0, p1, s1);
11
+ CUMO_INIT_PTR(lp, 1, p2, s2);
12
+ CUMO_INIT_PTR(lp, 2, p3, s3);
13
+ CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
14
14
  for (; i--;) {
15
- GET_DATA_STRIDE(p1,s1,dtype,x);
15
+ CUMO_GET_DATA_STRIDE(p1,s1,dtype,x);
16
16
  x = m_<%=name%>(x,&y);
17
- SET_DATA_STRIDE(p2,s2,dtype,x);
18
- SET_DATA_STRIDE(p3,s3,int32_t,y);
17
+ CUMO_SET_DATA_STRIDE(p2,s2,dtype,x);
18
+ CUMO_SET_DATA_STRIDE(p3,s3,int32_t,y);
19
19
  }
20
20
  }
21
21
 
@@ -31,8 +31,8 @@ static void
31
31
  static VALUE
32
32
  <%=c_func(1)%>(VALUE mod, VALUE a1)
33
33
  {
34
- ndfunc_arg_in_t ain[1] = {{cT,0}};
35
- ndfunc_arg_out_t aout[2] = {{cT,0},{cumo_cInt32,0}};
36
- ndfunc_t ndf = { <%=c_iter%>, STRIDE_LOOP, 1,2, ain,aout };
37
- return na_ndloop(&ndf, 1, a1);
34
+ cumo_ndfunc_arg_in_t ain[1] = {{cT,0}};
35
+ cumo_ndfunc_arg_out_t aout[2] = {{cT,0},{cumo_cInt32,0}};
36
+ cumo_ndfunc_t ndf = { <%=c_iter%>, CUMO_STRIDE_LOOP, 1,2, ain,aout };
37
+ return cumo_na_ndloop(&ndf, 1, a1);
38
38
  }
@@ -23,38 +23,191 @@
23
23
  end
24
24
  %>
25
25
 
26
- #define args_t <%=name%>_args_t
26
+ #define ROW_SIZE(na) ((na)->shape[(na)->ndim-2])
27
+ #define COL_SIZE(na) ((na)->shape[(na)->ndim-1])
28
+
29
+ #define CHECK_NARRAY_TYPE(x,t) \
30
+ if (rb_obj_class(x)!=(t)) { \
31
+ rb_raise(rb_eTypeError,"invalid NArray type (class)"); \
32
+ }
33
+
34
+ // Error Class ??
35
+ #define CHECK_DIM_GE(na,nd) \
36
+ if ((na)->ndim<(nd)) { \
37
+ rb_raise(cumo_na_eShapeError, \
38
+ "n-dimension=%d, but >=%d is expected", \
39
+ (na)->ndim, (nd)); \
40
+ }
41
+
42
+ #define CHECK_DIM_EQ(na1,nd) \
43
+ if ((na1)->ndim != (nd)) { \
44
+ rb_raise(cumo_na_eShapeError, \
45
+ "dimention mismatch: %d != %d", \
46
+ (na1)->ndim, (nd)); \
47
+ }
48
+
49
+ #define CHECK_SQUARE(name,na) \
50
+ if ((na)->shape[(na)->ndim-1] != (na)->shape[(na)->ndim-2]) { \
51
+ rb_raise(cumo_na_eShapeError,"%s is not square matrix",name); \
52
+ }
53
+
54
+ #define CHECK_SIZE_GE(na,sz) \
55
+ if ((na)->size < (size_t)(sz)) { \
56
+ rb_raise(cumo_na_eShapeError, \
57
+ "NArray size must be >= %"SZF"u",(size_t)(sz));\
58
+ }
59
+ #define CHECK_NON_EMPTY(na) \
60
+ if ((na)->size==0) { \
61
+ rb_raise(cumo_na_eShapeError,"empty NArray"); \
62
+ }
63
+
64
+ #define CHECK_SIZE_EQ(n,m) \
65
+ if ((n)!=(m)) { \
66
+ rb_raise(cumo_na_eShapeError, \
67
+ "size mismatch: %"SZF"d != %"SZF"d", \
68
+ (size_t)(n),(size_t)(m)); \
69
+ }
70
+
71
+ #define CHECK_SAME_SHAPE(na1,na2) \
72
+ { int i; \
73
+ CHECK_DIM_EQ(na1,na2->ndim); \
74
+ for (i=0; i<na1->ndim; i++) { \
75
+ CHECK_SIZE_EQ(na1->shape[i],na2->shape[i]); \
76
+ } \
77
+ }
78
+
79
+ #define CHECK_INT_EQ(sm,m,sn,n) \
80
+ if ((m) != (n)) { \
81
+ rb_raise(cumo_na_eShapeError, \
82
+ "%s must be == %s: %s=%d %s=%d", \
83
+ sm,sn,sm,m,sn,n); \
84
+ }
85
+
86
+ // Error Class ??
87
+ #define CHECK_LEADING_GE(sld,ld,sn,n) \
88
+ if ((ld) < (n)) { \
89
+ rb_raise(cumo_na_eShapeError, \
90
+ "%s must be >= max(%s,1): %s=%d %s=%d", \
91
+ sld,sn,sld,ld,sn,n); \
92
+ }
93
+
94
+ #define COPY_OR_CAST_TO(a,T) \
95
+ { \
96
+ if (rb_obj_class(a) == (T)) { \
97
+ if (!CUMO_TEST_INPLACE(a)) { \
98
+ a = cumo_na_copy(a); \
99
+ } \
100
+ } else { \
101
+ a = rb_funcall(T,rb_intern("cast"),1,a); \
102
+ } \
103
+ }
27
104
 
28
105
  typedef struct {
29
- // enum CBLAS_ORDER order; // cuBLAS does not have order (row-major or column-major) option
30
- cublasOperation_t transa, transb;
31
- cublasSideMode_t side;
32
- cublasFillMode_t uplo;
33
- cublasDiagType_t diag;
34
- dtype alpha, beta;
35
- int m, n, k;
36
- } args_t;
106
+ dtype alpha, beta;
107
+ int m, n, k;
108
+ } gemm_args_t;
37
109
 
38
- static void
39
- <%=c_iter%>(na_loop_t *const lp)
110
+ typedef struct {
111
+ int ld;
112
+ int stride; // in element count
113
+ cublasOperation_t trans;
114
+ VALUE a;
115
+ } gemm_layout_t;
116
+
117
+ static bool
118
+ is_f_contiguous(VALUE a)
119
+ {
120
+ int i;
121
+ ssize_t s0;
122
+ cumo_narray_t *na;
123
+
124
+ switch(CUMO_RNARRAY_TYPE(a)) {
125
+ case CUMO_NARRAY_DATA_T:
126
+ case CUMO_NARRAY_FILEMAP_T:
127
+ return CUMO_TEST_COLUMN_MAJOR(a);
128
+ case CUMO_NARRAY_VIEW_T:
129
+ CumoGetNArray(a, na);
130
+
131
+ // not contiguous if it has index
132
+ for (i = 0; i < CUMO_NA_NDIM(na); ++i) {
133
+ if (CUMO_NA_IS_INDEX_AT(na, i)) return false;
134
+ }
135
+
136
+ // check f-contiguous
137
+ s0 = cumo_na_element_stride(a);
138
+ for (i = 0; i < CUMO_NA_NDIM(na); ++i) {
139
+ if (CUMO_NA_SHAPE(na)[i] == 1) continue;
140
+ if (CUMO_NA_STRIDE_AT(na, i) != s0) return false;
141
+ s0 *= CUMO_NA_SHAPE(na)[i];
142
+ }
143
+ return true;
144
+ default:
145
+ rb_raise(rb_eArgError, "NArray type : %d is not supported", CUMO_RNARRAY_TYPE(a));
146
+ }
147
+ }
148
+
149
+ static bool
150
+ is_c_contiguous(VALUE a)
151
+ {
152
+ return cumo_na_check_contiguous(a) == Qtrue;
153
+ }
154
+
155
+ static gemm_layout_t
156
+ make_gemm_layout(VALUE a)
40
157
  {
41
- dtype *a, *b;
42
- int lda, ldb;
43
- dtype *c;
44
- int ldc;
45
- args_t *g;
46
- static cublasHandle_t handle = 0;
158
+ cumo_narray_t *na;
159
+ gemm_layout_t layout;
47
160
 
48
- a = (dtype*)NDL_PTR(lp,0);
49
- b = (dtype*)NDL_PTR(lp,1);
50
- c = (dtype*)NDL_PTR(lp,2);
51
- g = (args_t*)(lp->opt_ptr);
161
+ CumoGetNArray(a, na);
162
+
163
+ if (cumo_na_debug_flag) {
164
+ printf("ndim==2 && f_contiguous:%d, c_contiguous:%d\n",
165
+ CUMO_NA_NDIM(na) == 2 && is_f_contiguous(a), is_c_contiguous(a));
166
+ }
52
167
 
53
- lda = NDL_STEP(lp,0) / sizeof(dtype);
54
- ldb = NDL_STEP(lp,1) / sizeof(dtype);
55
- ldc = NDL_STEP(lp,2) / sizeof(dtype);
168
+ if (CUMO_NA_NDIM(na) == 2 && is_f_contiguous(a)) {
169
+ layout.ld = ROW_SIZE(na);
170
+ layout.trans = CUBLAS_OP_T;
171
+ layout.a = a;
172
+ } else {
173
+ layout.ld = COL_SIZE(na);
174
+ layout.trans = CUBLAS_OP_N; // transposed
175
+ // force c-contiguous
176
+ layout.a = is_c_contiguous(a) ? a : rb_funcall(a, rb_intern("dup"), 0);
177
+ }
178
+ layout.stride = ROW_SIZE(na) * COL_SIZE(na);
179
+ return layout;
180
+ }
181
+
182
+ extern int cumo_na_debug_flag; // narray.c
183
+
184
+ static void
185
+ print_gemm_args(gemm_args_t* g, gemm_layout_t* a_layout, gemm_layout_t* b_layout, int stridec, int batch_count)
186
+ {
187
+ printf("transb=%d transa=%d, n=%d, m=%d, k=%d, ldb=%d, lda=%d, ldc=n=%d, strideb=%d, stridea=%d stridec=%d batch_count=%d\n",
188
+ (int)b_layout->trans,
189
+ (int)a_layout->trans,
190
+ (int)g->n,
191
+ (int)g->m,
192
+ (int)g->k,
193
+ (int)b_layout->ld,
194
+ (int)a_layout->ld,
195
+ (int)g->n,
196
+ (int)b_layout->stride,
197
+ (int)a_layout->stride,
198
+ (int)stridec,
199
+ (int)batch_count);
200
+ }
56
201
 
57
- //printf("transa=%d transb=%d m=%d n=%d k=%d lda=%d ldb=%d ldc=%d\n",g->transa,g->transb,g->m,g->n,g->k,lda,ldb,ldc);
202
+ static void
203
+ <%=c_iter%>(VALUE a, VALUE b, VALUE c, gemm_args_t *g)
204
+ {
205
+ gemm_layout_t a_layout, b_layout;
206
+ cublasHandle_t handle = 0;
207
+ cublasStatus_t status = 0;
208
+ cumo_narray_t* nc;
209
+ int stridec = 0;
210
+ int batch_count = 0;
58
211
 
59
212
  // Note that cuBLAS uses the column major matrix representation.
60
213
  // We use technic which following site describes:
@@ -65,15 +218,37 @@ static void
65
218
  // c^T = nxm matrix
66
219
  // c^T = b^T * a^T
67
220
  //
68
- // cublasSgemm(handle,transb,transa,n,m,k,&alpha,b,n,a,k,&beta,c,n);
221
+ // cublasSgemm(handle,transb,transa,n,m,k,&alpha,b,ldb,a,lda,&beta,c,ldc=n);
69
222
 
70
- // TODO(sonots): Create another handle for another cuda device or cpu thread
71
- if (!handle) {
72
- cublasCreate(&handle);
73
- }
74
- cublas<%=func_prefix%>gemm(handle, g->transb, g->transa, g->n, g->m, g->k, (<%=cutype%>*)(&g->alpha), (<%=cutype%>*)b, ldb, (<%=cutype%>*)a, lda, (<%=cutype%>*)(&g->beta), (<%=cutype%>*)c, ldc);
75
- // TODO(sonots): Destroy correctly
76
- //cublasDestroy(handle);
223
+ a_layout = make_gemm_layout(a);
224
+ b_layout = make_gemm_layout(b);
225
+
226
+ CumoGetNArray(c, nc);
227
+ stridec = ROW_SIZE(nc) * COL_SIZE(nc);
228
+ batch_count = CUMO_NA_SIZE(nc) / stridec;
229
+
230
+ if (cumo_na_debug_flag) print_gemm_args(g, &a_layout, &b_layout, stridec, batch_count);
231
+ handle = cumo_cuda_cublas_handle();
232
+ status = cublas<%=func_prefix%>gemmStridedBatched(
233
+ handle,
234
+ b_layout.trans,
235
+ a_layout.trans,
236
+ g->n,
237
+ g->m,
238
+ g->k,
239
+ (<%=cutype%>*)(&g->alpha),
240
+ (<%=cutype%>*)(cumo_na_get_pointer_for_read(b_layout.a) + cumo_na_get_offset(b_layout.a)),
241
+ b_layout.ld,
242
+ b_layout.stride,
243
+ (<%=cutype%>*)(cumo_na_get_pointer_for_read(a_layout.a) + cumo_na_get_offset(a_layout.a)),
244
+ a_layout.ld,
245
+ a_layout.stride,
246
+ (<%=cutype%>*)(&g->beta),
247
+ (<%=cutype%>*)(cumo_na_get_pointer_for_write(c) + cumo_na_get_offset(c)),
248
+ g->n,
249
+ stridec,
250
+ batch_count);
251
+ cumo_cuda_cublas_check_status(status);
77
252
  }
78
253
 
79
254
  /*
@@ -90,17 +265,6 @@ static void
90
265
  def opt(v,tp=nil,*a)
91
266
  tp ||= "String or Symbol"
92
267
  case v
93
- when /^order$/
94
- "@param #{v} [#{tp}] if 'R': Row-major, if 'C': Column-major. (default='R')"
95
- when /^uplo$/
96
- "@param #{v} [#{tp}] if 'U': Upper triangle, if 'L': Lower triangle. (default='U')"
97
- when /^side$/
98
- "@param #{v} [#{tp}] if 'L': op(A)\\*B (left-side op), if 'R': B\\*op(A) (right-side op). (default='L')"
99
- when /^diag$/
100
- "@param #{v} [#{tp}] if 'U': assumed to be unit triangular, if 'N': not assumed to be unit triangular. (default='U')"
101
- when /^trans(\w+)?$/
102
- b = a[0] || $1
103
- "@param #{v} [#{tp}] if 'N': Not transpose #{b}, if 'T': Transpose #{b}. (default='N')"
104
268
  when "alpha"
105
269
  "@param #{v} [Float] (default=1.0)"
106
270
  when "beta"
@@ -111,15 +275,13 @@ static void
111
275
  end
112
276
  %>
113
277
  <%
114
- args_v = "a, b, [c, alpha:1, beta:0, transa:'N', transb:'N']"
278
+ args_v = "a, b, [c, alpha:1, beta:0]"
115
279
  params = [
116
280
  mat("a"),
117
281
  mat("b"),
118
282
  mat("c","optional",:inplace),
119
283
  opt("alpha"),
120
284
  opt("beta"),
121
- opt("transa"),
122
- opt("transb"),
123
285
  ].select{|x| x}.join("\n ")
124
286
  %>
125
287
  @overload <%=name%>(<%=args_v%>)
@@ -130,74 +292,56 @@ static void
130
292
  static VALUE
131
293
  <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
132
294
  {
133
- VALUE a=self, b, c=Qnil, alpha, beta;
134
- narray_t *na1, *na2;
135
- int ma, ka, kb, nb, tmp;
136
- size_t shape[2];
137
- ndfunc_arg_in_t ain[3] = {{cT,2},{cT,2},{OVERWRITE,2}};
138
- ndfunc_arg_out_t aout[1] = {{cT,2,shape}};
139
- ndfunc_t ndf = {<%=c_iter%>, NO_LOOP, 3, 0, ain, aout};
140
-
141
- args_t g;
295
+ VALUE a=self, b, c=Qnil, alpha, beta;
296
+ cumo_narray_t *na, *nb;
297
+
298
+ gemm_args_t g;
142
299
  VALUE kw_hash = Qnil;
143
- ID kw_table[4] = {rb_intern("alpha"),rb_intern("beta"),rb_intern("transa"),rb_intern("transb")};
144
- VALUE opts[4] = {Qundef,Qundef,Qundef,Qundef};
300
+ ID kw_table[2] = {rb_intern("alpha"), rb_intern("beta")};
301
+ VALUE opts[2] = {Qundef, Qundef};
145
302
 
146
303
  rb_scan_args(argc, argv, "11:", &b, &c, &kw_hash);
147
- rb_get_kwargs(kw_hash, kw_table, 0, 4, opts);
148
- alpha = option_value(opts[0],Qnil);
149
- g.alpha = RTEST(alpha) ? m_num_to_data(alpha) : m_one;
150
- beta = option_value(opts[1],Qnil);
151
- g.beta = RTEST(beta) ? m_num_to_data(beta) : m_zero;
152
- g.transa = option_trans(opts[2]);
153
- g.transb = option_trans(opts[3]);
154
-
155
- GetNArray(a,na1);
156
- GetNArray(b,na2);
157
- CHECK_DIM_GE(na1,2);
158
- CHECK_DIM_GE(na2,2);
159
- ma = ROW_SIZE(na1); // m
160
- ka = COL_SIZE(na1); // k
161
- kb = ROW_SIZE(na2); // k
162
- nb = COL_SIZE(na2); // n
163
-
164
- SWAP_IFTR(g.transa, ma, ka, tmp);
165
- SWAP_IFTR(g.transb, kb, nb, tmp);
166
- CHECK_INT_EQ("ka",ka,"kb",kb);
167
- g.m = ma;
168
- g.n = nb;
169
- g.k = ka;
170
-
171
- SWAP_IFROW(ma, nb, tmp);
304
+ rb_get_kwargs(kw_hash, kw_table, 0, 2, opts);
305
+ alpha = cumo_cuda_cublas_option_value(opts[0],Qnil);
306
+ g.alpha = RTEST(alpha) ? m_num_to_data(alpha) : m_one;
307
+ beta = cumo_cuda_cublas_option_value(opts[1],Qnil);
308
+ g.beta = RTEST(beta) ? m_num_to_data(beta) : m_zero;
309
+
310
+ CumoGetNArray(a, na);
311
+ CumoGetNArray(b, nb);
312
+ CHECK_DIM_GE(na, 2);
313
+ CHECK_DIM_GE(nb, 2);
314
+
315
+ if (ROW_SIZE(nb) != COL_SIZE(na)) {
316
+ rb_raise(cumo_na_eShapeError,"ROW_SIZE(b)=%d must equal to COL_SIZE(a)=%d",
317
+ (int)ROW_SIZE(nb), (int)COL_SIZE(na));
318
+ }
319
+
320
+ g.m = ROW_SIZE(na);
321
+ g.k = COL_SIZE(na);
322
+ g.n = COL_SIZE(nb);
172
323
 
173
324
  if (c == Qnil) { // c is not given.
174
- ndfunc_arg_in_t ain_init = {sym_init,0};
175
- ain[2] = ain_init;
176
- ndf.nout = 1;
177
- c = INT2FIX(0);
178
- shape[0] = nb;
179
- shape[1] = ma;
325
+ int ndim = CUMO_NA_NDIM(na);
326
+ size_t *shape = ALLOCA_N(size_t, ndim);
327
+ memcpy(shape, CUMO_NA_SHAPE(na), sizeof(size_t) * (ndim - 1)); // ... x m x k
328
+ shape[ndim - 1] = g.n; // ... x m x n
329
+ c = cumo_na_new(cT, ndim, shape);
180
330
  } else {
181
- narray_t *na3;
182
- int nc;
183
- COPY_OR_CAST_TO(c,cT);
184
- GetNArray(c,na3);
185
- CHECK_DIM_GE(na3,2);
186
- nc = ROW_SIZE(na3);
187
- if (nc < nb) {
188
- rb_raise(nary_eShapeError,"nc=%d must be >= nb=%d",nc,nb);
331
+ cumo_narray_t *nc;
332
+ COPY_OR_CAST_TO(c, cT);
333
+ CumoGetNArray(c, nc);
334
+ CHECK_DIM_GE(nc, 2);
335
+ if (ROW_SIZE(nc) != ROW_SIZE(na)) {
336
+ rb_raise(cumo_na_eShapeError,"ROW_SIZE(c)=%d must equal to ROW_SIZE(a)=%d",
337
+ (int)ROW_SIZE(nc), (int)ROW_SIZE(na));
189
338
  }
190
- //CHECK_LEADING_GE("ldc",g.ldc,"m",ma);
191
- }
192
- {
193
- VALUE ans = na_ndloop3(&ndf, &g, 3, a, b, c);
194
-
195
- if (ndf.nout == 1) { // c is not given.
196
- return ans;
197
- } else {
198
- return c;
339
+ if (COL_SIZE(nc) != COL_SIZE(nb)) {
340
+ rb_raise(cumo_na_eShapeError,"COL_SIZE(c)=%d must equal to COL_SIZE(b)=%d",
341
+ (int)COL_SIZE(nc), (int)COL_SIZE(nc));
199
342
  }
200
343
  }
201
- }
202
344
 
203
- #undef args_t
345
+ <%=c_iter%>(a, b, c, &g);
346
+ return c;
347
+ }