cumo 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/3rd_party/LICENSE.txt +60 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +13 -1
- data/LICENSE.txt +1 -62
- data/README.md +33 -29
- data/bench/cumo_bench.rb +47 -25
- data/bench/numo_bench.rb +27 -25
- data/docs/src-tree.md +16 -0
- data/ext/cumo/cuda/cublas.c +69 -219
- data/ext/cumo/cuda/memory_pool_impl.hpp +1 -0
- data/ext/cumo/cuda/runtime.c +2 -14
- data/ext/cumo/cumo.c +16 -16
- data/ext/cumo/include/cumo.h +2 -2
- data/ext/cumo/include/cumo/cuda/cublas.h +6 -129
- data/ext/cumo/include/cumo/cuda/runtime.h +16 -0
- data/ext/cumo/include/cumo/indexer.h +46 -63
- data/ext/cumo/include/cumo/intern.h +58 -112
- data/ext/cumo/include/cumo/narray.h +214 -185
- data/ext/cumo/include/cumo/narray_kernel.h +66 -37
- data/ext/cumo/include/cumo/ndloop.h +42 -42
- data/ext/cumo/include/cumo/reduce_kernel.h +55 -71
- data/ext/cumo/include/cumo/template.h +56 -51
- data/ext/cumo/include/cumo/template_kernel.h +31 -31
- data/ext/cumo/include/cumo/types/bit.h +3 -3
- data/ext/cumo/include/cumo/types/bit_kernel.h +2 -2
- data/ext/cumo/include/cumo/types/complex.h +126 -126
- data/ext/cumo/include/cumo/types/complex_kernel.h +126 -126
- data/ext/cumo/include/cumo/types/complex_macro.h +28 -28
- data/ext/cumo/include/cumo/types/complex_macro_kernel.h +20 -20
- data/ext/cumo/include/cumo/types/dcomplex.h +5 -5
- data/ext/cumo/include/cumo/types/dcomplex_kernel.h +1 -1
- data/ext/cumo/include/cumo/types/int_macro.h +1 -1
- data/ext/cumo/include/cumo/types/int_macro_kernel.h +1 -1
- data/ext/cumo/include/cumo/types/robj_macro.h +30 -30
- data/ext/cumo/include/cumo/types/scomplex.h +5 -5
- data/ext/cumo/include/cumo/types/scomplex_kernel.h +1 -1
- data/ext/cumo/narray/array.c +143 -143
- data/ext/cumo/narray/data.c +184 -184
- data/ext/cumo/narray/gen/cogen.rb +5 -2
- data/ext/cumo/narray/gen/cogen_kernel.rb +5 -2
- data/ext/cumo/narray/gen/def/dcomplex.rb +1 -1
- data/ext/cumo/narray/gen/def/scomplex.rb +1 -1
- data/ext/cumo/narray/gen/erbln.rb +132 -0
- data/ext/cumo/narray/gen/erbpp2.rb +18 -13
- data/ext/cumo/narray/gen/narray_def.rb +3 -3
- data/ext/cumo/narray/gen/spec.rb +2 -2
- data/ext/cumo/narray/gen/tmpl/accum.c +15 -15
- data/ext/cumo/narray/gen/tmpl/accum_binary.c +22 -22
- data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/accum_index.c +30 -30
- data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +2 -2
- data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/alloc_func.c +14 -14
- data/ext/cumo/narray/gen/tmpl/allocate.c +11 -11
- data/ext/cumo/narray/gen/tmpl/aref.c +2 -2
- data/ext/cumo/narray/gen/tmpl/aref_cpu.c +4 -4
- data/ext/cumo/narray/gen/tmpl/aset.c +2 -2
- data/ext/cumo/narray/gen/tmpl/binary.c +28 -28
- data/ext/cumo/narray/gen/tmpl/binary2.c +18 -18
- data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/binary_s.c +13 -13
- data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/bincount.c +23 -23
- data/ext/cumo/narray/gen/tmpl/cast.c +7 -7
- data/ext/cumo/narray/gen/tmpl/cast_array.c +3 -3
- data/ext/cumo/narray/gen/tmpl/clip.c +38 -38
- data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +2 -2
- data/ext/cumo/narray/gen/tmpl/cond_binary.c +19 -19
- data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +7 -7
- data/ext/cumo/narray/gen/tmpl/cond_unary.c +15 -15
- data/ext/cumo/narray/gen/tmpl/cum.c +15 -15
- data/ext/cumo/narray/gen/tmpl/each.c +9 -9
- data/ext/cumo/narray/gen/tmpl/each_with_index.c +9 -9
- data/ext/cumo/narray/gen/tmpl/ewcomp.c +15 -15
- data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/extract_cpu.c +5 -5
- data/ext/cumo/narray/gen/tmpl/extract_data.c +12 -12
- data/ext/cumo/narray/gen/tmpl/eye.c +9 -9
- data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/fill.c +9 -9
- data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +1 -1
- data/ext/cumo/narray/gen/tmpl/format.c +11 -11
- data/ext/cumo/narray/gen/tmpl/format_to_a.c +8 -8
- data/ext/cumo/narray/gen/tmpl/frexp.c +13 -13
- data/ext/cumo/narray/gen/tmpl/gemm.c +252 -108
- data/ext/cumo/narray/gen/tmpl/inspect.c +1 -1
- data/ext/cumo/narray/gen/tmpl/lib.c +2 -2
- data/ext/cumo/narray/gen/tmpl/logseq.c +7 -7
- data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/map_with_index.c +17 -17
- data/ext/cumo/narray/gen/tmpl/median.c +10 -10
- data/ext/cumo/narray/gen/tmpl/minmax.c +10 -10
- data/ext/cumo/narray/gen/tmpl/new_dim0.c +3 -3
- data/ext/cumo/narray/gen/tmpl/poly.c +6 -6
- data/ext/cumo/narray/gen/tmpl/pow.c +28 -28
- data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/rand.c +10 -10
- data/ext/cumo/narray/gen/tmpl/rand_norm.c +7 -7
- data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/seq.c +7 -7
- data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/set2.c +20 -20
- data/ext/cumo/narray/gen/tmpl/sort.c +11 -11
- data/ext/cumo/narray/gen/tmpl/sort_index.c +18 -18
- data/ext/cumo/narray/gen/tmpl/store.c +6 -6
- data/ext/cumo/narray/gen/tmpl/store_array.c +19 -19
- data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +12 -12
- data/ext/cumo/narray/gen/tmpl/store_bit.c +23 -23
- data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +28 -28
- data/ext/cumo/narray/gen/tmpl/store_from.c +16 -16
- data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +12 -12
- data/ext/cumo/narray/gen/tmpl/to_a.c +10 -10
- data/ext/cumo/narray/gen/tmpl/unary.c +25 -25
- data/ext/cumo/narray/gen/tmpl/unary2.c +17 -17
- data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +15 -15
- data/ext/cumo/narray/gen/tmpl/unary_ret2.c +13 -13
- data/ext/cumo/narray/gen/tmpl/unary_s.c +17 -17
- data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +12 -12
- data/ext/cumo/narray/gen/tmpl_bit/allocate.c +9 -9
- data/ext/cumo/narray/gen/tmpl_bit/aref.c +2 -2
- data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +5 -5
- data/ext/cumo/narray/gen/tmpl_bit/aset.c +2 -2
- data/ext/cumo/narray/gen/tmpl_bit/binary.c +29 -29
- data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +14 -14
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +21 -21
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +28 -28
- data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +29 -29
- data/ext/cumo/narray/gen/tmpl_bit/each.c +10 -10
- data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +10 -10
- data/ext/cumo/narray/gen/tmpl_bit/extract.c +8 -8
- data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +8 -8
- data/ext/cumo/narray/gen/tmpl_bit/fill.c +17 -17
- data/ext/cumo/narray/gen/tmpl_bit/format.c +14 -14
- data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +11 -11
- data/ext/cumo/narray/gen/tmpl_bit/inspect.c +3 -3
- data/ext/cumo/narray/gen/tmpl_bit/mask.c +33 -33
- data/ext/cumo/narray/gen/tmpl_bit/store_array.c +19 -19
- data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +22 -22
- data/ext/cumo/narray/gen/tmpl_bit/store_from.c +18 -18
- data/ext/cumo/narray/gen/tmpl_bit/to_a.c +12 -12
- data/ext/cumo/narray/gen/tmpl_bit/unary.c +24 -24
- data/ext/cumo/narray/gen/tmpl_bit/where.c +16 -16
- data/ext/cumo/narray/gen/tmpl_bit/where2.c +20 -20
- data/ext/cumo/narray/index.c +213 -213
- data/ext/cumo/narray/math.c +27 -27
- data/ext/cumo/narray/narray.c +484 -484
- data/ext/cumo/narray/ndloop.c +259 -258
- data/ext/cumo/narray/rand.c +3 -3
- data/ext/cumo/narray/step.c +70 -70
- data/ext/cumo/narray/struct.c +139 -139
- metadata +6 -7
- data/ext/cumo/include/cumo/intern_fwd.h +0 -38
- data/lib/erbpp.rb +0 -294
- data/lib/erbpp/line_number.rb +0 -137
- data/lib/erbpp/narray_def.rb +0 -381
@@ -5,7 +5,7 @@ void <%="cumo_#{c_iter}_stride_kernel_launch"%>(char *ptr, ssize_t s0, ssize_t s
|
|
5
5
|
#include <cuda_runtime.h>
|
6
6
|
|
7
7
|
static void
|
8
|
-
<%=c_iter%>(
|
8
|
+
<%=c_iter%>(cumo_na_loop_t *const lp)
|
9
9
|
{
|
10
10
|
size_t n0, n1;
|
11
11
|
ssize_t s0, s1;
|
@@ -22,13 +22,13 @@ static void
|
|
22
22
|
n1 = lp->args[0].shape[1];
|
23
23
|
s0 = lp->args[0].iter[0].step;
|
24
24
|
s1 = lp->args[0].iter[1].step;
|
25
|
-
p0 =
|
25
|
+
p0 = CUMO_NDL_PTR(lp,0);
|
26
26
|
|
27
27
|
<% if type_name == 'robject' %>
|
28
28
|
{
|
29
29
|
size_t i0, i1;
|
30
30
|
char *p1;
|
31
|
-
|
31
|
+
CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
|
32
32
|
for (i0=0; i0 < n0; i0++) {
|
33
33
|
p1 = p0;
|
34
34
|
for (i1=0; i1 < n1; i1++) {
|
@@ -55,13 +55,13 @@ static void
|
|
55
55
|
static VALUE
|
56
56
|
<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
|
57
57
|
{
|
58
|
-
|
59
|
-
|
58
|
+
cumo_ndfunc_arg_in_t ain[1] = {{CUMO_OVERWRITE,2}};
|
59
|
+
cumo_ndfunc_t ndf = {<%=c_iter%>, CUMO_NO_LOOP, 1,0, ain,0};
|
60
60
|
ssize_t kofs;
|
61
61
|
dtype data;
|
62
62
|
char *g;
|
63
63
|
int nd;
|
64
|
-
|
64
|
+
cumo_narray_t *na;
|
65
65
|
|
66
66
|
// check arguments
|
67
67
|
if (argc > 2) {
|
@@ -77,10 +77,10 @@ static VALUE
|
|
77
77
|
kofs = 0;
|
78
78
|
}
|
79
79
|
|
80
|
-
|
80
|
+
CumoGetNArray(self,na);
|
81
81
|
nd = na->ndim;
|
82
82
|
if (nd < 2) {
|
83
|
-
rb_raise(
|
83
|
+
rb_raise(cumo_na_eDimensionError,"less than 2-d array");
|
84
84
|
}
|
85
85
|
|
86
86
|
// Diagonal offset from the main diagonal.
|
@@ -100,6 +100,6 @@ static VALUE
|
|
100
100
|
*(ssize_t*)g = kofs;
|
101
101
|
*(dtype*)(g+sizeof(ssize_t)) = data;
|
102
102
|
|
103
|
-
|
103
|
+
cumo_na_ndloop3(&ndf, g, 1, self);
|
104
104
|
return self;
|
105
105
|
}
|
@@ -11,9 +11,9 @@ __global__ void <%="cumo_#{c_iter}_stride_kernel"%>(char*ptr, ssize_t s0, ssize_
|
|
11
11
|
// Host-side launcher for the 2-D stride kernel.
// The kernel is launched over n0 * n1 elements; grid and block sizes are
// obtained from cumo_get_grid_dim() / cumo_get_block_dim() for that count.
void <%="cumo_#{c_iter}_stride_kernel_launch"%>(char *ptr, ssize_t s0, ssize_t s1, ssize_t kofs, dtype data, uint64_t n0, uint64_t n1)
{
    const uint64_t total = n0 * n1;
    const size_t blocks = cumo_get_grid_dim(total);
    const size_t threads = cumo_get_block_dim(total);

    <%="cumo_#{c_iter}_stride_kernel"%><<<blocks, threads>>>(ptr, s0, s1, kofs, data, n0, n1, total);
}
|
18
18
|
<% end %>
|
19
19
|
|
@@ -4,7 +4,7 @@ void <%="cumo_#{c_iter}_stride_kernel_launch"%>(char *ptr, ssize_t step, dtype v
|
|
4
4
|
<% end %>
|
5
5
|
|
6
6
|
static void
|
7
|
-
<%=c_iter%>(
|
7
|
+
<%=c_iter%>(cumo_na_loop_t *const lp)
|
8
8
|
{
|
9
9
|
size_t i;
|
10
10
|
char *p1;
|
@@ -12,18 +12,18 @@ static void
|
|
12
12
|
size_t *idx1;
|
13
13
|
VALUE x = lp->option;
|
14
14
|
dtype y;
|
15
|
-
|
16
|
-
|
15
|
+
CUMO_INIT_COUNTER(lp, i);
|
16
|
+
CUMO_INIT_PTR_IDX(lp, 0, p1, s1, idx1);
|
17
17
|
y = m_num_to_data(x);
|
18
18
|
<% if type_name == 'robject' %>
|
19
|
-
|
19
|
+
CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
|
20
20
|
if (idx1) {
|
21
21
|
for (; i--;) {
|
22
|
-
|
22
|
+
CUMO_SET_DATA_INDEX(p1,idx1,dtype,y);
|
23
23
|
}
|
24
24
|
} else {
|
25
25
|
for (; i--;) {
|
26
|
-
|
26
|
+
CUMO_SET_DATA_STRIDE(p1,s1,dtype,y);
|
27
27
|
}
|
28
28
|
}
|
29
29
|
<% else %>
|
@@ -44,9 +44,9 @@ static void
|
|
44
44
|
static VALUE
|
45
45
|
<%=c_func(1)%>(VALUE self, VALUE val)
|
46
46
|
{
|
47
|
-
|
48
|
-
|
47
|
+
cumo_ndfunc_arg_in_t ain[2] = {{CUMO_OVERWRITE,0},{cumo_sym_option}};
|
48
|
+
cumo_ndfunc_t ndf = { <%=c_iter%>, CUMO_FULL_LOOP, 2, 0, ain, 0 };
|
49
49
|
|
50
|
-
|
50
|
+
cumo_na_ndloop(&ndf, 2, self, val);
|
51
51
|
return self;
|
52
52
|
}
|
@@ -15,15 +15,15 @@ __global__ void <%="cumo_#{c_iter}_stride_kernel"%>(char*ptr, ssize_t step, dtyp
|
|
15
15
|
|
16
16
|
// Host-side launcher for the index kernel.
// Launches over n elements with grid/block sizes taken from
// cumo_get_grid_dim() / cumo_get_block_dim().
void <%="cumo_#{c_iter}_index_kernel_launch"%>(char *ptr, size_t *idx, dtype val, uint64_t n)
{
    const size_t blocks = cumo_get_grid_dim(n);
    const size_t threads = cumo_get_block_dim(n);

    <%="cumo_#{c_iter}_index_kernel"%><<<blocks, threads>>>(ptr, idx, val, n);
}
|
22
22
|
|
23
23
|
// Host-side launcher for the 1-D stride kernel.
// Launches over n elements with grid/block sizes taken from
// cumo_get_grid_dim() / cumo_get_block_dim().
void <%="cumo_#{c_iter}_stride_kernel_launch"%>(char *ptr, ssize_t step, dtype val, uint64_t n)
{
    const size_t blocks = cumo_get_grid_dim(n);
    const size_t threads = cumo_get_block_dim(n);

    <%="cumo_#{c_iter}_stride_kernel"%><<<blocks, threads>>>(ptr, step, val, n);
}
|
29
29
|
<% end %>
|
@@ -13,7 +13,7 @@ format_<%=type_name%>(VALUE fmt, dtype* x)
|
|
13
13
|
}
|
14
14
|
|
15
15
|
static void
|
16
|
-
<%=c_iter%>(
|
16
|
+
<%=c_iter%>(cumo_na_loop_t *const lp)
|
17
17
|
{
|
18
18
|
size_t i;
|
19
19
|
char *p1, *p2;
|
@@ -22,21 +22,21 @@ static void
|
|
22
22
|
dtype *x;
|
23
23
|
VALUE y;
|
24
24
|
VALUE fmt = lp->option;
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
//
|
25
|
+
CUMO_INIT_COUNTER(lp, i);
|
26
|
+
CUMO_INIT_PTR_IDX(lp, 0, p1, s1, idx1);
|
27
|
+
CUMO_INIT_PTR(lp, 1, p2, s2);
|
28
|
+
//CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
|
29
29
|
if (idx1) {
|
30
30
|
for (; i--;) {
|
31
31
|
x = (dtype*)(p1+*idx1); idx1++;
|
32
32
|
y = format_<%=type_name%>(fmt, x);
|
33
|
-
|
33
|
+
CUMO_SET_DATA_STRIDE(p2, s2, VALUE, y);
|
34
34
|
}
|
35
35
|
} else {
|
36
36
|
for (; i--;) {
|
37
37
|
x = (dtype*)p1; p1+=s1;
|
38
38
|
y = format_<%=type_name%>(fmt, x);
|
39
|
-
|
39
|
+
CUMO_SET_DATA_STRIDE(p2, s2, VALUE, y);
|
40
40
|
}
|
41
41
|
}
|
42
42
|
}
|
@@ -52,11 +52,11 @@ static VALUE
|
|
52
52
|
{
|
53
53
|
VALUE fmt=Qnil;
|
54
54
|
|
55
|
-
|
56
|
-
|
57
|
-
|
55
|
+
cumo_ndfunc_arg_in_t ain[2] = {{Qnil,0},{cumo_sym_option}};
|
56
|
+
cumo_ndfunc_arg_out_t aout[1] = {{cumo_cRObject,0}};
|
57
|
+
cumo_ndfunc_t ndf = { <%=c_iter%>, CUMO_FULL_LOOP_NIP, 2, 1, ain, aout };
|
58
58
|
|
59
59
|
rb_scan_args(argc, argv, "01", &fmt);
|
60
60
|
cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
|
61
|
-
return
|
61
|
+
return cumo_na_ndloop(&ndf, 2, self, fmt);
|
62
62
|
}
|
@@ -1,5 +1,5 @@
|
|
1
1
|
static void
|
2
|
-
<%=c_iter%>(
|
2
|
+
<%=c_iter%>(cumo_na_loop_t *const lp)
|
3
3
|
{
|
4
4
|
size_t i;
|
5
5
|
char *p1;
|
@@ -9,11 +9,11 @@ static void
|
|
9
9
|
VALUE y;
|
10
10
|
volatile VALUE a;
|
11
11
|
VALUE fmt = lp->option;
|
12
|
-
|
13
|
-
|
12
|
+
CUMO_INIT_COUNTER(lp, i);
|
13
|
+
CUMO_INIT_PTR_IDX(lp, 0, p1, s1, idx1);
|
14
14
|
a = rb_ary_new2(i);
|
15
15
|
rb_ary_push(lp->args[1].value, a);
|
16
|
-
//
|
16
|
+
//CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
|
17
17
|
if (idx1) {
|
18
18
|
for (; i--;) {
|
19
19
|
x = (dtype*)(p1 + *idx1); idx1++;
|
@@ -39,11 +39,11 @@ static VALUE
|
|
39
39
|
<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
|
40
40
|
{
|
41
41
|
VALUE fmt=Qnil;
|
42
|
-
|
43
|
-
|
44
|
-
|
42
|
+
cumo_ndfunc_arg_in_t ain[3] = {{Qnil,0},{cumo_sym_loop_opt},{cumo_sym_option}};
|
43
|
+
cumo_ndfunc_arg_out_t aout[1] = {{rb_cArray,0}}; // dummy?
|
44
|
+
cumo_ndfunc_t ndf = { <%=c_iter%>, CUMO_FULL_LOOP_NIP, 3, 1, ain, aout };
|
45
45
|
|
46
46
|
rb_scan_args(argc, argv, "01", &fmt);
|
47
47
|
cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
|
48
|
-
return
|
48
|
+
return cumo_na_ndloop_cast_narray_to_rarray(&ndf, self, fmt);
|
49
49
|
}
|
@@ -1,21 +1,21 @@
|
|
1
1
|
static void
|
2
|
-
<%=c_iter%>(
|
2
|
+
<%=c_iter%>(cumo_na_loop_t *const lp)
|
3
3
|
{
|
4
4
|
size_t i;
|
5
5
|
char *p1, *p2, *p3;
|
6
6
|
ssize_t s1, s2, s3;
|
7
7
|
dtype x;
|
8
8
|
int y;
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
9
|
+
CUMO_INIT_COUNTER(lp, i);
|
10
|
+
CUMO_INIT_PTR(lp, 0, p1, s1);
|
11
|
+
CUMO_INIT_PTR(lp, 1, p2, s2);
|
12
|
+
CUMO_INIT_PTR(lp, 2, p3, s3);
|
13
|
+
CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
|
14
14
|
for (; i--;) {
|
15
|
-
|
15
|
+
CUMO_GET_DATA_STRIDE(p1,s1,dtype,x);
|
16
16
|
x = m_<%=name%>(x,&y);
|
17
|
-
|
18
|
-
|
17
|
+
CUMO_SET_DATA_STRIDE(p2,s2,dtype,x);
|
18
|
+
CUMO_SET_DATA_STRIDE(p3,s3,int32_t,y);
|
19
19
|
}
|
20
20
|
}
|
21
21
|
|
@@ -31,8 +31,8 @@ static void
|
|
31
31
|
static VALUE
|
32
32
|
<%=c_func(1)%>(VALUE mod, VALUE a1)
|
33
33
|
{
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
return
|
34
|
+
cumo_ndfunc_arg_in_t ain[1] = {{cT,0}};
|
35
|
+
cumo_ndfunc_arg_out_t aout[2] = {{cT,0},{cumo_cInt32,0}};
|
36
|
+
cumo_ndfunc_t ndf = { <%=c_iter%>, CUMO_STRIDE_LOOP, 1,2, ain,aout };
|
37
|
+
return cumo_na_ndloop(&ndf, 1, a1);
|
38
38
|
}
|
@@ -23,38 +23,191 @@
|
|
23
23
|
end
|
24
24
|
%>
|
25
25
|
|
26
|
-
#define
|
26
|
+
// Size of the second-to-last axis of an narray struct pointer.
#define ROW_SIZE(na) ((na)->shape[(na)->ndim-2])
// Size of the last axis of an narray struct pointer.
#define COL_SIZE(na) ((na)->shape[(na)->ndim-1])

// All statement-like macros below are wrapped in do { ... } while (0) so
// that they behave as a single statement (safe inside if/else without
// braces). Call sites must end with a semicolon.

// Raise TypeError unless the class of x is exactly t.
#define CHECK_NARRAY_TYPE(x,t)                                     \
    do {                                                           \
        if (rb_obj_class(x)!=(t)) {                                \
            rb_raise(rb_eTypeError,"invalid NArray type (class)"); \
        }                                                          \
    } while (0)

// Raise a shape error unless na has at least nd dimensions.
// TODO: confirm that the shape error is the right error class here.
#define CHECK_DIM_GE(na,nd)                                         \
    do {                                                            \
        if ((na)->ndim<(nd)) {                                      \
            rb_raise(cumo_na_eShapeError,                           \
                     "n-dimension=%d, but >=%d is expected",        \
                     (na)->ndim, (nd));                             \
        }                                                           \
    } while (0)

// Raise a shape error unless na1 has exactly nd dimensions.
#define CHECK_DIM_EQ(na1,nd)                                        \
    do {                                                            \
        if ((na1)->ndim != (nd)) {                                  \
            rb_raise(cumo_na_eShapeError,                           \
                     "dimension mismatch: %d != %d",                \
                     (na1)->ndim, (nd));                            \
        }                                                           \
    } while (0)

// Raise a shape error unless the last two axes of na have equal size.
// name is the label used in the error message.
#define CHECK_SQUARE(name,na)                                           \
    do {                                                                \
        if ((na)->shape[(na)->ndim-1] != (na)->shape[(na)->ndim-2]) {   \
            rb_raise(cumo_na_eShapeError,"%s is not square matrix",(name)); \
        }                                                               \
    } while (0)

// Raise a shape error unless na holds at least sz elements.
#define CHECK_SIZE_GE(na,sz)                                        \
    do {                                                            \
        if ((na)->size < (size_t)(sz)) {                            \
            rb_raise(cumo_na_eShapeError,                           \
                     "NArray size must be >= %"SZF"u",(size_t)(sz));\
        }                                                           \
    } while (0)

// Raise a shape error if na has no elements.
#define CHECK_NON_EMPTY(na)                                         \
    do {                                                            \
        if ((na)->size==0) {                                        \
            rb_raise(cumo_na_eShapeError,"empty NArray");           \
        }                                                           \
    } while (0)

// Raise a shape error unless n == m.
// Both values are printed as size_t, so the unsigned conversion is used
// (matching CHECK_SIZE_GE).
#define CHECK_SIZE_EQ(n,m)                                          \
    do {                                                            \
        if ((n)!=(m)) {                                             \
            rb_raise(cumo_na_eShapeError,                           \
                     "size mismatch: %"SZF"u != %"SZF"u",           \
                     (size_t)(n),(size_t)(m));                      \
        }                                                           \
    } while (0)

// Raise a shape error unless na1 and na2 have the same number of
// dimensions and the same size along every axis.
// The loop index has a macro-private name so it cannot capture an `i`
// used in the caller's argument expressions.
#define CHECK_SAME_SHAPE(na1,na2)                                   \
    do {                                                            \
        int css_i_;                                                 \
        CHECK_DIM_EQ(na1,(na2)->ndim);                              \
        for (css_i_=0; css_i_<(na1)->ndim; css_i_++) {              \
            CHECK_SIZE_EQ((na1)->shape[css_i_],(na2)->shape[css_i_]); \
        }                                                           \
    } while (0)

// Raise a shape error unless the int values m and n are equal.
// sm and sn are the labels printed for m and n.
#define CHECK_INT_EQ(sm,m,sn,n)                                     \
    do {                                                            \
        if ((m) != (n)) {                                           \
            rb_raise(cumo_na_eShapeError,                           \
                     "%s must be == %s: %s=%d %s=%d",               \
                     (sm),(sn),(sm),(m),(sn),(n));                  \
        }                                                           \
    } while (0)

// Raise a shape error if ld < n.
// sld and sn are the labels printed for ld and n.
// TODO: confirm that the shape error is the right error class here.
#define CHECK_LEADING_GE(sld,ld,sn,n)                               \
    do {                                                            \
        if ((ld) < (n)) {                                           \
            rb_raise(cumo_na_eShapeError,                           \
                     "%s must be >= max(%s,1): %s=%d %s=%d",        \
                     (sld),(sn),(sld),(ld),(sn),(n));               \
        }                                                           \
    } while (0)

// Make `a` an instance of class T that is safe to overwrite:
//  - already of class T and flagged inplace: left untouched;
//  - already of class T, not inplace: replaced by a copy;
//  - any other class: replaced by T.cast(a).
// `a` must be an assignable lvalue.
#define COPY_OR_CAST_TO(a,T)                                        \
    do {                                                            \
        if (rb_obj_class(a) == (T)) {                               \
            if (!CUMO_TEST_INPLACE(a)) {                            \
                (a) = cumo_na_copy(a);                              \
            }                                                       \
        } else {                                                    \
            (a) = rb_funcall((T),rb_intern("cast"),1,(a));          \
        }                                                           \
    } while (0)
|
27
104
|
|
28
105
|
typedef struct {
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
cublasFillMode_t uplo;
|
33
|
-
cublasDiagType_t diag;
|
34
|
-
dtype alpha, beta;
|
35
|
-
int m, n, k;
|
36
|
-
} args_t;
|
106
|
+
dtype alpha, beta;
|
107
|
+
int m, n, k;
|
108
|
+
} gemm_args_t;
|
37
109
|
|
38
|
-
|
39
|
-
|
110
|
+
typedef struct {
|
111
|
+
int ld;
|
112
|
+
int stride; // in element count
|
113
|
+
cublasOperation_t trans;
|
114
|
+
VALUE a;
|
115
|
+
} gemm_layout_t;
|
116
|
+
|
117
|
+
static bool
|
118
|
+
is_f_contiguous(VALUE a)
|
119
|
+
{
|
120
|
+
int i;
|
121
|
+
ssize_t s0;
|
122
|
+
cumo_narray_t *na;
|
123
|
+
|
124
|
+
switch(CUMO_RNARRAY_TYPE(a)) {
|
125
|
+
case CUMO_NARRAY_DATA_T:
|
126
|
+
case CUMO_NARRAY_FILEMAP_T:
|
127
|
+
return CUMO_TEST_COLUMN_MAJOR(a);
|
128
|
+
case CUMO_NARRAY_VIEW_T:
|
129
|
+
CumoGetNArray(a, na);
|
130
|
+
|
131
|
+
// not contiguous if it has index
|
132
|
+
for (i = 0; i < CUMO_NA_NDIM(na); ++i) {
|
133
|
+
if (CUMO_NA_IS_INDEX_AT(na, i)) return false;
|
134
|
+
}
|
135
|
+
|
136
|
+
// check f-contiguous
|
137
|
+
s0 = cumo_na_element_stride(a);
|
138
|
+
for (i = 0; i < CUMO_NA_NDIM(na); ++i) {
|
139
|
+
if (CUMO_NA_SHAPE(na)[i] == 1) continue;
|
140
|
+
if (CUMO_NA_STRIDE_AT(na, i) != s0) return false;
|
141
|
+
s0 *= CUMO_NA_SHAPE(na)[i];
|
142
|
+
}
|
143
|
+
return true;
|
144
|
+
default:
|
145
|
+
rb_raise(rb_eArgError, "NArray type : %d is not supported", CUMO_RNARRAY_TYPE(a));
|
146
|
+
}
|
147
|
+
}
|
148
|
+
|
149
|
+
static bool
|
150
|
+
is_c_contiguous(VALUE a)
|
151
|
+
{
|
152
|
+
return cumo_na_check_contiguous(a) == Qtrue;
|
153
|
+
}
|
154
|
+
|
155
|
+
static gemm_layout_t
|
156
|
+
make_gemm_layout(VALUE a)
|
40
157
|
{
|
41
|
-
|
42
|
-
|
43
|
-
dtype *c;
|
44
|
-
int ldc;
|
45
|
-
args_t *g;
|
46
|
-
static cublasHandle_t handle = 0;
|
158
|
+
cumo_narray_t *na;
|
159
|
+
gemm_layout_t layout;
|
47
160
|
|
48
|
-
a
|
49
|
-
|
50
|
-
|
51
|
-
|
161
|
+
CumoGetNArray(a, na);
|
162
|
+
|
163
|
+
if (cumo_na_debug_flag) {
|
164
|
+
printf("ndim==2 && f_contiguous:%d, c_contiguous:%d\n",
|
165
|
+
CUMO_NA_NDIM(na) == 2 && is_f_contiguous(a), is_c_contiguous(a));
|
166
|
+
}
|
52
167
|
|
53
|
-
|
54
|
-
|
55
|
-
|
168
|
+
if (CUMO_NA_NDIM(na) == 2 && is_f_contiguous(a)) {
|
169
|
+
layout.ld = ROW_SIZE(na);
|
170
|
+
layout.trans = CUBLAS_OP_T;
|
171
|
+
layout.a = a;
|
172
|
+
} else {
|
173
|
+
layout.ld = COL_SIZE(na);
|
174
|
+
layout.trans = CUBLAS_OP_N; // transposed
|
175
|
+
// force c-contiguous
|
176
|
+
layout.a = is_c_contiguous(a) ? a : rb_funcall(a, rb_intern("dup"), 0);
|
177
|
+
}
|
178
|
+
layout.stride = ROW_SIZE(na) * COL_SIZE(na);
|
179
|
+
return layout;
|
180
|
+
}
|
181
|
+
|
182
|
+
extern int cumo_na_debug_flag; // narray.c
|
183
|
+
|
184
|
+
static void
|
185
|
+
print_gemm_args(gemm_args_t* g, gemm_layout_t* a_layout, gemm_layout_t* b_layout, int stridec, int batch_count)
|
186
|
+
{
|
187
|
+
printf("transb=%d transa=%d, n=%d, m=%d, k=%d, ldb=%d, lda=%d, ldc=n=%d, strideb=%d, stridea=%d stridec=%d batch_count=%d\n",
|
188
|
+
(int)b_layout->trans,
|
189
|
+
(int)a_layout->trans,
|
190
|
+
(int)g->n,
|
191
|
+
(int)g->m,
|
192
|
+
(int)g->k,
|
193
|
+
(int)b_layout->ld,
|
194
|
+
(int)a_layout->ld,
|
195
|
+
(int)g->n,
|
196
|
+
(int)b_layout->stride,
|
197
|
+
(int)a_layout->stride,
|
198
|
+
(int)stridec,
|
199
|
+
(int)batch_count);
|
200
|
+
}
|
56
201
|
|
57
|
-
|
202
|
+
static void
|
203
|
+
<%=c_iter%>(VALUE a, VALUE b, VALUE c, gemm_args_t *g)
|
204
|
+
{
|
205
|
+
gemm_layout_t a_layout, b_layout;
|
206
|
+
cublasHandle_t handle = 0;
|
207
|
+
cublasStatus_t status = 0;
|
208
|
+
cumo_narray_t* nc;
|
209
|
+
int stridec = 0;
|
210
|
+
int batch_count = 0;
|
58
211
|
|
59
212
|
// Note that cuBLAS uses the column major matrix representation.
|
60
213
|
// We use the technique described at the following site:
|
@@ -65,15 +218,37 @@ static void
|
|
65
218
|
// c^T = nxm matrix
|
66
219
|
// c^T = b^T * a^T
|
67
220
|
//
|
68
|
-
// cublasSgemm(handle,transb,transa,n,m,k,&alpha,b,
|
221
|
+
// cublasSgemm(handle,transb,transa,n,m,k,&alpha,b,ldb,a,lda,&beta,c,ldc=n);
|
69
222
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
223
|
+
a_layout = make_gemm_layout(a);
|
224
|
+
b_layout = make_gemm_layout(b);
|
225
|
+
|
226
|
+
CumoGetNArray(c, nc);
|
227
|
+
stridec = ROW_SIZE(nc) * COL_SIZE(nc);
|
228
|
+
batch_count = CUMO_NA_SIZE(nc) / stridec;
|
229
|
+
|
230
|
+
if (cumo_na_debug_flag) print_gemm_args(g, &a_layout, &b_layout, stridec, batch_count);
|
231
|
+
handle = cumo_cuda_cublas_handle();
|
232
|
+
status = cublas<%=func_prefix%>gemmStridedBatched(
|
233
|
+
handle,
|
234
|
+
b_layout.trans,
|
235
|
+
a_layout.trans,
|
236
|
+
g->n,
|
237
|
+
g->m,
|
238
|
+
g->k,
|
239
|
+
(<%=cutype%>*)(&g->alpha),
|
240
|
+
(<%=cutype%>*)(cumo_na_get_pointer_for_read(b_layout.a) + cumo_na_get_offset(b_layout.a)),
|
241
|
+
b_layout.ld,
|
242
|
+
b_layout.stride,
|
243
|
+
(<%=cutype%>*)(cumo_na_get_pointer_for_read(a_layout.a) + cumo_na_get_offset(a_layout.a)),
|
244
|
+
a_layout.ld,
|
245
|
+
a_layout.stride,
|
246
|
+
(<%=cutype%>*)(&g->beta),
|
247
|
+
(<%=cutype%>*)(cumo_na_get_pointer_for_write(c) + cumo_na_get_offset(c)),
|
248
|
+
g->n,
|
249
|
+
stridec,
|
250
|
+
batch_count);
|
251
|
+
cumo_cuda_cublas_check_status(status);
|
77
252
|
}
|
78
253
|
|
79
254
|
/*
|
@@ -90,17 +265,6 @@ static void
|
|
90
265
|
def opt(v,tp=nil,*a)
|
91
266
|
tp ||= "String or Symbol"
|
92
267
|
case v
|
93
|
-
when /^order$/
|
94
|
-
"@param #{v} [#{tp}] if 'R': Row-major, if 'C': Column-major. (default='R')"
|
95
|
-
when /^uplo$/
|
96
|
-
"@param #{v} [#{tp}] if 'U': Upper triangle, if 'L': Lower triangle. (default='U')"
|
97
|
-
when /^side$/
|
98
|
-
"@param #{v} [#{tp}] if 'L': op(A)\\*B (left-side op), if 'R': B\\*op(A) (right-side op). (default='L')"
|
99
|
-
when /^diag$/
|
100
|
-
"@param #{v} [#{tp}] if 'U': assumed to be unit triangular, if 'N': not assumed to be unit triangular. (default='U')"
|
101
|
-
when /^trans(\w+)?$/
|
102
|
-
b = a[0] || $1
|
103
|
-
"@param #{v} [#{tp}] if 'N': Not transpose #{b}, if 'T': Transpose #{b}. (default='N')"
|
104
268
|
when "alpha"
|
105
269
|
"@param #{v} [Float] (default=1.0)"
|
106
270
|
when "beta"
|
@@ -111,15 +275,13 @@ static void
|
|
111
275
|
end
|
112
276
|
%>
|
113
277
|
<%
|
114
|
-
args_v = "a, b, [c, alpha:1, beta:0
|
278
|
+
args_v = "a, b, [c, alpha:1, beta:0]"
|
115
279
|
params = [
|
116
280
|
mat("a"),
|
117
281
|
mat("b"),
|
118
282
|
mat("c","optional",:inplace),
|
119
283
|
opt("alpha"),
|
120
284
|
opt("beta"),
|
121
|
-
opt("transa"),
|
122
|
-
opt("transb"),
|
123
285
|
].select{|x| x}.join("\n ")
|
124
286
|
%>
|
125
287
|
@overload <%=name%>(<%=args_v%>)
|
@@ -130,74 +292,56 @@ static void
|
|
130
292
|
static VALUE
|
131
293
|
<%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
|
132
294
|
{
|
133
|
-
VALUE
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
ndfunc_arg_in_t ain[3] = {{cT,2},{cT,2},{OVERWRITE,2}};
|
138
|
-
ndfunc_arg_out_t aout[1] = {{cT,2,shape}};
|
139
|
-
ndfunc_t ndf = {<%=c_iter%>, NO_LOOP, 3, 0, ain, aout};
|
140
|
-
|
141
|
-
args_t g;
|
295
|
+
VALUE a=self, b, c=Qnil, alpha, beta;
|
296
|
+
cumo_narray_t *na, *nb;
|
297
|
+
|
298
|
+
gemm_args_t g;
|
142
299
|
VALUE kw_hash = Qnil;
|
143
|
-
ID kw_table[
|
144
|
-
VALUE opts[
|
300
|
+
ID kw_table[2] = {rb_intern("alpha"), rb_intern("beta")};
|
301
|
+
VALUE opts[2] = {Qundef, Qundef};
|
145
302
|
|
146
303
|
rb_scan_args(argc, argv, "11:", &b, &c, &kw_hash);
|
147
|
-
rb_get_kwargs(kw_hash, kw_table, 0,
|
148
|
-
alpha
|
149
|
-
g.alpha
|
150
|
-
beta
|
151
|
-
g.beta
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
CHECK_INT_EQ("ka",ka,"kb",kb);
|
167
|
-
g.m = ma;
|
168
|
-
g.n = nb;
|
169
|
-
g.k = ka;
|
170
|
-
|
171
|
-
SWAP_IFROW(ma, nb, tmp);
|
304
|
+
rb_get_kwargs(kw_hash, kw_table, 0, 2, opts);
|
305
|
+
alpha = cumo_cuda_cublas_option_value(opts[0],Qnil);
|
306
|
+
g.alpha = RTEST(alpha) ? m_num_to_data(alpha) : m_one;
|
307
|
+
beta = cumo_cuda_cublas_option_value(opts[1],Qnil);
|
308
|
+
g.beta = RTEST(beta) ? m_num_to_data(beta) : m_zero;
|
309
|
+
|
310
|
+
CumoGetNArray(a, na);
|
311
|
+
CumoGetNArray(b, nb);
|
312
|
+
CHECK_DIM_GE(na, 2);
|
313
|
+
CHECK_DIM_GE(nb, 2);
|
314
|
+
|
315
|
+
if (ROW_SIZE(nb) != COL_SIZE(na)) {
|
316
|
+
rb_raise(cumo_na_eShapeError,"ROW_SIZE(b)=%d must equal to COL_SIZE(a)=%d",
|
317
|
+
(int)ROW_SIZE(nb), (int)COL_SIZE(na));
|
318
|
+
}
|
319
|
+
|
320
|
+
g.m = ROW_SIZE(na);
|
321
|
+
g.k = COL_SIZE(na);
|
322
|
+
g.n = COL_SIZE(nb);
|
172
323
|
|
173
324
|
if (c == Qnil) { // c is not given.
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
shape[1] = ma;
|
325
|
+
int ndim = CUMO_NA_NDIM(na);
|
326
|
+
size_t *shape = ALLOCA_N(size_t, ndim);
|
327
|
+
memcpy(shape, CUMO_NA_SHAPE(na), sizeof(size_t) * (ndim - 1)); // ... x m x k
|
328
|
+
shape[ndim - 1] = g.n; // ... x m x n
|
329
|
+
c = cumo_na_new(cT, ndim, shape);
|
180
330
|
} else {
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
rb_raise(nary_eShapeError,"nc=%d must be >= nb=%d",nc,nb);
|
331
|
+
cumo_narray_t *nc;
|
332
|
+
COPY_OR_CAST_TO(c, cT);
|
333
|
+
CumoGetNArray(c, nc);
|
334
|
+
CHECK_DIM_GE(nc, 2);
|
335
|
+
if (ROW_SIZE(nc) != ROW_SIZE(na)) {
|
336
|
+
rb_raise(cumo_na_eShapeError,"ROW_SIZE(c)=%d must equal to ROW_SIZE(a)=%d",
|
337
|
+
(int)ROW_SIZE(nc), (int)ROW_SIZE(na));
|
189
338
|
}
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
VALUE ans = na_ndloop3(&ndf, &g, 3, a, b, c);
|
194
|
-
|
195
|
-
if (ndf.nout == 1) { // c is not given.
|
196
|
-
return ans;
|
197
|
-
} else {
|
198
|
-
return c;
|
339
|
+
if (COL_SIZE(nc) != COL_SIZE(nb)) {
|
340
|
+
rb_raise(cumo_na_eShapeError,"COL_SIZE(c)=%d must equal to COL_SIZE(b)=%d",
|
341
|
+
(int)COL_SIZE(nc), (int)COL_SIZE(nc));
|
199
342
|
}
|
200
343
|
}
|
201
|
-
}
|
202
344
|
|
203
|
-
|
345
|
+
<%=c_iter%>(a, b, c, &g);
|
346
|
+
return c;
|
347
|
+
}
|