cumo 0.2.5 → 0.3.0.pre1
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -1
- data/README.md +12 -1
- data/cumo.gemspec +1 -1
- data/ext/cumo/cuda/cudnn.c +80 -0
- data/ext/cumo/cuda/cudnn_impl.cpp +572 -0
- data/ext/cumo/cuda/runtime.c +1 -0
- data/ext/cumo/cumo.c +5 -0
- data/ext/cumo/extconf.rb +8 -2
- data/ext/cumo/include/cumo.h +2 -2
- data/ext/cumo/include/cumo/cuda/cudnn.h +205 -0
- data/ext/cumo/include/cumo/hash_combine.hpp +17 -0
- data/ext/cumo/include/cumo/intern.h +5 -0
- data/ext/cumo/include/cumo/types/dfloat.h +1 -0
- data/ext/cumo/include/cumo/types/sfloat.h +1 -0
- data/ext/cumo/narray/gen/spec.rb +21 -0
- data/ext/cumo/narray/gen/tmpl/batch_norm.c +197 -0
- data/ext/cumo/narray/gen/tmpl/batch_norm_backward.c +191 -0
- data/ext/cumo/narray/gen/tmpl/conv.c +216 -0
- data/ext/cumo/narray/gen/tmpl/conv_grad_w.c +183 -0
- data/ext/cumo/narray/gen/tmpl/conv_transpose.c +244 -0
- data/ext/cumo/narray/gen/tmpl/gemm.c +14 -0
- data/ext/cumo/narray/gen/tmpl/pooling_backward.c +136 -0
- data/ext/cumo/narray/gen/tmpl/pooling_forward.c +136 -0
- data/ext/cumo/narray/narray.c +29 -0
- data/lib/cumo/cuda.rb +1 -0
- data/lib/cumo/cuda/cudnn.rb +88 -0
- metadata +18 -5
data/ext/cumo/narray/gen/tmpl/batch_norm_backward.c
@@ -0,0 +1,191 @@
#ifdef CUDNN_FOUND

<%
  cudnn_dtype =
      case type_name
      when 'sfloat'
          'CUDNN_DATA_FLOAT'
      when 'dfloat'
          'CUDNN_DATA_DOUBLE'
      else
          # CUDNN_DATA_HALF
          raise 'not supported'
      end
%>

// gx, ggamma, gbeta = x.batch_normalization_backward(gamma, gy, mean:, inv_std:, eps:, axis:)
static VALUE
<%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
{
    cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
    cudnnStatus_t status = 0;
    cudnnHandle_t handle = 0;
    dtype coef_alpha = 1;
    dtype coef_beta = 0;

    VALUE x=self, gamma, gy, mean, inv_std, eps, axis, gx, ggamma, gbeta;
    VALUE kw_hash = Qnil;
    ID kw_table[] = {
        rb_intern("mean"),
        rb_intern("inv_std"),
        rb_intern("eps"),
        rb_intern("axis"),
        rb_intern("gx"),
        rb_intern("ggamma"),
        rb_intern("gbeta")
    };
    VALUE opts[] = {Qundef, Qundef, Qundef, Qundef, Qundef, Qundef, Qundef};

    cumo_narray_t *nx, *ngamma, *ngy;
    size_t *x_shape, *gamma_shape, *gy_shape, reduced_shape[CUMO_NA_MAX_DIMENSION];
    size_t x_ndim, gamma_ndim, gy_ndim, reduced_ndim;

    VALUE x_cont, gamma_cont, gy_cont;
    cudnnTensorDescriptor_t x_desc = 0;
    cudnnTensorDescriptor_t bn_desc = 0;
    char *x_cont_ptr, *gamma_cont_ptr, *gy_cont_ptr, *gx_ptr, *ggamma_ptr, *gbeta_ptr;

    cudnnBatchNormMode_t mode;

    // default values
    char *mean_ptr=NULL;
    char *inv_std_ptr=NULL;
    double double_eps = 2e-5;
    int int_axis[CUMO_NA_MAX_DIMENSION] = {0};
    size_t axis_ndim = 1;

    rb_scan_args(argc, argv, "2:", &gamma, &gy, &kw_hash);
    rb_get_kwargs(kw_hash, kw_table, 0, 7, opts);
    mean = cumo_cuda_cudnn_option_value(opts[0], Qnil);
    inv_std = cumo_cuda_cudnn_option_value(opts[1], Qnil);
    eps = cumo_cuda_cudnn_option_value(opts[2], Qnil);
    axis = cumo_cuda_cudnn_option_value(opts[3], Qnil);
    gx = cumo_cuda_cudnn_option_value(opts[4], Qnil);
    ggamma = cumo_cuda_cudnn_option_value(opts[5], Qnil);
    gbeta = cumo_cuda_cudnn_option_value(opts[6], Qnil);

    if (mean != Qnil) {
        mean_ptr = cumo_na_get_offset_pointer_for_read(mean);
    }
    if (inv_std != Qnil) {
        inv_std_ptr = cumo_na_get_offset_pointer_for_read(inv_std);
    }
    if (eps != Qnil) {
        double_eps = NUM2DBL(eps);
    }
    if (axis != Qnil) {
        Check_Type(axis, T_ARRAY);
        axis_ndim = (size_t)(RARRAY_LEN(axis));
        for (size_t idim = 0; idim < axis_ndim; ++idim) {
            int_axis[idim] = NUM2INT(rb_ary_entry(axis, (long)idim));
        }
        // TODO: check axis is sorted
    }

    CumoGetNArray(x, nx);
    CumoGetNArray(gamma, ngamma);
    CumoGetNArray(gy, ngy);
    x_ndim = nx->ndim;
    x_shape = nx->shape;
    gamma_ndim = ngamma->ndim;
    gamma_shape = ngamma->shape;
    gy_ndim = ngy->ndim;
    gy_shape = ngy->shape;

    // TODO: Check that the sizes of gamma, beta, running_mean, running_var, mean, and inv_std
    // match either reduced_shape(keepdims: false) or reduced_shape(keepdims: true)
    reduced_ndim = cumo_cuda_cudnn_ReduceShape(reduced_shape, x_ndim, x_shape, axis_ndim, int_axis, 1);
    // CUMO_CUDA_CUDNN_CHECK_DIM_EQ(reduced_ndim, gamma_ndim);
    // for (size_t idim = 0; idim < reduced_ndim; ++idim) {
    //     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(reduced_shape[idim], gamma_shape[idim]);
    // }
    // CUMO_CUDA_CUDNN_CHECK_DIM_EQ(x_ndim, gy_ndim);
    // for (size_t idim = 0; idim < x_ndim; ++idim) {
    //     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(x_shape[idim], gy_shape[idim]);
    // }

    // TODO: Add ndim and shape checks (same as reduced) for mean and inv_std if given

    CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
    CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(gamma, cT);
    CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(gy, cT);
    if (mean != Qnil) CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(mean, cT);
    if (inv_std != Qnil) CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(inv_std, cT);

    x_cont = cumo_na_as_contiguous_array(x);
    gamma_cont = cumo_na_as_contiguous_array(gamma);
    gy_cont = cumo_na_as_contiguous_array(gy);
    if (mean != Qnil && cumo_na_check_contiguous(mean) != Qtrue) {
        rb_raise(rb_eRuntimeError, "mean must be contiguous");
    }
    if (inv_std != Qnil && cumo_na_check_contiguous(inv_std) != Qtrue) {
        rb_raise(rb_eRuntimeError, "inv_std must be contiguous");
    }

    x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
    gamma_cont_ptr = cumo_na_get_offset_pointer_for_read(gamma_cont);
    gy_cont_ptr = cumo_na_get_offset_pointer_for_read(gy_cont);

    // TODO: type and shape check
    if (gx == Qnil) gx = cumo_na_new(cT, x_ndim, x_shape);
    gx_ptr = cumo_na_get_offset_pointer_for_write(gx);
    if (ggamma == Qnil) ggamma = cumo_na_new(cT, gamma_ndim, gamma_shape);
    ggamma_ptr = cumo_na_get_offset_pointer_for_write(ggamma);
    if (gbeta == Qnil) gbeta = cumo_na_new(cT, gamma_ndim, gamma_shape);
    gbeta_ptr = cumo_na_get_offset_pointer_for_write(gbeta);

    status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
    if (status != CUDNN_STATUS_SUCCESS) goto BATCH_NORM_ERROR;

    mode = cumo_cuda_cudnn_GetBatchNormMode(axis_ndim, int_axis);
    status = cumo_cuda_cudnn_CreateBNTensorDescriptor(&bn_desc, x_desc, mode);
    if (status != CUDNN_STATUS_SUCCESS) goto BATCH_NORM_ERROR;
    // TODO: bn_desc may return another type, and may need to cast gamma, gy, mean, var

    handle = cumo_cuda_cudnn_handle();

    status = cudnnBatchNormalizationBackward(
            handle,
            mode,
            (void*)&coef_alpha,
            (void*)&coef_beta,
            (void*)&coef_alpha,
            (void*)&coef_beta,
            x_desc,
            x_cont_ptr,
            x_desc,
            gy_cont_ptr,
            x_desc,
            gx_ptr,
            bn_desc,
            gamma_cont_ptr,
            ggamma_ptr,
            gbeta_ptr,
            double_eps,
            mean_ptr,
            inv_std_ptr);
    if (status != CUDNN_STATUS_SUCCESS) goto BATCH_NORM_ERROR;

BATCH_NORM_ERROR:
    if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
    if (bn_desc) cudnnDestroyTensorDescriptor(bn_desc);
    cumo_cuda_cudnn_check_status(status);

    {
        VALUE ret = rb_ary_new2(3);
        rb_ary_push(ret, gx);
        rb_ary_push(ret, ggamma);
        rb_ary_push(ret, gbeta);
        return ret;
    }
}

#else // CUDNN_FOUND
VALUE cumo_cuda_eCudnnError;

static VALUE
<%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
{
    rb_raise(cumo_cuda_eCudnnError, "cuDNN is not available");
}
#endif // CUDNN_FOUND
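For context, this template compiles into an instance method on Cumo's float NArray classes (Cumo::SFloat / Cumo::DFloat). Below is a minimal usage sketch based only on the signature comment above; it assumes a CUDA device with a cuDNN-enabled build, the method name follows that comment (the actual name is registered in narray/gen/spec.rb), and the shapes and statistics are purely illustrative:

    require 'cumo'

    x     = Cumo::SFloat.new(2, 3, 4, 4).seq   # (batch, channel, h, w)
    gamma = Cumo::SFloat.ones(3)               # per-channel scale
    gy    = Cumo::SFloat.ones(2, 3, 4, 4)      # gradient w.r.t. the batch-norm output
    mean    = Cumo::SFloat.zeros(3)            # statistics saved by the forward pass (illustrative values)
    inv_std = Cumo::SFloat.ones(3)

    gx, ggamma, gbeta = x.batch_normalization_backward(
        gamma, gy, mean: mean, inv_std: inv_std, eps: 2e-5, axis: [0, 2, 3])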
data/ext/cumo/narray/gen/tmpl/conv.c
@@ -0,0 +1,216 @@
#ifdef CUDNN_FOUND

<%
  cudnn_dtype =
      case type_name
      when 'sfloat'
          'CUDNN_DATA_FLOAT'
      when 'dfloat'
          'CUDNN_DATA_DOUBLE'
      else
          # CUDNN_DATA_HALF
          raise 'not supported'
      end
%>

// cover_all=true is not supported with CUDNN
// dilation > 1 is not supported yet
// x.conv(w, b: nil, stride: 1, pad: 0, y: nil)
static VALUE
<%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
{
    cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
    cudnnStatus_t status = 0;
    cudnnHandle_t handle = 0;
    dtype alpha = 1;
    dtype beta = 0;

    VALUE x=self, w, b, stride, pad, y;
    VALUE kw_hash = Qnil;
    ID kw_table[4] = {rb_intern("stride"), rb_intern("pad"), rb_intern("b"), rb_intern("y")};
    VALUE opts[4] = {Qundef, Qundef, Qundef, Qundef};

    size_t ndim;
    cumo_narray_t *nx, *nw;
    size_t *x_shape, *w_shape;
    size_t out_channels, batch_size;

    VALUE x_cont, w_cont;
    cudnnTensorDescriptor_t x_desc = 0;
    cudnnTensorDescriptor_t y_desc = 0;
    cudnnTensorDescriptor_t b_desc = 0;
    cudnnFilterDescriptor_t w_desc = 0;
    cudnnConvolutionDescriptor_t conv_desc = 0;
    char *x_cont_ptr, *w_cont_ptr, *y_ptr;

    cudnnConvolutionFwdAlgoPerf_t perf_result;
    cudnnConvolutionFwdAlgo_t algo;
    size_t max_workspace_size = CUMO_CUDA_CUDNN_DEFAULT_MAX_WORKSPACE_SIZE;
    size_t workspace_size;
    char* workspace = 0;

    int int_stride[CUMO_NA_MAX_DIMENSION];
    int int_pad[CUMO_NA_MAX_DIMENSION];

    rb_scan_args(argc, argv, "1:", &w, &kw_hash);
    rb_get_kwargs(kw_hash, kw_table, 0, 4, opts);
    stride = cumo_cuda_cudnn_option_value(opts[0], Qnil);
    pad = cumo_cuda_cudnn_option_value(opts[1], Qnil);
    b = cumo_cuda_cudnn_option_value(opts[2], Qnil);
    y = cumo_cuda_cudnn_option_value(opts[3], Qnil);

    CumoGetNArray(x, nx);
    CumoGetNArray(w, nw);

    CUMO_CUDA_CUDNN_CHECK_DIM_EQ(nx->ndim, nw->ndim);
    CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
    CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(w, cT);
    if (nx->ndim - 2 < 2) {
        rb_raise(cumo_na_eShapeError, "CUDNN convolution requires number of spatial "
                "dimensions to be greater than or equal to 2, but %d", nx->ndim - 2);
    }
    ndim = nx->ndim - 2; // Number of spatial dimensions

    cumo_cuda_cudnn_get_int_ary(int_stride, stride, ndim, 1);
    cumo_cuda_cudnn_get_int_ary(int_pad, pad, ndim, 0);

    x_shape = nx->shape;
    w_shape = nw->shape;
    batch_size = x_shape[0];   // x_shape = (batch_size, in_channels, d_1, d_2, ..., d_N)
    out_channels = w_shape[0]; // w_shape = (out_channels, in_channels, k_1, k_2, ..., k_N)
    if (x_shape[1] != w_shape[1]) {
        rb_raise(cumo_na_eShapeError, "x_shape[1]:%d does not match with w_shape[1]:%d",
                (int)x_shape[1], (int)w_shape[1]);
    }

    if (y != Qnil) {
        CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(y, cT);
    }
    else {
        size_t *y_shape = ALLOCA_N(size_t, ndim + 2);
        // out_shape = (batch_size, out_channels, out_1, out_2, ..., out_N)
        y_shape[0] = batch_size;
        y_shape[1] = out_channels;
        for (size_t i = 0; i < ndim; ++i) {
            y_shape[i + 2] = cumo_cuda_cudnn_GetConvOutDim(
                    x_shape[i + 2], w_shape[i + 2], int_stride[i], int_pad[i]);
        }
        y = cumo_na_new(cT, ndim + 2, y_shape);
    }

    x_cont = cumo_na_as_contiguous_array(x);
    w_cont = cumo_na_as_contiguous_array(w);

    x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
    w_cont_ptr = cumo_na_get_offset_pointer_for_read(w_cont);
    y_ptr = cumo_na_get_offset_pointer_for_write(y);

    status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
    if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
    status = cumo_cuda_cudnn_CreateTensorDescriptor(&y_desc, y, cudnn_dtype);
    if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
    status = cumo_cuda_cudnn_CreateFilterDescriptor(&w_desc, w_cont, cudnn_dtype);
    if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
    status = cumo_cuda_cudnn_CreateConvolutionDescriptor(&conv_desc, ndim, int_stride, int_pad, cudnn_dtype);
    if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;

    handle = cumo_cuda_cudnn_handle();

    // auto tune
    status = cumo_cuda_cudnn_FindConvolutionForwardAlgorithm(
            &perf_result,
            handle,
            x_desc,
            x_cont,
            w_desc,
            w_cont,
            conv_desc,
            y_desc,
            y,
            max_workspace_size,
            int_stride,
            int_pad,
            ndim,
            cudnn_dtype);
    if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
    algo = perf_result.algo;
    workspace_size = perf_result.memory;

    workspace = cumo_cuda_runtime_malloc(max_workspace_size);
    status = cudnnConvolutionForward(
            handle,
            (void*)&alpha,
            x_desc,
            (void*)x_cont_ptr,
            w_desc,
            (void*)w_cont_ptr,
            conv_desc,
            algo,
            (void*)workspace,
            workspace_size,
            (void*)&beta,
            y_desc,
            (void*)y_ptr);
    if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;

    if (b != Qnil) {
        size_t new_shape[CUMO_NA_MAX_DIMENSION];
        VALUE b_cont;
        char* b_cont_ptr;
        cumo_narray_t *nb, *nb_cont;
        size_t *b_shape;
        int b_ndim;

        CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(b, cT);
        CumoGetNArray(b, nb);
        new_shape[0] = 1;
        new_shape[1] = nb->size;
        for (size_t i = 0; i < ndim; ++i) {
            new_shape[i + 2] = 1;
        }
        b_cont = cumo_na_as_contiguous_array(b);
        b_cont_ptr = cumo_na_get_offset_pointer_for_read(b_cont);
        CumoGetNArray(b_cont, nb_cont);
        b_shape = nb_cont->shape;
        b_ndim = nb_cont->ndim;
        // reshape b
        nb_cont->ndim = ndim + 2;
        nb_cont->shape = new_shape;
        status = cumo_cuda_cudnn_CreateTensorDescriptor(&b_desc, b_cont, cudnn_dtype);
        // restore b.shape
        nb_cont->ndim = b_ndim;
        nb_cont->shape = b_shape;
        if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;

        status = cudnnAddTensor(
                handle,
                (void*)&alpha,
                b_desc,
                (void*)b_cont_ptr,
                (void*)&alpha,
                y_desc,
                (void*)y_ptr);
        if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
    }

CONV_ERROR:
    if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
    if (y_desc) cudnnDestroyTensorDescriptor(y_desc);
    if (b_desc) cudnnDestroyTensorDescriptor(b_desc);
    if (w_desc) cudnnDestroyFilterDescriptor(w_desc);
    if (conv_desc) cudnnDestroyConvolutionDescriptor(conv_desc);
    if (workspace) cumo_cuda_runtime_free(workspace);
    cumo_cuda_cudnn_check_status(status);

    return y;
}

#else // CUDNN_FOUND
VALUE cumo_cuda_eCUDNNError;

static VALUE
<%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
{
    rb_raise(cumo_cuda_eCUDNNError, "cuDNN is not available");
}
#endif // CUDNN_FOUND
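The forward-convolution template above is likewise exposed as an instance method. A minimal sketch of the call given in its signature comment (x.conv(w, b: nil, stride: 1, pad: 0, y: nil)), with illustrative shapes and stride/pad written as per-spatial-dimension arrays, again assuming a cuDNN-enabled build:

    require 'cumo'

    x = Cumo::SFloat.new(1, 3, 8, 8).seq    # (batch_size, in_channels, d_1, d_2)
    w = Cumo::SFloat.new(4, 3, 3, 3).rand   # (out_channels, in_channels, k_1, k_2)
    b = Cumo::SFloat.zeros(4)               # optional per-output-channel bias

    y = x.conv(w, b: b, stride: [1, 1], pad: [1, 1])
    # with 3x3 kernels, stride 1 and pad 1, y.shape stays [1, 4, 8, 8]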
data/ext/cumo/narray/gen/tmpl/conv_grad_w.c
@@ -0,0 +1,183 @@
#ifdef CUDNN_FOUND

<%
  cudnn_dtype =
      case type_name
      when 'sfloat'
          'CUDNN_DATA_FLOAT'
      when 'dfloat'
          'CUDNN_DATA_DOUBLE'
      else
          # CUDNN_DATA_HALF
          raise 'not supported'
      end
%>

static void
cumo_cuda_cudnn_get_sizet_ary(size_t *sizet_ary, VALUE ary, size_t ndim)
{
    Check_Type(ary, T_ARRAY);
    CUMO_CUDA_CUDNN_CHECK_DIM_EQ((size_t)(RARRAY_LEN(ary)), ndim);
    for (size_t idim = 0; idim < ndim; ++idim) {
        sizet_ary[idim] = NUM2SIZET(rb_ary_entry(ary, (long)idim));
    }
}

// cover_all=true is not supported with CUDNN
// gw = x.conv_grad_w(gy, w_shape, stride: 1, pad: 0, gw: nil)
static VALUE
<%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
{
    cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
    cudnnStatus_t status = 0;
    cudnnHandle_t handle = 0;
    dtype one = 1;
    dtype zero = 0;

    VALUE x=self, gy, w_shape, stride, pad, gw;
    VALUE kw_hash = Qnil;
    ID kw_table[] = {rb_intern("stride"), rb_intern("pad"), rb_intern("gw")};
    VALUE opts[] = {Qundef, Qundef, Qundef};

    size_t ndim;
    cumo_narray_t *nx, *ngy;

    VALUE x_cont, gy_cont;
    cudnnTensorDescriptor_t x_desc = 0;
    cudnnTensorDescriptor_t gy_desc = 0;
    cudnnConvolutionDescriptor_t conv_desc = 0;
    cudnnFilterDescriptor_t gw_desc = 0;
    char *x_cont_ptr, *gy_cont_ptr, *gw_ptr;

    cudnnConvolutionBwdFilterAlgoPerf_t perf_result;
    cudnnConvolutionBwdFilterAlgo_t algo;
    size_t max_workspace_size = CUMO_CUDA_CUDNN_DEFAULT_MAX_WORKSPACE_SIZE;
    size_t workspace_size;
    char* workspace = 0;

    size_t sizet_w_shape[CUMO_NA_MAX_DIMENSION];
    int int_stride[CUMO_NA_MAX_DIMENSION];
    int int_pad[CUMO_NA_MAX_DIMENSION];

    rb_scan_args(argc, argv, "2:", &gy, &w_shape, &kw_hash);
    rb_get_kwargs(kw_hash, kw_table, 0, 3, opts);
    stride = cumo_cuda_cudnn_option_value(opts[0], Qnil);
    pad = cumo_cuda_cudnn_option_value(opts[1], Qnil);
    gw = cumo_cuda_cudnn_option_value(opts[2], Qnil);

    CumoGetNArray(x, nx);
    CumoGetNArray(gy, ngy);

    CUMO_CUDA_CUDNN_CHECK_DIM_EQ(nx->ndim, ngy->ndim);
    CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
    CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(gy, cT);
    if (nx->ndim - 2 < 2) {
        rb_raise(cumo_na_eShapeError, "CUDNN convolution requires number of spatial "
                "dimensions to be greater than or equal to 2, but %d", nx->ndim - 2);
    }
    ndim = nx->ndim - 2; // Number of spatial dimensions

    cumo_cuda_cudnn_get_sizet_ary(sizet_w_shape, w_shape, ndim + 2);
    cumo_cuda_cudnn_get_int_ary(int_stride, stride, ndim, 1);
    cumo_cuda_cudnn_get_int_ary(int_pad, pad, ndim, 0);

    if (gw != Qnil) {
        CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(gw, cT);
        assert(cumo_na_check_contiguous(gw) == Qtrue);
    }
    else {
        gw = cumo_na_new(cT, ndim + 2, sizet_w_shape);
    }
    // w_shape = (out_channels, in_channels, k_1, k_2, ..., k_N)
    // x_shape = (batch_size, in_channels, d_1, d_2, ..., d_N)
    // y_shape = (batch_size, out_channels, out_1, out_2, ..., out_N)
    CUMO_CUDA_CUDNN_CHECK_DIM_EQ(nx->shape[0], ngy->shape[0]);
    CUMO_CUDA_CUDNN_CHECK_DIM_EQ(sizet_w_shape[0], ngy->shape[1]);
    CUMO_CUDA_CUDNN_CHECK_DIM_EQ(sizet_w_shape[1], nx->shape[1]);

    {
        // shape check of gy
        size_t *y_shape = ngy->shape;
        size_t *x_shape = nx->shape;
        for (size_t i = 0; i < ndim; ++i) {
            // TODO: raise
            assert(y_shape[i + 2] == cumo_cuda_cudnn_GetConvOutDim(
                    x_shape[i + 2], sizet_w_shape[i + 2], int_stride[i], int_pad[i]));
        }
    }

    x_cont = cumo_na_as_contiguous_array(x);
    gy_cont = cumo_na_as_contiguous_array(gy);

    x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
    gy_cont_ptr = cumo_na_get_offset_pointer_for_read(gy_cont);
    gw_ptr = cumo_na_get_offset_pointer_for_write(gw);

    status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
    if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;
    status = cumo_cuda_cudnn_CreateTensorDescriptor(&gy_desc, gy_cont, cudnn_dtype);
    if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;
    status = cumo_cuda_cudnn_CreateFilterDescriptor(&gw_desc, gw, cudnn_dtype);
    if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;
    status = cumo_cuda_cudnn_CreateConvolutionDescriptor(&conv_desc, ndim, int_stride, int_pad, cudnn_dtype);
    if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;

    handle = cumo_cuda_cudnn_handle();

    // auto tune
    status = cumo_cuda_cudnn_FindConvolutionBackwardFilterAlgorithm(
            &perf_result,
            handle,
            x_desc,
            x_cont,
            gy_desc,
            gy_cont,
            conv_desc,
            gw_desc,
            gw,
            max_workspace_size,
            int_stride,
            int_pad,
            ndim,
            cudnn_dtype);
    if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;
    algo = perf_result.algo;
    workspace_size = perf_result.memory;

    workspace = cumo_cuda_runtime_malloc(max_workspace_size);
    status = cudnnConvolutionBackwardFilter(
            handle,
            (void*)&one,
            x_desc,
            (void*)x_cont_ptr,
            gy_desc,
            (void*)gy_cont_ptr,
            conv_desc,
            algo,
            (void*)workspace,
            workspace_size,
            (void*)&zero,
            gw_desc,
            (void*)gw_ptr);
    if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;

CONV_GRAD_W_ERROR:
    if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
    if (gy_desc) cudnnDestroyTensorDescriptor(gy_desc);
    if (gw_desc) cudnnDestroyFilterDescriptor(gw_desc);
    if (conv_desc) cudnnDestroyConvolutionDescriptor(conv_desc);
    if (workspace) cumo_cuda_runtime_free(workspace);
    cumo_cuda_cudnn_check_status(status);

    return gw;
}

#else // CUDNN_FOUND
VALUE cumo_cuda_eCUDNNError;

static VALUE
<%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
{
    rb_raise(cumo_cuda_eCUDNNError, "cuDNN is not available");
}
#endif // CUDNN_FOUND
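The filter-gradient template pairs with the forward convolution: it takes the forward input x and the output gradient gy and returns gw with the requested w_shape, which is passed explicitly because the filter itself is not an argument. A minimal sketch following the signature comment, under the same assumptions (cuDNN-enabled build, illustrative shapes) as the examples above:

    require 'cumo'

    x  = Cumo::SFloat.new(1, 3, 8, 8).seq   # forward input
    gy = Cumo::SFloat.ones(1, 4, 8, 8)      # gradient w.r.t. the convolution output
    w_shape = [4, 3, 3, 3]                  # (out_channels, in_channels, k_1, k_2)

    gw = x.conv_grad_w(gy, w_shape, stride: [1, 1], pad: [1, 1])
    # gw.shape => [4, 3, 3, 3]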