cumo 0.2.5 → 0.3.0.pre1

@@ -0,0 +1,191 @@
+ #ifdef CUDNN_FOUND
+
+ <%
+   cudnn_dtype =
+     case type_name
+     when 'sfloat'
+       'CUDNN_DATA_FLOAT'
+     when 'dfloat'
+       'CUDNN_DATA_DOUBLE'
+     else
+       # CUDNN_DATA_HALF
+       raise 'not supported'
+     end
+ %>
+
+ // gx, ggamma, gbeta = x.batch_normalization_backward(gamma, gy, mean:, inv_std:, eps:, axis:)
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
+     cudnnStatus_t status = 0;
+     cudnnHandle_t handle = 0;
+     dtype coef_alpha = 1;
+     dtype coef_beta = 0;
+
+     VALUE x=self, gamma, gy, mean, inv_std, eps, axis, gx, ggamma, gbeta;
+     VALUE kw_hash = Qnil;
+     ID kw_table[] = {
+         rb_intern("mean"),
+         rb_intern("inv_std"),
+         rb_intern("eps"),
+         rb_intern("axis"),
+         rb_intern("gx"),
+         rb_intern("ggamma"),
+         rb_intern("gbeta")
+     };
+     VALUE opts[] = {Qundef, Qundef, Qundef, Qundef, Qundef, Qundef, Qundef};
+
+     cumo_narray_t *nx, *ngamma, *ngy;
+     size_t *x_shape, *gamma_shape, *gy_shape, reduced_shape[CUMO_NA_MAX_DIMENSION];
+     size_t x_ndim, gamma_ndim, gy_ndim, reduced_ndim;
+
+     VALUE x_cont, gamma_cont, gy_cont;
+     cudnnTensorDescriptor_t x_desc = 0;
+     cudnnTensorDescriptor_t bn_desc = 0;
+     char *x_cont_ptr, *gamma_cont_ptr, *gy_cont_ptr, *gx_ptr, *ggamma_ptr, *gbeta_ptr;
+
+     cudnnBatchNormMode_t mode;
+
+     // default values
+     char *mean_ptr=NULL;
+     char *inv_std_ptr=NULL;
+     double double_eps = 2e-5;
+     int int_axis[CUMO_NA_MAX_DIMENSION] = {0};
+     size_t axis_ndim = 1;
+
+     rb_scan_args(argc, argv, "2:", &gamma, &gy, &kw_hash);
+     rb_get_kwargs(kw_hash, kw_table, 0, 7, opts);
+     mean = cumo_cuda_cudnn_option_value(opts[0], Qnil);
+     inv_std = cumo_cuda_cudnn_option_value(opts[1], Qnil);
+     eps = cumo_cuda_cudnn_option_value(opts[2], Qnil);
+     axis = cumo_cuda_cudnn_option_value(opts[3], Qnil);
+     gx = cumo_cuda_cudnn_option_value(opts[4], Qnil);
+     ggamma = cumo_cuda_cudnn_option_value(opts[5], Qnil);
+     gbeta = cumo_cuda_cudnn_option_value(opts[6], Qnil);
+
+     if (mean != Qnil) {
+         mean_ptr = cumo_na_get_offset_pointer_for_read(mean);
+     }
+     if (inv_std != Qnil) {
+         inv_std_ptr = cumo_na_get_offset_pointer_for_read(inv_std);
+     }
+     if (eps != Qnil) {
+         double_eps = NUM2DBL(eps);
+     }
+     if (axis != Qnil) {
+         Check_Type(axis, T_ARRAY);
+         axis_ndim = (size_t)(RARRAY_LEN(axis));
+         for (size_t idim = 0; idim < axis_ndim; ++idim) {
+             int_axis[idim] = NUM2INT(rb_ary_entry(axis, (long)idim));
+         }
+         // TODO: check axis is sorted
+     }
+
+     CumoGetNArray(x, nx);
+     CumoGetNArray(gamma, ngamma);
+     CumoGetNArray(gy, ngy);
+     x_ndim = nx->ndim;
+     x_shape = nx->shape;
+     gamma_ndim = ngamma->ndim;
+     gamma_shape = ngamma->shape;
+     gy_ndim = ngy->ndim;
+     gy_shape = ngy->shape;
+
+     // TODO: Check that the sizes of gamma, beta, running_mean, running_var, mean, and inv_std
+     // match either reduced_shape(keepdims: false) or reduced_shape(keepdims: true)
+     reduced_ndim = cumo_cuda_cudnn_ReduceShape(reduced_shape, x_ndim, x_shape, axis_ndim, int_axis, 1);
+     // CUMO_CUDA_CUDNN_CHECK_DIM_EQ(reduced_ndim, gamma_ndim);
+     // for (size_t idim = 0; idim < reduced_ndim; ++idim) {
+     //     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(reduced_shape[idim], gamma_shape[idim]);
+     // }
+     // CUMO_CUDA_CUDNN_CHECK_DIM_EQ(x_ndim, gy_ndim);
+     // for (size_t idim = 0; idim < x_ndim; ++idim) {
+     //     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(x_shape[idim], gy_shape[idim]);
+     // }
+
+     // TODO: Check ndim and shape (same as reduced) of mean and inv_std if given
+
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(gamma, cT);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(gy, cT);
+     if (mean != Qnil) CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(mean, cT);
+     if (inv_std != Qnil) CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(inv_std, cT);
+
+     x_cont = cumo_na_as_contiguous_array(x);
+     gamma_cont = cumo_na_as_contiguous_array(gamma);
+     gy_cont = cumo_na_as_contiguous_array(gy);
+     if (mean != Qnil && cumo_na_check_contiguous(mean) != Qtrue) {
+         rb_raise(rb_eRuntimeError, "mean must be contiguous");
+     }
+     if (inv_std != Qnil && cumo_na_check_contiguous(inv_std) != Qtrue) {
+         rb_raise(rb_eRuntimeError, "inv_std must be contiguous");
+     }
+
+     x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
+     gamma_cont_ptr = cumo_na_get_offset_pointer_for_read(gamma_cont);
+     gy_cont_ptr = cumo_na_get_offset_pointer_for_read(gy_cont);
+
+     // TODO: type and shape check
+     if (gx == Qnil) gx = cumo_na_new(cT, x_ndim, x_shape);
+     gx_ptr = cumo_na_get_offset_pointer_for_write(gx);
+     if (ggamma == Qnil) ggamma = cumo_na_new(cT, gamma_ndim, gamma_shape);
+     ggamma_ptr = cumo_na_get_offset_pointer_for_write(ggamma);
+     if (gbeta == Qnil) gbeta = cumo_na_new(cT, gamma_ndim, gamma_shape);
+     gbeta_ptr = cumo_na_get_offset_pointer_for_write(gbeta);
+
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto BATCH_NORM_ERROR;
+
+     mode = cumo_cuda_cudnn_GetBatchNormMode(axis_ndim, int_axis);
+     status = cumo_cuda_cudnn_CreateBNTensorDescriptor(&bn_desc, x_desc, mode);
+     if (status != CUDNN_STATUS_SUCCESS) goto BATCH_NORM_ERROR;
+     // TODO: bn_desc may use another dtype; gamma, gy, mean, and var may need casting
+
+     handle = cumo_cuda_cudnn_handle();
+
+     status = cudnnBatchNormalizationBackward(
+             handle,
+             mode,
+             (void*)&coef_alpha,
+             (void*)&coef_beta,
+             (void*)&coef_alpha,
+             (void*)&coef_beta,
+             x_desc,
+             x_cont_ptr,
+             x_desc,
+             gy_cont_ptr,
+             x_desc,
+             gx_ptr,
+             bn_desc,
+             gamma_cont_ptr,
+             ggamma_ptr,
+             gbeta_ptr,
+             double_eps,
+             mean_ptr,
+             inv_std_ptr);
+     if (status != CUDNN_STATUS_SUCCESS) goto BATCH_NORM_ERROR;
+
+ BATCH_NORM_ERROR:
+     if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
+     if (bn_desc) cudnnDestroyTensorDescriptor(bn_desc);
+     cumo_cuda_cudnn_check_status(status);
+
+     {
+         VALUE ret = rb_ary_new2(3);
+         rb_ary_push(ret, gx);
+         rb_ary_push(ret, ggamma);
+         rb_ary_push(ret, gbeta);
+         return ret;
+     }
+ }
+
+ #else // CUDNN_FOUND
+ VALUE cumo_cuda_eCudnnError;
+
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     rb_raise(cumo_cuda_eCudnnError, "cuDNN is not available");
+ }
+ #endif // CUDNN_FOUND
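
For context, a minimal Ruby-level usage sketch of the method this template generates, written only against the signature comment above. It is an illustrative assumption, not code from the gem: the Cumo::SFloat constructors and reductions are assumed to mirror Numo's API, the statistics are reduced over the non-channel axes of an NCHW batch, and eps matches the 2e-5 default in the C code.

  # Hypothetical sketch: gradients of batch normalization for an NCHW batch.
  require 'cumo'

  x     = Cumo::SFloat.new(32, 3, 28, 28).rand   # input saved from the forward pass
  gamma = Cumo::SFloat.ones(3)                   # per-channel scale
  gy    = Cumo::SFloat.new(32, 3, 28, 28).rand   # gradient w.r.t. the forward output

  # Per-channel statistics from the forward pass (assumed reductions over axes 0, 2, 3).
  mean    = x.mean(axis: [0, 2, 3])
  inv_std = (x.var(axis: [0, 2, 3]) + 2e-5) ** -0.5

  gx, ggamma, gbeta = x.batch_normalization_backward(
    gamma, gy, mean: mean, inv_std: inv_std, eps: 2e-5, axis: [0, 2, 3])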
@@ -0,0 +1,216 @@
+ #ifdef CUDNN_FOUND
+
+ <%
+   cudnn_dtype =
+     case type_name
+     when 'sfloat'
+       'CUDNN_DATA_FLOAT'
+     when 'dfloat'
+       'CUDNN_DATA_DOUBLE'
+     else
+       # CUDNN_DATA_HALF
+       raise 'not supported'
+     end
+ %>
+
+ // cover_all=true is not supported with CUDNN
+ // dilation > 1 is not supported yet
+ // x.conv(w, b: nil, stride: 1, pad: 0, y: nil)
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
+     cudnnStatus_t status = 0;
+     cudnnHandle_t handle = 0;
+     dtype alpha = 1;
+     dtype beta = 0;
+
+     VALUE x=self, w, b, stride, pad, y;
+     VALUE kw_hash = Qnil;
+     ID kw_table[4] = {rb_intern("stride"), rb_intern("pad"), rb_intern("b"), rb_intern("y")};
+     VALUE opts[4] = {Qundef, Qundef, Qundef, Qundef};
+
+     size_t ndim;
+     cumo_narray_t *nx, *nw;
+     size_t *x_shape, *w_shape;
+     size_t out_channels, batch_size;
+
+     VALUE x_cont, w_cont;
+     cudnnTensorDescriptor_t x_desc = 0;
+     cudnnTensorDescriptor_t y_desc = 0;
+     cudnnTensorDescriptor_t b_desc = 0;
+     cudnnFilterDescriptor_t w_desc = 0;
+     cudnnConvolutionDescriptor_t conv_desc = 0;
+     char *x_cont_ptr, *w_cont_ptr, *y_ptr;
+
+     cudnnConvolutionFwdAlgoPerf_t perf_result;
+     cudnnConvolutionFwdAlgo_t algo;
+     size_t max_workspace_size = CUMO_CUDA_CUDNN_DEFAULT_MAX_WORKSPACE_SIZE;
+     size_t workspace_size;
+     char* workspace = 0;
+
+     int int_stride[CUMO_NA_MAX_DIMENSION];
+     int int_pad[CUMO_NA_MAX_DIMENSION];
+
+     rb_scan_args(argc, argv, "1:", &w, &kw_hash);
+     rb_get_kwargs(kw_hash, kw_table, 0, 4, opts);
+     stride = cumo_cuda_cudnn_option_value(opts[0], Qnil);
+     pad = cumo_cuda_cudnn_option_value(opts[1], Qnil);
+     b = cumo_cuda_cudnn_option_value(opts[2], Qnil);
+     y = cumo_cuda_cudnn_option_value(opts[3], Qnil);
+
+     CumoGetNArray(x, nx);
+     CumoGetNArray(w, nw);
+
+     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(nx->ndim, nw->ndim);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(w, cT);
+     if (nx->ndim - 2 < 2) {
+         rb_raise(cumo_na_eShapeError, "CUDNN convolution requires number of spatial "
+                 "dimensions to be greater than or equal to 2, but %d", nx->ndim - 2);
+     }
+     ndim = nx->ndim - 2; // Number of spatial dimensions
+
+     cumo_cuda_cudnn_get_int_ary(int_stride, stride, ndim, 1);
+     cumo_cuda_cudnn_get_int_ary(int_pad, pad, ndim, 0);
+
+     x_shape = nx->shape;
+     w_shape = nw->shape;
+     batch_size = x_shape[0];   // x_shape = (batch_size, in_channels, d_1, d_2, ..., d_N)
+     out_channels = w_shape[0]; // w.shape = (out_channels, in_channels, k_1, k_2, ..., k_N)
+     if (x_shape[1] != w_shape[1]) {
+         rb_raise(cumo_na_eShapeError, "x_shape[1]:%d does not match with w_shape[1]:%d",
+                 (int)x_shape[1], (int)w_shape[1]);
+     }
+
+     if (y != Qnil) {
+         CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(y, cT);
+     }
+     else {
+         size_t *y_shape = ALLOCA_N(size_t, ndim + 2);
+         // out_shape = (batch_size, out_channels, out_1, out_2, ..., out_N)
+         y_shape[0] = batch_size;
+         y_shape[1] = out_channels;
+         for (size_t i = 0; i < ndim; ++i) {
+             y_shape[i + 2] = cumo_cuda_cudnn_GetConvOutDim(
+                     x_shape[i + 2], w_shape[i + 2], int_stride[i], int_pad[i]);
+         }
+         y = cumo_na_new(cT, ndim + 2, y_shape);
+     }
+
+     x_cont = cumo_na_as_contiguous_array(x);
+     w_cont = cumo_na_as_contiguous_array(w);
+
+     x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
+     w_cont_ptr = cumo_na_get_offset_pointer_for_read(w_cont);
+     y_ptr = cumo_na_get_offset_pointer_for_write(y);
+
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&y_desc, y, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     status = cumo_cuda_cudnn_CreateFilterDescriptor(&w_desc, w_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     status = cumo_cuda_cudnn_CreateConvolutionDescriptor(&conv_desc, ndim, int_stride, int_pad, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+
+     handle = cumo_cuda_cudnn_handle();
+
+     // auto tune
+     status = cumo_cuda_cudnn_FindConvolutionForwardAlgorithm(
+             &perf_result,
+             handle,
+             x_desc,
+             x_cont,
+             w_desc,
+             w_cont,
+             conv_desc,
+             y_desc,
+             y,
+             max_workspace_size,
+             int_stride,
+             int_pad,
+             ndim,
+             cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     algo = perf_result.algo;
+     workspace_size = perf_result.memory;
+
+     workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+     status = cudnnConvolutionForward(
+             handle,
+             (void*)&alpha,
+             x_desc,
+             (void*)x_cont_ptr,
+             w_desc,
+             (void*)w_cont_ptr,
+             conv_desc,
+             algo,
+             (void*)workspace,
+             workspace_size,
+             (void*)&beta,
+             y_desc,
+             (void*)y_ptr);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+
+     if (b != Qnil) {
+         size_t new_shape[CUMO_NA_MAX_DIMENSION];
+         VALUE b_cont;
+         char* b_cont_ptr;
+         cumo_narray_t *nb, *nb_cont;
+         size_t *b_shape;
+         int b_ndim;
+
+         CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(b, cT);
+         CumoGetNArray(b, nb);
+         new_shape[0] = 1;
+         new_shape[1] = nb->size;
+         for (size_t i = 0; i < ndim; ++i) {
+             new_shape[i + 2] = 1;
+         }
+         b_cont = cumo_na_as_contiguous_array(b);
+         b_cont_ptr = cumo_na_get_offset_pointer_for_read(b_cont);
+         CumoGetNArray(b_cont, nb_cont);
+         b_shape = nb_cont->shape;
+         b_ndim = nb_cont->ndim;
+         // reshape b
+         nb_cont->ndim = ndim + 2;
+         nb_cont->shape = new_shape;
+         status = cumo_cuda_cudnn_CreateTensorDescriptor(&b_desc, b_cont, cudnn_dtype);
+         // restore b.shape
+         nb_cont->ndim = b_ndim;
+         nb_cont->shape = b_shape;
+         if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+
+         status = cudnnAddTensor(
+                 handle,
+                 (void*)&alpha,
+                 b_desc,
+                 (void*)b_cont_ptr,
+                 (void*)&alpha,
+                 y_desc,
+                 (void*)y_ptr);
+         if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     }
+
+ CONV_ERROR:
+     if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
+     if (y_desc) cudnnDestroyTensorDescriptor(y_desc);
+     if (b_desc) cudnnDestroyTensorDescriptor(b_desc);
+     if (w_desc) cudnnDestroyFilterDescriptor(w_desc);
+     if (conv_desc) cudnnDestroyConvolutionDescriptor(conv_desc);
+     if (workspace) cumo_cuda_runtime_free(workspace);
+     cumo_cuda_cudnn_check_status(status);
+
+     return y;
+ }
+
+ #else // CUDNN_FOUND
+ VALUE cumo_cuda_eCUDNNError;
+
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     rb_raise(cumo_cuda_eCUDNNError, "cuDNN is not available");
+ }
+ #endif // CUDNN_FOUND
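
When y is not supplied, the spatial output sizes come from the usual convolution formula with no dilation (out_i = (d_i + 2*pad_i - k_i) / stride_i + 1), consistent with the "dilation > 1 is not supported yet" note above. Below is a minimal usage sketch of the generated Ruby method, based only on the signature comment in the template; the Cumo::SFloat calls and the per-dimension arrays for stride/pad are assumptions, not code from the gem.

  # Hypothetical sketch: 2-D convolution of an NCHW batch with an optional bias.
  require 'cumo'

  x = Cumo::SFloat.new(8, 3, 32, 32).rand   # (batch_size, in_channels, d_1, d_2)
  w = Cumo::SFloat.new(16, 3, 3, 3).rand    # (out_channels, in_channels, k_1, k_2)
  b = Cumo::SFloat.zeros(16)

  # With pad 1 and stride 1, out_i = (32 + 2*1 - 3) / 1 + 1 = 32.
  y = x.conv(w, b: b, stride: [1, 1], pad: [1, 1])
  y.shape  # => [8, 16, 32, 32]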
@@ -0,0 +1,183 @@
+ #ifdef CUDNN_FOUND
+
+ <%
+   cudnn_dtype =
+     case type_name
+     when 'sfloat'
+       'CUDNN_DATA_FLOAT'
+     when 'dfloat'
+       'CUDNN_DATA_DOUBLE'
+     else
+       # CUDNN_DATA_HALF
+       raise 'not supported'
+     end
+ %>
+
+ static void
+ cumo_cuda_cudnn_get_sizet_ary(size_t *sizet_ary, VALUE ary, size_t ndim)
+ {
+     Check_Type(ary, T_ARRAY);
+     CUMO_CUDA_CUDNN_CHECK_DIM_EQ((size_t)(RARRAY_LEN(ary)), ndim);
+     for (size_t idim = 0; idim < ndim; ++idim) {
+         sizet_ary[idim] = NUM2SIZET(rb_ary_entry(ary, (long)idim));
+     }
+ }
+
+ // cover_all=true is not supported with CUDNN
+ // gw = x.conv_grad_w(gy, w_shape, stride: 1, pad: 0, gw: nil)
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
+     cudnnStatus_t status = 0;
+     cudnnHandle_t handle = 0;
+     dtype one = 1;
+     dtype zero = 0;
+
+     VALUE x=self, gy, w_shape, stride, pad, gw;
+     VALUE kw_hash = Qnil;
+     ID kw_table[] = {rb_intern("stride"), rb_intern("pad"), rb_intern("gw")};
+     VALUE opts[] = {Qundef, Qundef, Qundef};
+
+     size_t ndim;
+     cumo_narray_t *nx, *ngy;
+
+     VALUE x_cont, gy_cont;
+     cudnnTensorDescriptor_t x_desc = 0;
+     cudnnTensorDescriptor_t gy_desc = 0;
+     cudnnConvolutionDescriptor_t conv_desc = 0;
+     cudnnFilterDescriptor_t gw_desc = 0;
+     char *x_cont_ptr, *gy_cont_ptr, *gw_ptr;
+
+     cudnnConvolutionBwdFilterAlgoPerf_t perf_result;
+     cudnnConvolutionBwdFilterAlgo_t algo;
+     size_t max_workspace_size = CUMO_CUDA_CUDNN_DEFAULT_MAX_WORKSPACE_SIZE;
+     size_t workspace_size;
+     char* workspace = 0;
+
+     size_t sizet_w_shape[CUMO_NA_MAX_DIMENSION];
+     int int_stride[CUMO_NA_MAX_DIMENSION];
+     int int_pad[CUMO_NA_MAX_DIMENSION];
+
+     rb_scan_args(argc, argv, "2:", &gy, &w_shape, &kw_hash);
+     rb_get_kwargs(kw_hash, kw_table, 0, 3, opts);
+     stride = cumo_cuda_cudnn_option_value(opts[0], Qnil);
+     pad = cumo_cuda_cudnn_option_value(opts[1], Qnil);
+     gw = cumo_cuda_cudnn_option_value(opts[2], Qnil);
+
+     CumoGetNArray(x, nx);
+     CumoGetNArray(gy, ngy);
+
+     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(nx->ndim, ngy->ndim);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(gy, cT);
+     if (nx->ndim - 2 < 2) {
+         rb_raise(cumo_na_eShapeError, "CUDNN convolution requires number of spatial "
+                 "dimensions to be greater than or equal to 2, but %d", nx->ndim - 2);
+     }
+     ndim = nx->ndim - 2; // Number of spatial dimensions
+
+     cumo_cuda_cudnn_get_sizet_ary(sizet_w_shape, w_shape, ndim + 2);
+     cumo_cuda_cudnn_get_int_ary(int_stride, stride, ndim, 1);
+     cumo_cuda_cudnn_get_int_ary(int_pad, pad, ndim, 0);
+
+     if (gw != Qnil) {
+         CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(gw, cT);
+         assert(cumo_na_check_contiguous(gw) == Qtrue);
+     }
+     else {
+         gw = cumo_na_new(cT, ndim + 2, sizet_w_shape);
+     }
+     // w_shape = (out_channels, in_channels, k_1, k_2, ..., k_N)
+     // x_shape = (batch_size, in_channels, d_1, d_2, ..., d_N)
+     // y_shape = (batch_size, out_channels, out_1, out_2, ..., out_N)
+     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(nx->shape[0], ngy->shape[0]);
+     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(sizet_w_shape[0], ngy->shape[1]);
+     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(sizet_w_shape[1], nx->shape[1]);
+
+     {
+         // shape check of gy
+         size_t *y_shape = ngy->shape;
+         size_t *x_shape = nx->shape;
+         for (size_t i = 0; i < ndim; ++i) {
+             // TODO: raise
+             assert(y_shape[i + 2] == cumo_cuda_cudnn_GetConvOutDim(
+                     x_shape[i + 2], sizet_w_shape[i + 2], int_stride[i], int_pad[i]));
+         }
+     }
+
+     x_cont = cumo_na_as_contiguous_array(x);
+     gy_cont = cumo_na_as_contiguous_array(gy);
+
+     x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
+     gy_cont_ptr = cumo_na_get_offset_pointer_for_read(gy_cont);
+     gw_ptr = cumo_na_get_offset_pointer_for_write(gw);
+
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&gy_desc, gy_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;
+     status = cumo_cuda_cudnn_CreateFilterDescriptor(&gw_desc, gw, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;
+     status = cumo_cuda_cudnn_CreateConvolutionDescriptor(&conv_desc, ndim, int_stride, int_pad, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;
+
+     handle = cumo_cuda_cudnn_handle();
+
+     // auto tune
+     status = cumo_cuda_cudnn_FindConvolutionBackwardFilterAlgorithm(
+             &perf_result,
+             handle,
+             x_desc,
+             x_cont,
+             gy_desc,
+             gy_cont,
+             conv_desc,
+             gw_desc,
+             gw,
+             max_workspace_size,
+             int_stride,
+             int_pad,
+             ndim,
+             cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;
+     algo = perf_result.algo;
+     workspace_size = perf_result.memory;
+
+     workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+     status = cudnnConvolutionBackwardFilter(
+             handle,
+             (void*)&one,
+             x_desc,
+             (void*)x_cont_ptr,
+             gy_desc,
+             (void*)gy_cont_ptr,
+             conv_desc,
+             algo,
+             (void*)workspace,
+             workspace_size,
+             (void*)&zero,
+             gw_desc,
+             (void*)gw_ptr);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;
+
+ CONV_GRAD_W_ERROR:
+     if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
+     if (gy_desc) cudnnDestroyTensorDescriptor(gy_desc);
+     if (gw_desc) cudnnDestroyFilterDescriptor(gw_desc);
+     if (conv_desc) cudnnDestroyConvolutionDescriptor(conv_desc);
+     if (workspace) cumo_cuda_runtime_free(workspace);
+     cumo_cuda_cudnn_check_status(status);
+
+     return gw;
+ }
+
+ #else // CUDNN_FOUND
+ VALUE cumo_cuda_eCUDNNError;
+
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     rb_raise(cumo_cuda_eCUDNNError, "cuDNN is not available");
+ }
+ #endif // CUDNN_FOUND
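
A matching usage sketch for the filter-gradient method above, again an assumption built only from the signature comment (gw = x.conv_grad_w(gy, w_shape, stride:, pad:, gw:)); the shapes mirror the forward sketch after the previous file.

  # Hypothetical sketch: gradient of the convolution w.r.t. the filter.
  require 'cumo'

  x  = Cumo::SFloat.new(8, 3, 32, 32).rand    # forward input
  gy = Cumo::SFloat.new(8, 16, 32, 32).rand   # gradient w.r.t. the forward output

  # w_shape = (out_channels, in_channels, k_1, k_2); gy's spatial sizes must match
  # the output-size formula used by the forward pass for these stride/pad values.
  gw = x.conv_grad_w(gy, [16, 3, 3, 3], stride: [1, 1], pad: [1, 1])
  gw.shape  # => [16, 3, 3, 3]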