cumo 0.2.5 → 0.3.0.pre1

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,191 @@
+ #ifdef CUDNN_FOUND
+
+ <%
+   cudnn_dtype =
+     case type_name
+     when 'sfloat'
+       'CUDNN_DATA_FLOAT'
+     when 'dfloat'
+       'CUDNN_DATA_DOUBLE'
+     else
+       # CUDNN_DATA_HALF
+       raise 'not supported'
+     end
+ %>
+
+ // gx, ggamma, gbeta = x.batch_normalization_backward(gamma, gy, mean:, inv_std:, eps:, axis:)
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
+     cudnnStatus_t status = 0;
+     cudnnHandle_t handle = 0;
+     dtype coef_alpha = 1;
+     dtype coef_beta = 0;
+
+     VALUE x=self, gamma, gy, mean, inv_std, eps, axis, gx, ggamma, gbeta;
+     VALUE kw_hash = Qnil;
+     ID kw_table[] = {
+         rb_intern("mean"),
+         rb_intern("inv_std"),
+         rb_intern("eps"),
+         rb_intern("axis"),
+         rb_intern("gx"),
+         rb_intern("ggamma"),
+         rb_intern("gbeta")
+     };
+     VALUE opts[] = {Qundef, Qundef, Qundef, Qundef, Qundef, Qundef, Qundef};
+
+     cumo_narray_t *nx, *ngamma, *ngy;
+     size_t *x_shape, *gamma_shape, *gy_shape, reduced_shape[CUMO_NA_MAX_DIMENSION];
+     size_t x_ndim, gamma_ndim, gy_ndim, reduced_ndim;
+
+     VALUE x_cont, gamma_cont, gy_cont;
+     cudnnTensorDescriptor_t x_desc = 0;
+     cudnnTensorDescriptor_t bn_desc = 0;
+     char *x_cont_ptr, *gamma_cont_ptr, *gy_cont_ptr, *gx_ptr, *ggamma_ptr, *gbeta_ptr;
+
+     cudnnBatchNormMode_t mode;
+
+     // default values
+     char *mean_ptr = NULL;
+     char *inv_std_ptr = NULL;
+     double double_eps = 2e-5;
+     int int_axis[CUMO_NA_MAX_DIMENSION] = {0};
+     size_t axis_ndim = 1;
+
+     rb_scan_args(argc, argv, "2:", &gamma, &gy, &kw_hash);
+     rb_get_kwargs(kw_hash, kw_table, 0, 7, opts);
+     mean = cumo_cuda_cudnn_option_value(opts[0], Qnil);
+     inv_std = cumo_cuda_cudnn_option_value(opts[1], Qnil);
+     eps = cumo_cuda_cudnn_option_value(opts[2], Qnil);
+     axis = cumo_cuda_cudnn_option_value(opts[3], Qnil);
+     gx = cumo_cuda_cudnn_option_value(opts[4], Qnil);
+     ggamma = cumo_cuda_cudnn_option_value(opts[5], Qnil);
+     gbeta = cumo_cuda_cudnn_option_value(opts[6], Qnil);
+
+     if (mean != Qnil) {
+         mean_ptr = cumo_na_get_offset_pointer_for_read(mean);
+     }
+     if (inv_std != Qnil) {
+         inv_std_ptr = cumo_na_get_offset_pointer_for_read(inv_std);
+     }
+     if (eps != Qnil) {
+         double_eps = NUM2DBL(eps);
+     }
+     if (axis != Qnil) {
+         Check_Type(axis, T_ARRAY);
+         axis_ndim = (size_t)(RARRAY_LEN(axis));
+         for (size_t idim = 0; idim < axis_ndim; ++idim) {
+             int_axis[idim] = NUM2INT(rb_ary_entry(axis, (long)idim));
+         }
+         // TODO: check axis is sorted
+     }
+
+     CumoGetNArray(x, nx);
+     CumoGetNArray(gamma, ngamma);
+     CumoGetNArray(gy, ngy);
+     x_ndim = nx->ndim;
+     x_shape = nx->shape;
+     gamma_ndim = ngamma->ndim;
+     gamma_shape = ngamma->shape;
+     gy_ndim = ngy->ndim;
+     gy_shape = ngy->shape;
+
+     // TODO: Check that the sizes of gamma, beta, running_mean, running_var, mean, and inv_std
+     // match either reduced_shape(keepdims: false) or reduced_shape(keepdims: true)
+     reduced_ndim = cumo_cuda_cudnn_ReduceShape(reduced_shape, x_ndim, x_shape, axis_ndim, int_axis, 1);
+     // CUMO_CUDA_CUDNN_CHECK_DIM_EQ(reduced_ndim, gamma_ndim);
+     // for (size_t idim = 0; idim < reduced_ndim; ++idim) {
+     //     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(reduced_shape[idim], gamma_shape[idim]);
+     // }
+     // CUMO_CUDA_CUDNN_CHECK_DIM_EQ(x_ndim, gy_ndim);
+     // for (size_t idim = 0; idim < x_ndim; ++idim) {
+     //     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(x_shape[idim], gy_shape[idim]);
+     // }
+
+     // TODO: Add ndim and shape (same with reduced) for mean and inv_std if given
+
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(gamma, cT);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(gy, cT);
+     if (mean != Qnil) CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(mean, cT);
+     if (inv_std != Qnil) CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(inv_std, cT);
+
+     x_cont = cumo_na_as_contiguous_array(x);
+     gamma_cont = cumo_na_as_contiguous_array(gamma);
+     gy_cont = cumo_na_as_contiguous_array(gy);
+     if (mean != Qnil && cumo_na_check_contiguous(mean) != Qtrue) {
+         rb_raise(rb_eRuntimeError, "mean must be contiguous");
+     }
+     if (inv_std != Qnil && cumo_na_check_contiguous(inv_std) != Qtrue) {
+         rb_raise(rb_eRuntimeError, "inv_std must be contiguous");
+     }
+
+     x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
+     gamma_cont_ptr = cumo_na_get_offset_pointer_for_read(gamma_cont);
+     gy_cont_ptr = cumo_na_get_offset_pointer_for_read(gy_cont);
+
+     // TODO: type and shape check
+     if (gx == Qnil) gx = cumo_na_new(cT, x_ndim, x_shape);
+     gx_ptr = cumo_na_get_offset_pointer_for_write(gx);
+     if (ggamma == Qnil) ggamma = cumo_na_new(cT, gamma_ndim, gamma_shape);
+     ggamma_ptr = cumo_na_get_offset_pointer_for_write(ggamma);
+     if (gbeta == Qnil) gbeta = cumo_na_new(cT, gamma_ndim, gamma_shape);
+     gbeta_ptr = cumo_na_get_offset_pointer_for_write(gbeta);
+
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto BATCH_NORM_ERROR;
+
+     mode = cumo_cuda_cudnn_GetBatchNormMode(axis_ndim, int_axis);
+     status = cumo_cuda_cudnn_CreateBNTensorDescriptor(&bn_desc, x_desc, mode);
+     if (status != CUDNN_STATUS_SUCCESS) goto BATCH_NORM_ERROR;
+     // TODO: bn_desc may return another type, and may need to cast gamma, gy, mean, var
+
+     handle = cumo_cuda_cudnn_handle();
+
+     status = cudnnBatchNormalizationBackward(
+             handle,
+             mode,
+             (void*)&coef_alpha,
+             (void*)&coef_beta,
+             (void*)&coef_alpha,
+             (void*)&coef_beta,
+             x_desc,
+             x_cont_ptr,
+             x_desc,
+             gy_cont_ptr,
+             x_desc,
+             gx_ptr,
+             bn_desc,
+             gamma_cont_ptr,
+             ggamma_ptr,
+             gbeta_ptr,
+             double_eps,
+             mean_ptr,
+             inv_std_ptr);
+     if (status != CUDNN_STATUS_SUCCESS) goto BATCH_NORM_ERROR;
+
+ BATCH_NORM_ERROR:
+     if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
+     if (bn_desc) cudnnDestroyTensorDescriptor(bn_desc);
+     cumo_cuda_cudnn_check_status(status);
+
+     {
+         VALUE ret = rb_ary_new2(3);
+         rb_ary_push(ret, gx);
+         rb_ary_push(ret, ggamma);
+         rb_ary_push(ret, gbeta);
+         return ret;
+     }
+ }
+
+ #else // CUDNN_FOUND
+ VALUE cumo_cuda_eCUDNNError;
+
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     rb_raise(cumo_cuda_eCUDNNError, "cuDNN is not available");
+ }
+ #endif // CUDNN_FOUND
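
For orientation, here is a minimal Ruby sketch of how the batch-normalization backward binding above would be called. The method name and keyword arguments are taken from the signature comment in the template; the shapes, the random fill, and the eps/axis values are illustrative assumptions only, and a CUDA device with a cuDNN-enabled build of cumo is assumed.

    require 'cumo'

    # (batch, channels, height, width) input, its upstream gradient, and the per-channel scale.
    x     = Cumo::SFloat.new(32, 3, 28, 28).rand
    gy    = Cumo::SFloat.new(32, 3, 28, 28).rand
    gamma = Cumo::SFloat.new(3).rand

    # axis: [0, 2, 3] reduces over everything except the channel dimension;
    # eps defaults to 2e-5 in the C code above when omitted.
    gx, ggamma, gbeta = x.batch_normalization_backward(gamma, gy, eps: 2e-5, axis: [0, 2, 3])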
@@ -0,0 +1,216 @@
+ #ifdef CUDNN_FOUND
+
+ <%
+   cudnn_dtype =
+     case type_name
+     when 'sfloat'
+       'CUDNN_DATA_FLOAT'
+     when 'dfloat'
+       'CUDNN_DATA_DOUBLE'
+     else
+       # CUDNN_DATA_HALF
+       raise 'not supported'
+     end
+ %>
+
+ // cover_all=true is not supported with CUDNN
+ // dilation > 1 is not supported yet
+ // x.conv(w, b: nil, stride: 1, pad: 0, y: nil)
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
+     cudnnStatus_t status = 0;
+     cudnnHandle_t handle = 0;
+     dtype alpha = 1;
+     dtype beta = 0;
+
+     VALUE x=self, w, b, stride, pad, y;
+     VALUE kw_hash = Qnil;
+     ID kw_table[4] = {rb_intern("stride"), rb_intern("pad"), rb_intern("b"), rb_intern("y")};
+     VALUE opts[4] = {Qundef, Qundef, Qundef, Qundef};
+
+     size_t ndim;
+     cumo_narray_t *nx, *nw;
+     size_t *x_shape, *w_shape;
+     size_t out_channels, batch_size;
+
+     VALUE x_cont, w_cont;
+     cudnnTensorDescriptor_t x_desc = 0;
+     cudnnTensorDescriptor_t y_desc = 0;
+     cudnnTensorDescriptor_t b_desc = 0;
+     cudnnFilterDescriptor_t w_desc = 0;
+     cudnnConvolutionDescriptor_t conv_desc = 0;
+     char *x_cont_ptr, *w_cont_ptr, *y_ptr;
+
+     cudnnConvolutionFwdAlgoPerf_t perf_result;
+     cudnnConvolutionFwdAlgo_t algo;
+     size_t max_workspace_size = CUMO_CUDA_CUDNN_DEFAULT_MAX_WORKSPACE_SIZE;
+     size_t workspace_size;
+     char* workspace = 0;
+
+     int int_stride[CUMO_NA_MAX_DIMENSION];
+     int int_pad[CUMO_NA_MAX_DIMENSION];
+
+     rb_scan_args(argc, argv, "1:", &w, &kw_hash);
+     rb_get_kwargs(kw_hash, kw_table, 0, 4, opts);
+     stride = cumo_cuda_cudnn_option_value(opts[0], Qnil);
+     pad = cumo_cuda_cudnn_option_value(opts[1], Qnil);
+     b = cumo_cuda_cudnn_option_value(opts[2], Qnil);
+     y = cumo_cuda_cudnn_option_value(opts[3], Qnil);
+
+     CumoGetNArray(x, nx);
+     CumoGetNArray(w, nw);
+
+     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(nx->ndim, nw->ndim);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(w, cT);
+     if (nx->ndim - 2 < 2) {
+         rb_raise(cumo_na_eShapeError, "CUDNN convolution requires number of spatial "
+                  "dimensions to be greater than or equal to 2, but %d", nx->ndim - 2);
+     }
+     ndim = nx->ndim - 2; // Number of spatial dimensions
+
+     cumo_cuda_cudnn_get_int_ary(int_stride, stride, ndim, 1);
+     cumo_cuda_cudnn_get_int_ary(int_pad, pad, ndim, 0);
+
+     x_shape = nx->shape;
+     w_shape = nw->shape;
+     batch_size = x_shape[0]; // x_shape = (batch_size, in_channels, d_1, d_2, ..., d_N)
+     out_channels = w_shape[0]; // w.shape = (out_channels, in_channels, k_1, k_2, ..., k_N)
+     if (x_shape[1] != w_shape[1]) {
+         rb_raise(cumo_na_eShapeError, "x_shape[1]:%d does not match with w_shape[1]:%d",
+                  (int)x_shape[1], (int)w_shape[1]);
+     }
+
+     if (y != Qnil) {
+         CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(y, cT);
+     }
+     else {
+         size_t *y_shape = ALLOCA_N(size_t, ndim + 2);
+         // out_shape = (batch_size, out_channels, out_1, out_2, ..., out_N)
+         y_shape[0] = batch_size;
+         y_shape[1] = out_channels;
+         for (size_t i = 0; i < ndim; ++i) {
+             y_shape[i + 2] = cumo_cuda_cudnn_GetConvOutDim(
+                     x_shape[i + 2], w_shape[i + 2], int_stride[i], int_pad[i]);
+         }
+         y = cumo_na_new(cT, ndim + 2, y_shape);
+     }
+
+     x_cont = cumo_na_as_contiguous_array(x);
+     w_cont = cumo_na_as_contiguous_array(w);
+
+     x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
+     w_cont_ptr = cumo_na_get_offset_pointer_for_read(w_cont);
+     y_ptr = cumo_na_get_offset_pointer_for_write(y);
+
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&y_desc, y, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     status = cumo_cuda_cudnn_CreateFilterDescriptor(&w_desc, w_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     status = cumo_cuda_cudnn_CreateConvolutionDescriptor(&conv_desc, ndim, int_stride, int_pad, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+
+     handle = cumo_cuda_cudnn_handle();
+
+     // auto tune
+     status = cumo_cuda_cudnn_FindConvolutionForwardAlgorithm(
+             &perf_result,
+             handle,
+             x_desc,
+             x_cont,
+             w_desc,
+             w_cont,
+             conv_desc,
+             y_desc,
+             y,
+             max_workspace_size,
+             int_stride,
+             int_pad,
+             ndim,
+             cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     algo = perf_result.algo;
+     workspace_size = perf_result.memory;
+
+     workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+     status = cudnnConvolutionForward(
+             handle,
+             (void*)&alpha,
+             x_desc,
+             (void*)x_cont_ptr,
+             w_desc,
+             (void*)w_cont_ptr,
+             conv_desc,
+             algo,
+             (void*)workspace,
+             workspace_size,
+             (void*)&beta,
+             y_desc,
+             (void*)y_ptr);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+
+     if (b != Qnil) {
+         size_t new_shape[CUMO_NA_MAX_DIMENSION];
+         VALUE b_cont;
+         char* b_cont_ptr;
+         cumo_narray_t *nb, *nb_cont;
+         size_t *b_shape;
+         int b_ndim;
+
+         CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(b, cT);
+         CumoGetNArray(b, nb);
+         new_shape[0] = 1;
+         new_shape[1] = nb->size;
+         for (size_t i = 0; i < ndim; ++i) {
+             new_shape[i + 2] = 1;
+         }
+         b_cont = cumo_na_as_contiguous_array(b);
+         b_cont_ptr = cumo_na_get_offset_pointer_for_read(b_cont);
+         CumoGetNArray(b_cont, nb_cont);
+         b_shape = nb_cont->shape;
+         b_ndim = nb_cont->ndim;
+         // reshape b
+         nb_cont->ndim = ndim + 2;
+         nb_cont->shape = new_shape;
+         status = cumo_cuda_cudnn_CreateTensorDescriptor(&b_desc, b_cont, cudnn_dtype);
+         // restore b.shape
+         nb_cont->ndim = b_ndim;
+         nb_cont->shape = b_shape;
+         if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+
+         status = cudnnAddTensor(
+                 handle,
+                 (void*)&alpha,
+                 b_desc,
+                 (void*)b_cont_ptr,
+                 (void*)&alpha,
+                 y_desc,
+                 (void*)y_ptr);
+         if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     }
+
+ CONV_ERROR:
+     if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
+     if (y_desc) cudnnDestroyTensorDescriptor(y_desc);
+     if (b_desc) cudnnDestroyTensorDescriptor(b_desc);
+     if (w_desc) cudnnDestroyFilterDescriptor(w_desc);
+     if (conv_desc) cudnnDestroyConvolutionDescriptor(conv_desc);
+     if (workspace) cumo_cuda_runtime_free(workspace);
+     cumo_cuda_cudnn_check_status(status);
+
+     return y;
+ }
+
+ #else // CUDNN_FOUND
+ VALUE cumo_cuda_eCUDNNError;
+
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     rb_raise(cumo_cuda_eCUDNNError, "cuDNN is not available");
+ }
+ #endif // CUDNN_FOUND
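
A similarly minimal Ruby sketch for the convolution binding above, assuming the x.conv(w, b: nil, stride: 1, pad: 0, y: nil) signature documented in the comment. The concrete shapes are assumptions; with the default stride 1 and pad 0, each spatial output size follows the usual (in + 2*pad - kernel) / stride + 1 rule, which is what cumo_cuda_cudnn_GetConvOutDim is expected to compute here.

    require 'cumo'

    x = Cumo::SFloat.new(8, 3, 32, 32).rand   # (batch_size, in_channels, d_1, d_2)
    w = Cumo::SFloat.new(16, 3, 3, 3).rand    # (out_channels, in_channels, k_1, k_2)
    b = Cumo::SFloat.new(16).rand             # one bias per output channel

    y = x.conv(w, b: b)                       # stride: 1, pad: 0 by default
    # y.shape => [8, 16, 30, 30], since (32 + 2*0 - 3) / 1 + 1 == 30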
@@ -0,0 +1,183 @@
+ #ifdef CUDNN_FOUND
+
+ <%
+   cudnn_dtype =
+     case type_name
+     when 'sfloat'
+       'CUDNN_DATA_FLOAT'
+     when 'dfloat'
+       'CUDNN_DATA_DOUBLE'
+     else
+       # CUDNN_DATA_HALF
+       raise 'not supported'
+     end
+ %>
+
+ static void
+ cumo_cuda_cudnn_get_sizet_ary(size_t *sizet_ary, VALUE ary, size_t ndim)
+ {
+     Check_Type(ary, T_ARRAY);
+     CUMO_CUDA_CUDNN_CHECK_DIM_EQ((size_t)(RARRAY_LEN(ary)), ndim);
+     for (size_t idim = 0; idim < ndim; ++idim) {
+         sizet_ary[idim] = NUM2SIZET(rb_ary_entry(ary, (long)idim));
+     }
+ }
+
+ // cover_all=true is not supported with CUDNN
+ // gw = x.conv_grad_w(gy, w_shape, stride: 1, pad: 0, gw: nil)
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
+     cudnnStatus_t status = 0;
+     cudnnHandle_t handle = 0;
+     dtype one = 1;
+     dtype zero = 0;
+
+     VALUE x=self, gy, w_shape, stride, pad, gw;
+     VALUE kw_hash = Qnil;
+     ID kw_table[] = {rb_intern("stride"), rb_intern("pad"), rb_intern("gw")};
+     VALUE opts[] = {Qundef, Qundef, Qundef};
+
+     size_t ndim;
+     cumo_narray_t *nx, *ngy;
+
+     VALUE x_cont, gy_cont;
+     cudnnTensorDescriptor_t x_desc = 0;
+     cudnnTensorDescriptor_t gy_desc = 0;
+     cudnnConvolutionDescriptor_t conv_desc = 0;
+     cudnnFilterDescriptor_t gw_desc = 0;
+     char *x_cont_ptr, *gy_cont_ptr, *gw_ptr;
+
+     cudnnConvolutionBwdFilterAlgoPerf_t perf_result;
+     cudnnConvolutionBwdFilterAlgo_t algo;
+     size_t max_workspace_size = CUMO_CUDA_CUDNN_DEFAULT_MAX_WORKSPACE_SIZE;
+     size_t workspace_size;
+     char* workspace = 0;
+
+     size_t sizet_w_shape[CUMO_NA_MAX_DIMENSION];
+     int int_stride[CUMO_NA_MAX_DIMENSION];
+     int int_pad[CUMO_NA_MAX_DIMENSION];
+
+     rb_scan_args(argc, argv, "2:", &gy, &w_shape, &kw_hash);
+     rb_get_kwargs(kw_hash, kw_table, 0, 3, opts);
+     stride = cumo_cuda_cudnn_option_value(opts[0], Qnil);
+     pad = cumo_cuda_cudnn_option_value(opts[1], Qnil);
+     gw = cumo_cuda_cudnn_option_value(opts[2], Qnil);
+
+     CumoGetNArray(x, nx);
+     CumoGetNArray(gy, ngy);
+
+     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(nx->ndim, ngy->ndim);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(gy, cT);
+     if (nx->ndim - 2 < 2) {
+         rb_raise(cumo_na_eShapeError, "CUDNN convolution requires number of spatial "
+                  "dimensions to be greater than or equal to 2, but %d", nx->ndim - 2);
+     }
+     ndim = nx->ndim - 2; // Number of spatial dimensions
+
+     cumo_cuda_cudnn_get_sizet_ary(sizet_w_shape, w_shape, ndim + 2);
+     cumo_cuda_cudnn_get_int_ary(int_stride, stride, ndim, 1);
+     cumo_cuda_cudnn_get_int_ary(int_pad, pad, ndim, 0);
+
+     if (gw != Qnil) {
+         CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(gw, cT);
+         assert(cumo_na_check_contiguous(gw) == Qtrue);
+     }
+     else {
+         gw = cumo_na_new(cT, ndim + 2, sizet_w_shape);
+     }
+     // w_shape = (out_channels, in_channels, k_1, k_2, ..., k_N)
+     // x_shape = (batch_size, in_channels, d_1, d_2, ..., d_N)
+     // y_shape = (batch_size, out_channels, out_1, out_2, ..., out_N)
+     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(nx->shape[0], ngy->shape[0]);
+     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(sizet_w_shape[0], ngy->shape[1]);
+     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(sizet_w_shape[1], nx->shape[1]);
+
+     {
+         // shape check of gy
+         size_t *y_shape = ngy->shape;
+         size_t *x_shape = nx->shape;
+         for (size_t i = 0; i < ndim; ++i) {
+             // TODO: raise
+             assert(y_shape[i + 2] == cumo_cuda_cudnn_GetConvOutDim(
+                         x_shape[i + 2], sizet_w_shape[i + 2], int_stride[i], int_pad[i]));
+         }
+     }
+
+     x_cont = cumo_na_as_contiguous_array(x);
+     gy_cont = cumo_na_as_contiguous_array(gy);
+
+     x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
+     gy_cont_ptr = cumo_na_get_offset_pointer_for_read(gy_cont);
+     gw_ptr = cumo_na_get_offset_pointer_for_write(gw);
+
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&gy_desc, gy_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;
+     status = cumo_cuda_cudnn_CreateFilterDescriptor(&gw_desc, gw, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;
+     status = cumo_cuda_cudnn_CreateConvolutionDescriptor(&conv_desc, ndim, int_stride, int_pad, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;
+
+     handle = cumo_cuda_cudnn_handle();
+
+     // auto tune
+     status = cumo_cuda_cudnn_FindConvolutionBackwardFilterAlgorithm(
+             &perf_result,
+             handle,
+             x_desc,
+             x_cont,
+             gy_desc,
+             gy_cont,
+             conv_desc,
+             gw_desc,
+             gw,
+             max_workspace_size,
+             int_stride,
+             int_pad,
+             ndim,
+             cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;
+     algo = perf_result.algo;
+     workspace_size = perf_result.memory;
+
+     workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+     status = cudnnConvolutionBackwardFilter(
+             handle,
+             (void*)&one,
+             x_desc,
+             (void*)x_cont_ptr,
+             gy_desc,
+             (void*)gy_cont_ptr,
+             conv_desc,
+             algo,
+             (void*)workspace,
+             workspace_size,
+             (void*)&zero,
+             gw_desc,
+             (void*)gw_ptr);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_GRAD_W_ERROR;
+
+ CONV_GRAD_W_ERROR:
+     if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
+     if (gy_desc) cudnnDestroyTensorDescriptor(gy_desc);
+     if (gw_desc) cudnnDestroyFilterDescriptor(gw_desc);
+     if (conv_desc) cudnnDestroyConvolutionDescriptor(conv_desc);
+     if (workspace) cumo_cuda_runtime_free(workspace);
+     cumo_cuda_cudnn_check_status(status);
+
+     return gw;
+ }
+
+ #else // CUDNN_FOUND
+ VALUE cumo_cuda_eCUDNNError;
+
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     rb_raise(cumo_cuda_eCUDNNError, "cuDNN is not available");
+ }
+ #endif // CUDNN_FOUND
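
Finally, a sketch for the filter-gradient binding, assuming the x.conv_grad_w(gy, w_shape, stride: 1, pad: 0, gw: nil) signature from the comment. w_shape is passed as a plain Ruby array of length ndim + 2, as required by cumo_cuda_cudnn_get_sizet_ary above; the shapes mirror the forward example and are assumptions only.

    require 'cumo'

    x  = Cumo::SFloat.new(8, 3, 32, 32).rand    # forward input
    gy = Cumo::SFloat.new(8, 16, 30, 30).rand   # gradient w.r.t. the conv output (stride 1, pad 0)

    # w_shape = [out_channels, in_channels, k_1, k_2]
    gw = x.conv_grad_w(gy, [16, 3, 3, 3])
    # gw.shape => [16, 3, 3, 3]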