cumo 0.2.5 → 0.3.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,244 @@
+ #ifdef CUDNN_FOUND
+
+ <%
+   cudnn_dtype =
+     case type_name
+     when 'sfloat'
+       'CUDNN_DATA_FLOAT'
+     when 'dfloat'
+       'CUDNN_DATA_DOUBLE'
+     else
+       # CUDNN_DATA_HALF
+       raise 'not supported'
+     end
+ %>
+
+ // VALUE is Ruby Array
+ static void
+ get_int_out_size(int* int_out_size, VALUE out_size, size_t ndim, size_t* x_shape, size_t* w_shape, int* int_stride, int* int_pad)
+ {
+     if (out_size == Qnil) {
+         for (size_t i = 0; i < ndim; ++i) {
+             int_out_size[i] = cumo_cuda_cudnn_GetConvTransposeOutDim(
+                     x_shape[i + 2], w_shape[i + 2], int_stride[i], int_pad[i]);
+         }
+     } else {
+         Check_Type(out_size, T_ARRAY);
+         CUMO_CUDA_CUDNN_CHECK_DIM_EQ((size_t)(RARRAY_LEN(out_size)), ndim);
+         for (size_t i = 0; i < ndim; ++i) {
+             int_out_size[i] = NUM2INT(rb_ary_entry(out_size, (long)i));
+         }
+     }
+     // only cover_all=false is supported
+     for (size_t i = 0; i < ndim; ++i) {
+         if (x_shape[i + 2] != cumo_cuda_cudnn_GetConvOutDim(
+                 int_out_size[i], w_shape[i + 2], int_stride[i], int_pad[i])) {
+             rb_raise(rb_eRuntimeError, "CUDA transposed convolution does not support specified output sizes");
+         }
+     }
+ }
+
+ // cover_all=true is not supported with CUDNN
+ // dilation > 1 is not supported yet
+ // x.conv_transpose(w, b: nil, stride: 1, pad: 0, out_size: nil, y: nil)
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
+     cudnnStatus_t status = 0;
+     cudnnHandle_t handle = 0;
+     dtype alpha = 1;
+     dtype beta = 0;
+
+     VALUE x=self, w, b, stride, pad, out_size, y;
+     VALUE kw_hash = Qnil;
+     ID kw_table[5] = {rb_intern("b"), rb_intern("stride"), rb_intern("pad"), rb_intern("out_size"), rb_intern("y")};
+     VALUE opts[5] = {Qundef, Qundef, Qundef, Qundef, Qundef};
+
+     size_t ndim;
+     cumo_narray_t *nx, *nw;
+     size_t *x_shape, *w_shape;
+     size_t out_channels, batch_size;
+
+     VALUE x_cont, w_cont;
+     cudnnTensorDescriptor_t x_desc = 0;
+     cudnnTensorDescriptor_t y_desc = 0;
+     cudnnTensorDescriptor_t b_desc = 0;
+     cudnnFilterDescriptor_t w_desc = 0;
+     cudnnConvolutionDescriptor_t conv_desc = 0;
+     char *x_cont_ptr, *w_cont_ptr, *y_ptr;
+
+     cudnnConvolutionBwdDataAlgoPerf_t perf_result;
+     cudnnConvolutionBwdDataAlgo_t algo;
+     size_t max_workspace_size = CUMO_CUDA_CUDNN_DEFAULT_MAX_WORKSPACE_SIZE;
+     size_t workspace_size;
+     char* workspace = 0;
+
+     int int_stride[CUMO_NA_MAX_DIMENSION];
+     int int_pad[CUMO_NA_MAX_DIMENSION];
+     int int_out_size[CUMO_NA_MAX_DIMENSION];
+
+     rb_scan_args(argc, argv, "1:", &w, &kw_hash);
+     rb_get_kwargs(kw_hash, kw_table, 0, 5, opts);
+     b = cumo_cuda_cudnn_option_value(opts[0], Qnil);
+     stride = cumo_cuda_cudnn_option_value(opts[1], Qnil);
+     pad = cumo_cuda_cudnn_option_value(opts[2], Qnil);
+     out_size = cumo_cuda_cudnn_option_value(opts[3], Qnil);
+     y = cumo_cuda_cudnn_option_value(opts[4], Qnil);
+
+     CumoGetNArray(x, nx);
+     CumoGetNArray(w, nw);
+
+     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(nx->ndim, nw->ndim);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(w, cT);
+     if (nx->ndim - 2 < 2) {
+         rb_raise(cumo_na_eShapeError, "CUDNN convolution requires number of spatial "
+                 "dimensions to be greater than or equal to 2, but %d", nx->ndim - 2);
+     }
+     ndim = nx->ndim - 2; // Number of spatial dimensions
+
+     x_shape = nx->shape;
+     w_shape = nw->shape;
+     batch_size = x_shape[0];   // x_shape = (batch_size, in_channels, d_1, d_2, ..., d_N)
+     out_channels = w_shape[1]; // w.shape = (in_channels, out_channels, k_1, k_2, ..., k_N)
+     if (x_shape[1] != w_shape[0]) {
+         rb_raise(cumo_na_eShapeError, "x_shape[1]:%d does not match with w_shape[0]:%d",
+                 (int)x_shape[1], (int)w_shape[0]);
+     }
+
+     cumo_cuda_cudnn_get_int_ary(int_stride, stride, ndim, 1);
+     cumo_cuda_cudnn_get_int_ary(int_pad, pad, ndim, 0);
+     get_int_out_size(int_out_size, out_size, ndim, x_shape, w_shape, int_stride, int_pad);
+
+     // out_shape = (batch_size, out_channels, out_1, out_2, ..., out_N)
+     if (y != Qnil) {
+         CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(y, cT);
+         // TODO: shape check
+     }
+     else {
+         size_t *y_shape = ALLOCA_N(size_t, ndim + 2);
+         y_shape[0] = batch_size;
+         y_shape[1] = out_channels;
+         for (size_t i = 0; i < ndim; ++i) {
+             y_shape[i + 2] = int_out_size[i];
+         }
+         y = cumo_na_new(cT, ndim + 2, y_shape);
+     }
+
+     x_cont = cumo_na_as_contiguous_array(x);
+     w_cont = cumo_na_as_contiguous_array(w);
+
+     x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
+     w_cont_ptr = cumo_na_get_offset_pointer_for_read(w_cont);
+     y_ptr = cumo_na_get_offset_pointer_for_write(y);
+
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&y_desc, y, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     status = cumo_cuda_cudnn_CreateFilterDescriptor(&w_desc, w_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     status = cumo_cuda_cudnn_CreateConvolutionDescriptor(&conv_desc, ndim, int_stride, int_pad, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+
+     handle = cumo_cuda_cudnn_handle();
+
+     // auto tune
+     status = cumo_cuda_cudnn_FindConvolutionBackwardDataAlgorithm(
+             &perf_result,
+             handle,
+             w_desc,
+             w_cont,
+             x_desc,
+             x_cont,
+             conv_desc,
+             y_desc,
+             y,
+             max_workspace_size,
+             int_stride,
+             int_pad,
+             ndim,
+             cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     algo = perf_result.algo;
+     workspace_size = perf_result.memory;
+
+     workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+     status = cudnnConvolutionBackwardData(
+             handle,
+             (void*)&alpha,
+             w_desc,
+             (void*)w_cont_ptr,
+             x_desc,
+             (void*)x_cont_ptr,
+             conv_desc,
+             algo,
+             (void*)workspace,
+             workspace_size,
+             (void*)&beta,
+             y_desc,
+             (void*)y_ptr);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+
+     if (b != Qnil) {
+         size_t new_shape[CUMO_NA_MAX_DIMENSION];
+         VALUE b_cont;
+         char* b_cont_ptr;
+         cumo_narray_t *nb, *nb_cont;
+         size_t *b_shape;
+         int b_ndim;
+
+         CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(b, cT);
+         CumoGetNArray(b, nb);
+         new_shape[0] = 1;
+         new_shape[1] = nb->size;
+         for (size_t i = 0; i < ndim; ++i) {
+             new_shape[i + 2] = 1;
+         }
+         b_cont = cumo_na_as_contiguous_array(b);
+         b_cont_ptr = cumo_na_get_offset_pointer_for_read(b_cont);
+         CumoGetNArray(b_cont, nb_cont);
+         b_shape = nb_cont->shape;
+         b_ndim = nb_cont->ndim;
+         // reshape b
+         nb_cont->ndim = ndim + 2;
+         nb_cont->shape = new_shape;
+         status = cumo_cuda_cudnn_CreateTensorDescriptor(&b_desc, b_cont, cudnn_dtype);
+         // restore b.shape
+         nb_cont->ndim = b_ndim;
+         nb_cont->shape = b_shape;
+         if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+
+         status = cudnnAddTensor(
+                 handle,
+                 (void*)&alpha,
+                 b_desc,
+                 (void*)b_cont_ptr,
+                 (void*)&alpha,
+                 y_desc,
+                 (void*)y_ptr);
+         if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     }
+
+ CONV_ERROR:
+     if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
+     if (y_desc) cudnnDestroyTensorDescriptor(y_desc);
+     if (b_desc) cudnnDestroyTensorDescriptor(b_desc);
+     if (w_desc) cudnnDestroyFilterDescriptor(w_desc);
+     if (conv_desc) cudnnDestroyConvolutionDescriptor(conv_desc);
+     if (workspace) cumo_cuda_runtime_free(workspace);
+     cumo_cuda_cudnn_check_status(status);
+
+     return y;
+ }
+
+ #else // CUDNN_FOUND
+ VALUE cumo_cuda_eCUDNNError;
+
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     rb_raise(cumo_cuda_eCUDNNError, "cuDNN is not available");
+ }
+ #endif // CUDNN_FOUND
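Note on the default output size used above: cumo_cuda_cudnn_GetConvTransposeOutDim and cumo_cuda_cudnn_GetConvOutDim are defined elsewhere in the gem and are not part of this diff, but for cover_all=false they presumably follow the standard convolution arithmetic. A minimal standalone sketch of that arithmetic, using hypothetical helper names, shows why get_int_out_size accepts an out_size only if a forward convolution maps it back onto the input size:

    #include <stdio.h>

    /* Forward convolution output length for cover_all=false (assumed formula):
       out = (in + 2*pad - kernel) / stride + 1 */
    static int conv_out_dim(int in, int k, int s, int p) {
        return (in + 2 * p - k) / s + 1;
    }

    /* Transposed convolution output length, used when out_size: is nil
       (assumed formula): out = stride * (in - 1) + kernel - 2*pad */
    static int conv_transpose_out_dim(int in, int k, int s, int p) {
        return s * (in - 1) + k - 2 * p;
    }

    int main(void) {
        int in = 7, k = 4, s = 2, p = 1;
        int out = conv_transpose_out_dim(in, k, s, p);   /* 14 */
        /* The round trip reproduces the input size, so this out_size would
           pass the cover_all=false check in get_int_out_size. */
        printf("out=%d round_trip=%d\n", out, conv_out_dim(out, k, s, p));  /* 14 7 */
        return 0;
    }

An out_size for which the round trip does not return the original spatial size is rejected with the rb_eRuntimeError shown above.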
@@ -345,3 +345,17 @@ static VALUE
     <%=c_iter%>(a, b, c, &g);
     return c;
 }
+
+ #undef ROW_SIZE
+ #undef COL_SIZE
+ #undef CHECK_NARRAY_TYPE
+ #undef CHECK_DIM_GE
+ #undef CHECK_DIM_EQ
+ #undef CHECK_SQUARE
+ #undef CHECK_SIZE_GE
+ #undef CHECK_NON_EMPTY
+ #undef CHECK_SIZE_EQ
+ #undef CHECK_SAME_SHAPE
+ #undef CHECK_INT_EQ
+ #undef CHECK_LEADING_GE
+ #undef COPY_OR_CAST_TO
@@ -0,0 +1,136 @@
+ #ifdef CUDNN_FOUND
+
+ <%
+   cudnn_dtype =
+     case type_name
+     when 'sfloat'
+       'CUDNN_DATA_FLOAT'
+     when 'dfloat'
+       'CUDNN_DATA_DOUBLE'
+     else
+       # CUDNN_DATA_HALF
+       raise 'not supported'
+     end
+ %>
+
+ // cover_all=true is not supported with CUDNN
+ // gx = x.pooling_backward(mode, y, gy, kernel_size, stride: kernel_size, pad: 0, gx: nil)
+ //CUDNN_POOLING_MAX
+ //CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING
+ //CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING
+ //CUDNN_POOLING_MAX_DETERMINISTIC
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
+     cudnnStatus_t status = 0;
+     cudnnHandle_t handle = 0;
+     dtype alpha = 1;
+     dtype beta = 0;
+
+     VALUE x=self, y, gy, mode, kernel_size, stride, pad, gx;
+     VALUE kw_hash = Qnil;
34
+ VALUE opts[4] = {Qundef, Qundef, Qundef};
35
+
36
+ size_t ndim;
37
+ cumo_narray_t *nx;
38
+ size_t *x_shape;
39
+
40
+ VALUE x_cont, y_cont, gy_cont;
41
+ cudnnTensorDescriptor_t x_desc = 0;
42
+ cudnnTensorDescriptor_t y_desc = 0;
43
+ cudnnPoolingDescriptor_t pool_desc = 0;
44
+ char *x_cont_ptr, *y_cont_ptr, *gy_cont_ptr, *gx_ptr;
45
+
46
+ cudnnPoolingMode_t int_mode;
47
+ int int_kernel_size[CUMO_NA_MAX_DIMENSION];
48
+ int int_stride[CUMO_NA_MAX_DIMENSION];
49
+ int int_pad[CUMO_NA_MAX_DIMENSION];
50
+
51
+ rb_scan_args(argc, argv, "4:", &mode, &y, &gy, &kernel_size, &kw_hash);
52
+ rb_get_kwargs(kw_hash, kw_table, 0, 3, opts);
53
+ stride = cumo_cuda_cudnn_option_value(opts[0], Qnil);
54
+ pad = cumo_cuda_cudnn_option_value(opts[1], Qnil);
55
+ gx = cumo_cuda_cudnn_option_value(opts[2], Qnil);
56
+
57
+ CumoGetNArray(x, nx);
58
+
59
+ CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
60
+ if (nx->ndim - 2 < 2) {
61
+ rb_raise(cumo_na_eShapeError, "cuDNN pooling requires number of spatial "
62
+ "dimensions to be greater than or equal to 2, but %d", nx->ndim - 2);
63
+ }
64
+ ndim = nx->ndim - 2; // Number of spatial dimensions
65
+
66
+ // required parameter
67
+ int_mode = (cudnnPoolingMode_t)NUM2INT(mode);
68
+ cumo_cuda_cudnn_get_int_ary(int_kernel_size, kernel_size, ndim, 0);
69
+ // default to kernel_size
70
+ if (stride == Qnil) {
71
+ memcpy(int_stride, int_kernel_size, sizeof(int) * ndim);
72
+ } else {
73
+ cumo_cuda_cudnn_get_int_ary(int_stride, stride, ndim, 0);
74
+ }
75
+ // default to 0
76
+ cumo_cuda_cudnn_get_int_ary(int_pad, pad, ndim, 0);
77
+
78
+ x_shape = nx->shape;
79
+
80
+ if (gx != Qnil) {
81
+ CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(gx, cT);
82
+ }
83
+ else {
84
+ gx = cumo_na_new(cT, ndim + 2, x_shape);
85
+ }
86
+
87
+ x_cont = cumo_na_as_contiguous_array(x);
88
+ y_cont = cumo_na_as_contiguous_array(y);
89
+ gy_cont = cumo_na_as_contiguous_array(gy);
90
+
91
+ x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
92
+ y_cont_ptr = cumo_na_get_offset_pointer_for_read(y_cont);
93
+ gy_cont_ptr = cumo_na_get_offset_pointer_for_read(gy_cont);
94
+ gx_ptr = cumo_na_get_offset_pointer_for_write(gx);
95
+
96
+ status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
97
+ if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
98
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&y_desc, y_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+     status = cumo_cuda_cudnn_CreatePoolingDescriptor(&pool_desc, int_mode, ndim, int_kernel_size, int_stride, int_pad);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+
+     handle = cumo_cuda_cudnn_handle();
+     status = cudnnPoolingBackward(
+             handle,
+             pool_desc,
+             (void*)&alpha,
+             y_desc,
+             (void*)y_cont_ptr,
+             y_desc,
+             (void*)gy_cont_ptr,
+             x_desc,
+             (void*)x_cont_ptr,
+             (void*)&beta,
+             x_desc,
+             (void*)gx_ptr);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+
+ POOLING_ERROR:
+     if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
+     if (y_desc) cudnnDestroyTensorDescriptor(y_desc);
+     if (pool_desc) cudnnDestroyPoolingDescriptor(pool_desc);
+     cumo_cuda_cudnn_check_status(status);
+
+     return gx;
+ }
+
+ #else // CUDNN_FOUND
+ VALUE cumo_cuda_eCUDNNError;
+
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     rb_raise(cumo_cuda_eCUDNNError, "cuDNN is not available");
+ }
+ #endif // CUDNN_FOUND
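For orientation, cudnnPoolingBackward consumes the forward output y together with its gradient gy and the forward input x, and writes the input gradient into gx; that is why the template keeps y around and reuses y_desc for gy and x_desc for gx (gy is expected to share y's shape, gx shares x's). A tiny host-side 1-D illustration of what the MAX mode computes (assumed CUDNN_POOLING_MAX semantics, not cumo code):

    #include <stdio.h>

    int main(void) {
        /* 1-D max pooling, kernel 2, stride 2 (non-overlapping windows). */
        const int in = 6, k = 2, s = 2;
        double x[6]  = {1, 3, 2, 2, 5, 4};
        double gy[3] = {0.1, 0.2, 0.3};   /* gradient w.r.t. the pooled output */
        double gx[6] = {0};

        /* Backward pass: each gy element is routed to the input position that
           produced the window maximum in the forward pass. */
        for (int o = 0; o < in / s; ++o) {
            int start = o * s, argmax = start;
            for (int i = start + 1; i < start + k && i < in; ++i) {
                if (x[i] > x[argmax]) argmax = i;
            }
            gx[argmax] += gy[o];
        }

        for (int i = 0; i < in; ++i) printf("%g ", gx[i]);  /* 0 0.1 0.2 0 0.3 0 */
        printf("\n");
        return 0;
    }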
@@ -0,0 +1,136 @@
+ #ifdef CUDNN_FOUND
+
+ <%
+   cudnn_dtype =
+     case type_name
+     when 'sfloat'
+       'CUDNN_DATA_FLOAT'
+     when 'dfloat'
+       'CUDNN_DATA_DOUBLE'
+     else
+       # CUDNN_DATA_HALF
+       raise 'not supported'
+     end
+ %>
+
+ // cover_all=true is not supported with CUDNN
+ // x.pooling_forward(mode, kernel_size, stride: kernel_size, pad: 0, y: nil)
+ //CUDNN_POOLING_MAX
+ //CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING
+ //CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING
+ //CUDNN_POOLING_MAX_DETERMINISTIC
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
+     cudnnStatus_t status = 0;
+     cudnnHandle_t handle = 0;
+     dtype alpha = 1;
+     dtype beta = 0;
+
+     VALUE x=self, mode, kernel_size, stride, pad, y;
+     VALUE kw_hash = Qnil;
+     ID kw_table[3] = {rb_intern("stride"), rb_intern("pad"), rb_intern("y")};
+     VALUE opts[3] = {Qundef, Qundef, Qundef};
+
+     size_t ndim;
+     cumo_narray_t *nx;
+     size_t *x_shape;
+
+     VALUE x_cont;
+     cudnnTensorDescriptor_t x_desc = 0;
+     cudnnTensorDescriptor_t y_desc = 0;
+     cudnnPoolingDescriptor_t pool_desc = 0;
+     char *x_cont_ptr, *y_ptr;
+
+     cudnnPoolingMode_t int_mode;
+     int int_kernel_size[CUMO_NA_MAX_DIMENSION];
+     int int_stride[CUMO_NA_MAX_DIMENSION];
+     int int_pad[CUMO_NA_MAX_DIMENSION];
+
+     rb_scan_args(argc, argv, "2:", &mode, &kernel_size, &kw_hash);
+     rb_get_kwargs(kw_hash, kw_table, 0, 3, opts);
+     stride = cumo_cuda_cudnn_option_value(opts[0], Qnil);
+     pad = cumo_cuda_cudnn_option_value(opts[1], Qnil);
+     y = cumo_cuda_cudnn_option_value(opts[2], Qnil);
+
+     CumoGetNArray(x, nx);
+
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
+     if (nx->ndim - 2 < 2) {
+         rb_raise(cumo_na_eShapeError, "CUDNN pooling requires number of spatial "
+                 "dimensions to be greater than or equal to 2, but %d", nx->ndim - 2);
+     }
+     ndim = nx->ndim - 2; // Number of spatial dimensions
+
+     // required parameter
+     int_mode = (cudnnPoolingMode_t)NUM2INT(mode);
+     cumo_cuda_cudnn_get_int_ary(int_kernel_size, kernel_size, ndim, 0);
+     // default to kernel_size
+     if (stride == Qnil) {
+         memcpy(int_stride, int_kernel_size, sizeof(int) * ndim);
+     } else {
+         cumo_cuda_cudnn_get_int_ary(int_stride, stride, ndim, 0);
+     }
+     // default to 0
+     cumo_cuda_cudnn_get_int_ary(int_pad, pad, ndim, 0);
+
+     x_shape = nx->shape;
+
+     if (y != Qnil) {
+         CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(y, cT);
+     }
+     else {
+         size_t *y_shape = ALLOCA_N(size_t, ndim + 2);
+         // out_shape = (batch_size, num_channels, out_1, out_2, ..., out_N)
+         y_shape[0] = x_shape[0];
+         y_shape[1] = x_shape[1];
+         for (size_t i = 0; i < ndim; ++i) {
+             y_shape[i + 2] = cumo_cuda_cudnn_GetConvOutDim(
+                     x_shape[i + 2], int_kernel_size[i], int_stride[i], int_pad[i]);
+         }
+         y = cumo_na_new(cT, ndim + 2, y_shape);
+     }
+
+     x_cont = cumo_na_as_contiguous_array(x);
+
+     x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
+     y_ptr = cumo_na_get_offset_pointer_for_write(y);
+
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&y_desc, y, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+     status = cumo_cuda_cudnn_CreatePoolingDescriptor(&pool_desc, int_mode, ndim, int_kernel_size, int_stride, int_pad);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+
+     handle = cumo_cuda_cudnn_handle();
+     status = cudnnPoolingForward(
+             handle,
+             pool_desc,
+             (void*)&alpha,
+             x_desc,
+             (void*)x_cont_ptr,
+             (void*)&beta,
+             y_desc,
+             (void*)y_ptr);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+
+ POOLING_ERROR:
+     if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
+     if (y_desc) cudnnDestroyTensorDescriptor(y_desc);
+     if (pool_desc) cudnnDestroyPoolingDescriptor(pool_desc);
+     cumo_cuda_cudnn_check_status(status);
+
+     return y;
+ }
+
+ #else // CUDNN_FOUND
+ VALUE cumo_cuda_eCUDNNError;
+
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     rb_raise(cumo_cuda_eCUDNNError, "cuDNN is not available");
+ }
+ #endif // CUDNN_FOUND
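The default y shape above is computed per spatial dimension with cumo_cuda_cudnn_GetConvOutDim, which is not shown in this diff; the sketch below assumes the usual cover_all=false formula (in + 2*pad - kernel) / stride + 1, with the stride defaulting to the kernel size as in the template. For example, a (1, 3, 32, 32) input with 2x2 pooling and pad 0 yields (1, 3, 16, 16):

    #include <stdio.h>

    /* Assumed output-length formula for one spatial dimension (cover_all=false). */
    static int pool_out_dim(int in, int k, int s, int p) {
        return (in + 2 * p - k) / s + 1;
    }

    int main(void) {
        int x_shape[4] = {1, 3, 32, 32};   /* (batch, channels, h, w) */
        int k = 2, p = 0;
        int s = k;                         /* stride: nil falls back to kernel_size */
        int y_shape[4] = {
            x_shape[0], x_shape[1],
            pool_out_dim(x_shape[2], k, s, p),
            pool_out_dim(x_shape[3], k, s, p),
        };
        printf("(%d, %d, %d, %d)\n", y_shape[0], y_shape[1], y_shape[2], y_shape[3]);
        return 0;
    }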