cumo 0.2.5 → 0.3.0.pre1

@@ -0,0 +1,244 @@
+ #ifdef CUDNN_FOUND
+
+ <%
+   cudnn_dtype =
+     case type_name
+     when 'sfloat'
+       'CUDNN_DATA_FLOAT'
+     when 'dfloat'
+       'CUDNN_DATA_DOUBLE'
+     else
+       # CUDNN_DATA_HALF
+       raise 'not supported'
+     end
+ %>
+
+ // VALUE is Ruby Array
+ static void
+ get_int_out_size(int* int_out_size, VALUE out_size, size_t ndim, size_t* x_shape, size_t* w_shape, int* int_stride, int* int_pad)
+ {
+     if (out_size == Qnil) {
+         for (size_t i = 0; i < ndim; ++i) {
+             int_out_size[i] = cumo_cuda_cudnn_GetConvTransposeOutDim(
+                     x_shape[i + 2], w_shape[i + 2], int_stride[i], int_pad[i]);
+         }
+     } else {
+         Check_Type(out_size, T_ARRAY);
+         CUMO_CUDA_CUDNN_CHECK_DIM_EQ((size_t)(RARRAY_LEN(out_size)), ndim);
+         for (size_t i = 0; i < ndim; ++i) {
+             int_out_size[i] = NUM2INT(rb_ary_entry(out_size, (long)i));
+         }
+     }
+     // only cover_all=false is supported
+     for (size_t i = 0; i < ndim; ++i) {
+         if (x_shape[i + 2] != cumo_cuda_cudnn_GetConvOutDim(
+                     int_out_size[i], w_shape[i + 2], int_stride[i], int_pad[i])) {
+             rb_raise(rb_eRuntimeError, "CUDA transposed convolution does not support specified output sizes");
+         }
+     }
+ }
+
+ // cover_all=true is not supported with CUDNN
+ // dilation > 1 is not supported yet
+ // x.conv_transpose(w, b: nil, stride: 1, pad: 0, out_size: nil, y: nil)
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
+     cudnnStatus_t status = 0;
+     cudnnHandle_t handle = 0;
+     dtype alpha = 1;
+     dtype beta = 0;
+
+     VALUE x=self, w, b, stride, pad, out_size, y;
+     VALUE kw_hash = Qnil;
+     ID kw_table[5] = {rb_intern("b"), rb_intern("stride"), rb_intern("pad"), rb_intern("out_size"), rb_intern("y")};
+     VALUE opts[5] = {Qundef, Qundef, Qundef, Qundef, Qundef};
+
+     size_t ndim;
+     cumo_narray_t *nx, *nw;
+     size_t *x_shape, *w_shape;
+     size_t out_channels, batch_size;
+
+     VALUE x_cont, w_cont;
+     cudnnTensorDescriptor_t x_desc = 0;
+     cudnnTensorDescriptor_t y_desc = 0;
+     cudnnTensorDescriptor_t b_desc = 0;
+     cudnnFilterDescriptor_t w_desc = 0;
+     cudnnConvolutionDescriptor_t conv_desc = 0;
+     char *x_cont_ptr, *w_cont_ptr, *y_ptr;
+
+     cudnnConvolutionBwdDataAlgoPerf_t perf_result;
+     cudnnConvolutionBwdDataAlgo_t algo;
+     size_t max_workspace_size = CUMO_CUDA_CUDNN_DEFAULT_MAX_WORKSPACE_SIZE;
+     size_t workspace_size;
+     char* workspace = 0;
+
+     int int_stride[CUMO_NA_MAX_DIMENSION];
+     int int_pad[CUMO_NA_MAX_DIMENSION];
+     int int_out_size[CUMO_NA_MAX_DIMENSION];
+
+     rb_scan_args(argc, argv, "1:", &w, &kw_hash);
+     rb_get_kwargs(kw_hash, kw_table, 0, 5, opts);  // 5 optional keywords: b, stride, pad, out_size, y
+     b = cumo_cuda_cudnn_option_value(opts[0], Qnil);
+     stride = cumo_cuda_cudnn_option_value(opts[1], Qnil);
+     pad = cumo_cuda_cudnn_option_value(opts[2], Qnil);
+     out_size = cumo_cuda_cudnn_option_value(opts[3], Qnil);
+     y = cumo_cuda_cudnn_option_value(opts[4], Qnil);
+
+     CumoGetNArray(x, nx);
+     CumoGetNArray(w, nw);
+
+     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(nx->ndim, nw->ndim);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(w, cT);
+     if (nx->ndim - 2 < 2) {
+         rb_raise(cumo_na_eShapeError, "CUDNN convolution requires number of spatial "
+                 "dimensions to be greater than or equal to 2, but %d", nx->ndim - 2);
+     }
+     ndim = nx->ndim - 2; // Number of spatial dimensions
+
+     x_shape = nx->shape;
+     w_shape = nw->shape;
+     batch_size = x_shape[0];   // x_shape = (batch_size, in_channels, d_1, d_2, ..., d_N)
+     out_channels = w_shape[1]; // w.shape = (in_channels, out_channels, k_1, k_2, ..., k_N)
+     if (x_shape[1] != w_shape[0]) {
+         rb_raise(cumo_na_eShapeError, "x_shape[1]:%d does not match with w_shape[0]:%d",
+                 (int)x_shape[1], (int)w_shape[0]);
+     }
+
+     cumo_cuda_cudnn_get_int_ary(int_stride, stride, ndim, 1);
+     cumo_cuda_cudnn_get_int_ary(int_pad, pad, ndim, 0);
+     get_int_out_size(int_out_size, out_size, ndim, x_shape, w_shape, int_stride, int_pad);
+
+     // out_shape = (batch_size, out_channels, out_1, out_2, ..., out_N)
+     if (y != Qnil) {
+         CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(y, cT);
+         // TODO: shape check
+     }
+     else {
+         size_t *y_shape = ALLOCA_N(size_t, ndim + 2);
+         y_shape[0] = batch_size;
+         y_shape[1] = out_channels;
+         for (size_t i = 0; i < ndim; ++i) {
+             y_shape[i + 2] = int_out_size[i];
+         }
+         y = cumo_na_new(cT, ndim + 2, y_shape);
+     }
+
+     x_cont = cumo_na_as_contiguous_array(x);
+     w_cont = cumo_na_as_contiguous_array(w);
+
+     x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
+     w_cont_ptr = cumo_na_get_offset_pointer_for_read(w_cont);
+     y_ptr = cumo_na_get_offset_pointer_for_write(y);
+
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&y_desc, y, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     status = cumo_cuda_cudnn_CreateFilterDescriptor(&w_desc, w_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     status = cumo_cuda_cudnn_CreateConvolutionDescriptor(&conv_desc, ndim, int_stride, int_pad, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+
+     handle = cumo_cuda_cudnn_handle();
+
+     // auto tune
+     status = cumo_cuda_cudnn_FindConvolutionBackwardDataAlgorithm(
+             &perf_result,
+             handle,
+             w_desc,
+             w_cont,
+             x_desc,
+             x_cont,
+             conv_desc,
+             y_desc,
+             y,
+             max_workspace_size,
+             int_stride,
+             int_pad,
+             ndim,
+             cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     algo = perf_result.algo;
+     workspace_size = perf_result.memory;
+
+     workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+     status = cudnnConvolutionBackwardData(
+             handle,
+             (void*)&alpha,
+             w_desc,
+             (void*)w_cont_ptr,
+             x_desc,
+             (void*)x_cont_ptr,
+             conv_desc,
+             algo,
+             (void*)workspace,
+             workspace_size,
+             (void*)&beta,
+             y_desc,
+             (void*)y_ptr);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+
+     if (b != Qnil) {
+         size_t new_shape[CUMO_NA_MAX_DIMENSION];
+         VALUE b_cont;
+         char* b_cont_ptr;
+         cumo_narray_t *nb, *nb_cont;
+         size_t *b_shape;
+         int b_ndim;
+
+         CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(b, cT);
+         CumoGetNArray(b, nb);
+         new_shape[0] = 1;
+         new_shape[1] = nb->size;
+         for (size_t i = 0; i < ndim; ++i) {
+             new_shape[i + 2] = 1;
+         }
+         b_cont = cumo_na_as_contiguous_array(b);
+         b_cont_ptr = cumo_na_get_offset_pointer_for_read(b_cont);
+         CumoGetNArray(b_cont, nb_cont);
+         b_shape = nb_cont->shape;
+         b_ndim = nb_cont->ndim;
+         // reshape b
+         nb_cont->ndim = ndim + 2;
+         nb_cont->shape = new_shape;
+         status = cumo_cuda_cudnn_CreateTensorDescriptor(&b_desc, b_cont, cudnn_dtype);
+         // restore b.shape
+         nb_cont->ndim = b_ndim;
+         nb_cont->shape = b_shape;
+         if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+
+         status = cudnnAddTensor(
+                 handle,
+                 (void*)&alpha,
+                 b_desc,
+                 (void*)b_cont_ptr,
+                 (void*)&alpha,
+                 y_desc,
+                 (void*)y_ptr);
+         if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     }
+
+ CONV_ERROR:
+     if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
+     if (y_desc) cudnnDestroyTensorDescriptor(y_desc);
+     if (b_desc) cudnnDestroyTensorDescriptor(b_desc);
+     if (w_desc) cudnnDestroyFilterDescriptor(w_desc);
+     if (conv_desc) cudnnDestroyConvolutionDescriptor(conv_desc);
+     if (workspace) cumo_cuda_runtime_free(workspace);
+     cumo_cuda_cudnn_check_status(status);
+
+     return y;
+ }
+
+ #else // CUDNN_FOUND
+ VALUE cumo_cuda_eCUDNNError;
+
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     rb_raise(cumo_cuda_eCUDNNError, "cuDNN is not available");
+ }
+ #endif // CUDNN_FOUND
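
For orientation, here is a minimal Ruby-level sketch of how the transposed-convolution method generated from the template above might be called. The method name (conv_transpose), the Cumo::SFloat class, and the array form of stride/pad are illustrative assumptions inferred from the signature comment and shape checks, not an excerpt from the gem:

    # hypothetical usage; names, shapes, and defaults are assumed from the template above
    x = Cumo::SFloat.new(2, 3, 8, 8).seq   # (batch_size, in_channels, d_1, d_2)
    w = Cumo::SFloat.new(3, 4, 3, 3).seq   # (in_channels, out_channels, k_1, k_2)
    b = Cumo::SFloat.new(4).seq            # one bias per output channel
    y = x.conv_transpose(w, b: b, stride: [2, 2], pad: [1, 1])
    # With cover_all=false and no dilation, each spatial output size is
    #   out = stride * (in - 1) + kernel - 2 * pad
    # here out = 2 * (8 - 1) + 3 - 2 * 1 = 15, so y.shape == [2, 4, 15, 15]

When out_size: is given, get_int_out_size re-derives the forward convolution size from it and raises unless that size reproduces x's spatial dimensions, which is why only cover_all=false output sizes are accepted.
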
@@ -345,3 +345,17 @@ static VALUE
  <%=c_iter%>(a, b, c, &g);
  return c;
  }
+
+ #undef ROW_SIZE
+ #undef COL_SIZE
+ #undef CHECK_NARRAY_TYPE
+ #undef CHECK_DIM_GE
+ #undef CHECK_DIM_EQ
+ #undef CHECK_SQUARE
+ #undef CHECK_SIZE_GE
+ #undef CHECK_NON_EMPTY
+ #undef CHECK_SIZE_EQ
+ #undef CHECK_SAME_SHAPE
+ #undef CHECK_INT_EQ
+ #undef CHECK_LEADING_GE
+ #undef COPY_OR_CAST_TO
@@ -0,0 +1,136 @@
+ #ifdef CUDNN_FOUND
+
+ <%
+   cudnn_dtype =
+     case type_name
+     when 'sfloat'
+       'CUDNN_DATA_FLOAT'
+     when 'dfloat'
+       'CUDNN_DATA_DOUBLE'
+     else
+       # CUDNN_DATA_HALF
+       raise 'not supported'
+     end
+ %>
+
+ // cover_all=true is not supported with CUDNN
+ // gx = x.pooling_backward(mode, y, gy, kernel_size, stride: kernel_size, pad: 0, gx: nil)
+ // mode is one of:
+ //   CUDNN_POOLING_MAX
+ //   CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING
+ //   CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING
+ //   CUDNN_POOLING_MAX_DETERMINISTIC
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
+     cudnnStatus_t status = 0;
+     cudnnHandle_t handle = 0;
+     dtype alpha = 1;
+     dtype beta = 0;
+
+     VALUE x=self, y, gy, mode, kernel_size, stride, pad, gx;
+     VALUE kw_hash = Qnil;
+     ID kw_table[3] = {rb_intern("stride"), rb_intern("pad"), rb_intern("gx")};
+     VALUE opts[3] = {Qundef, Qundef, Qundef};
+
+     size_t ndim;
+     cumo_narray_t *nx;
+     size_t *x_shape;
+
+     VALUE x_cont, y_cont, gy_cont;
+     cudnnTensorDescriptor_t x_desc = 0;
+     cudnnTensorDescriptor_t y_desc = 0;
+     cudnnPoolingDescriptor_t pool_desc = 0;
+     char *x_cont_ptr, *y_cont_ptr, *gy_cont_ptr, *gx_ptr;
+
+     cudnnPoolingMode_t int_mode;
+     int int_kernel_size[CUMO_NA_MAX_DIMENSION];
+     int int_stride[CUMO_NA_MAX_DIMENSION];
+     int int_pad[CUMO_NA_MAX_DIMENSION];
+
+     rb_scan_args(argc, argv, "4:", &mode, &y, &gy, &kernel_size, &kw_hash);
+     rb_get_kwargs(kw_hash, kw_table, 0, 3, opts);
+     stride = cumo_cuda_cudnn_option_value(opts[0], Qnil);
+     pad = cumo_cuda_cudnn_option_value(opts[1], Qnil);
+     gx = cumo_cuda_cudnn_option_value(opts[2], Qnil);
+
+     CumoGetNArray(x, nx);
+
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
+     if (nx->ndim - 2 < 2) {
+         rb_raise(cumo_na_eShapeError, "cuDNN pooling requires number of spatial "
+                 "dimensions to be greater than or equal to 2, but %d", nx->ndim - 2);
+     }
+     ndim = nx->ndim - 2; // Number of spatial dimensions
+
+     // required parameter
+     int_mode = (cudnnPoolingMode_t)NUM2INT(mode);
+     cumo_cuda_cudnn_get_int_ary(int_kernel_size, kernel_size, ndim, 0);
+     // default to kernel_size
+     if (stride == Qnil) {
+         memcpy(int_stride, int_kernel_size, sizeof(int) * ndim);
+     } else {
+         cumo_cuda_cudnn_get_int_ary(int_stride, stride, ndim, 0);
+     }
+     // default to 0
+     cumo_cuda_cudnn_get_int_ary(int_pad, pad, ndim, 0);
+
+     x_shape = nx->shape;
+
+     if (gx != Qnil) {
+         CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(gx, cT);
+     }
+     else {
+         gx = cumo_na_new(cT, ndim + 2, x_shape);
+     }
+
+     x_cont = cumo_na_as_contiguous_array(x);
+     y_cont = cumo_na_as_contiguous_array(y);
+     gy_cont = cumo_na_as_contiguous_array(gy);
+
+     x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
+     y_cont_ptr = cumo_na_get_offset_pointer_for_read(y_cont);
+     gy_cont_ptr = cumo_na_get_offset_pointer_for_read(gy_cont);
+     gx_ptr = cumo_na_get_offset_pointer_for_write(gx);
+
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&y_desc, y, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+     status = cumo_cuda_cudnn_CreatePoolingDescriptor(&pool_desc, int_mode, ndim, int_kernel_size, int_stride, int_pad);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+
+     handle = cumo_cuda_cudnn_handle();
+     status = cudnnPoolingBackward(
+             handle,
+             pool_desc,
+             (void*)&alpha,
+             y_desc,
+             (void*)y_cont_ptr,
+             y_desc,
+             (void*)gy_cont_ptr,
+             x_desc,
+             (void*)x_cont_ptr,
+             (void*)&beta,
+             x_desc,
+             (void*)gx_ptr);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+
+ POOLING_ERROR:
+     if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
+     if (y_desc) cudnnDestroyTensorDescriptor(y_desc);
+     if (pool_desc) cudnnDestroyPoolingDescriptor(pool_desc);
+     cumo_cuda_cudnn_check_status(status);
+
+     return gx;
+ }
+
+ #else // CUDNN_FOUND
+ VALUE cumo_cuda_eCUDNNError;
+
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     rb_raise(cumo_cuda_eCUDNNError, "cuDNN is not available");
+ }
+ #endif // CUDNN_FOUND
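
A corresponding sketch for the pooling backward template above: gx is allocated with x's full shape, and y/gy must be the forward output and its gradient. The mode argument is read with NUM2INT, so it is passed as the integer value of the cuDNN pooling enum; the value 0 for CUDNN_POOLING_MAX, the class, and the method names are assumptions for illustration:

    # hypothetical usage; mode 0 == CUDNN_POOLING_MAX is an assumption
    x  = Cumo::SFloat.new(2, 3, 8, 8).seq
    y  = x.pooling_forward(0, [2, 2], stride: [2, 2], pad: [0, 0])  # forward template below
    gy = Cumo::SFloat.ones(*y.shape)   # upstream gradient, same shape as y
    gx = x.pooling_backward(0, y, gy, [2, 2], stride: [2, 2], pad: [0, 0])
    # gx.shape == x.shape == [2, 3, 8, 8]
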
@@ -0,0 +1,136 @@
+ #ifdef CUDNN_FOUND
+
+ <%
+   cudnn_dtype =
+     case type_name
+     when 'sfloat'
+       'CUDNN_DATA_FLOAT'
+     when 'dfloat'
+       'CUDNN_DATA_DOUBLE'
+     else
+       # CUDNN_DATA_HALF
+       raise 'not supported'
+     end
+ %>
+
+ // cover_all=true is not supported with CUDNN
+ // x.pooling_forward(mode, kernel_size, stride: kernel_size, pad: 0, y: nil)
+ // mode is one of:
+ //   CUDNN_POOLING_MAX
+ //   CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING
+ //   CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING
+ //   CUDNN_POOLING_MAX_DETERMINISTIC
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
+     cudnnStatus_t status = 0;
+     cudnnHandle_t handle = 0;
+     dtype alpha = 1;
+     dtype beta = 0;
+
+     VALUE x=self, mode, kernel_size, stride, pad, y;
+     VALUE kw_hash = Qnil;
+     ID kw_table[3] = {rb_intern("stride"), rb_intern("pad"), rb_intern("y")};
+     VALUE opts[3] = {Qundef, Qundef, Qundef};
+
+     size_t ndim;
+     cumo_narray_t *nx;
+     size_t *x_shape;
+
+     VALUE x_cont;
+     cudnnTensorDescriptor_t x_desc = 0;
+     cudnnTensorDescriptor_t y_desc = 0;
+     cudnnPoolingDescriptor_t pool_desc = 0;
+     char *x_cont_ptr, *y_ptr;
+
+     cudnnPoolingMode_t int_mode;
+     int int_kernel_size[CUMO_NA_MAX_DIMENSION];
+     int int_stride[CUMO_NA_MAX_DIMENSION];
+     int int_pad[CUMO_NA_MAX_DIMENSION];
+
+     rb_scan_args(argc, argv, "2:", &mode, &kernel_size, &kw_hash);
+     rb_get_kwargs(kw_hash, kw_table, 0, 3, opts);
+     stride = cumo_cuda_cudnn_option_value(opts[0], Qnil);
+     pad = cumo_cuda_cudnn_option_value(opts[1], Qnil);
+     y = cumo_cuda_cudnn_option_value(opts[2], Qnil);
+
+     CumoGetNArray(x, nx);
+
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
+     if (nx->ndim - 2 < 2) {
+         rb_raise(cumo_na_eShapeError, "CUDNN pooling requires number of spatial "
+                 "dimensions to be greater than or equal to 2, but %d", nx->ndim - 2);
+     }
+     ndim = nx->ndim - 2; // Number of spatial dimensions
+
+     // required parameter
+     int_mode = (cudnnPoolingMode_t)NUM2INT(mode);
+     cumo_cuda_cudnn_get_int_ary(int_kernel_size, kernel_size, ndim, 0);
+     // default to kernel_size
+     if (stride == Qnil) {
+         memcpy(int_stride, int_kernel_size, sizeof(int) * ndim);
+     } else {
+         cumo_cuda_cudnn_get_int_ary(int_stride, stride, ndim, 0);
+     }
+     // default to 0
+     cumo_cuda_cudnn_get_int_ary(int_pad, pad, ndim, 0);
+
+     x_shape = nx->shape;
+
+     if (y != Qnil) {
+         CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(y, cT);
+     }
+     else {
+         size_t *y_shape = ALLOCA_N(size_t, ndim + 2);
+         // out_shape = (batch_size, num_channels, out_1, out_2, ..., out_N)
+         y_shape[0] = x_shape[0];
+         y_shape[1] = x_shape[1];
+         for (size_t i = 0; i < ndim; ++i) {
+             y_shape[i + 2] = cumo_cuda_cudnn_GetConvOutDim(
+                     x_shape[i + 2], int_kernel_size[i], int_stride[i], int_pad[i]);
+         }
+         y = cumo_na_new(cT, ndim + 2, y_shape);
+     }
+
+     x_cont = cumo_na_as_contiguous_array(x);
+
+     x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
+     y_ptr = cumo_na_get_offset_pointer_for_write(y);
+
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&y_desc, y, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+     status = cumo_cuda_cudnn_CreatePoolingDescriptor(&pool_desc, int_mode, ndim, int_kernel_size, int_stride, int_pad);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+
+     handle = cumo_cuda_cudnn_handle();
+     status = cudnnPoolingForward(
+             handle,
+             pool_desc,
+             (void*)&alpha,
+             x_desc,
+             (void*)x_cont_ptr,
+             (void*)&beta,
+             y_desc,
+             (void*)y_ptr);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+
+ POOLING_ERROR:
+     if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
+     if (y_desc) cudnnDestroyTensorDescriptor(y_desc);
+     if (pool_desc) cudnnDestroyPoolingDescriptor(pool_desc);
+     cumo_cuda_cudnn_check_status(status);
+
+     return y;
+ }
+
+ #else // CUDNN_FOUND
+ VALUE cumo_cuda_eCUDNNError;
+
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     rb_raise(cumo_cuda_eCUDNNError, "cuDNN is not available");
+ }
+ #endif // CUDNN_FOUND
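
Finally, a sketch for the pooling forward template. When y: is not supplied, the output shape is computed per spatial axis with cumo_cuda_cudnn_GetConvOutDim, i.e. the usual cover_all=false formula out = (in + 2 * pad - kernel) / stride + 1; the names and the mode value below are the same illustrative assumptions as in the previous sketch:

    # hypothetical usage; mode 0 == CUDNN_POOLING_MAX is an assumption
    x = Cumo::SFloat.new(2, 3, 8, 8).seq
    y = x.pooling_forward(0, [2, 2], stride: [2, 2], pad: [0, 0])
    # out = (8 + 2*0 - 2) / 2 + 1 = 4 per spatial axis, so y.shape == [2, 3, 4, 4]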