cumo 0.2.5 → 0.3.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,244 @@
+ #ifdef CUDNN_FOUND
+
+ <%
+   cudnn_dtype =
+     case type_name
+     when 'sfloat'
+       'CUDNN_DATA_FLOAT'
+     when 'dfloat'
+       'CUDNN_DATA_DOUBLE'
+     else
+       # CUDNN_DATA_HALF
+       raise 'not supported'
+     end
+ %>
+
+ // VALUE is Ruby Array
+ static void
+ get_int_out_size(int* int_out_size, VALUE out_size, size_t ndim, size_t* x_shape, size_t* w_shape, int* int_stride, int* int_pad)
+ {
+     if (out_size == Qnil) {
+         for (size_t i = 0; i < ndim; ++i) {
+             int_out_size[i] = cumo_cuda_cudnn_GetConvTransposeOutDim(
+                     x_shape[i + 2], w_shape[i + 2], int_stride[i], int_pad[i]);
+         }
+     } else {
+         Check_Type(out_size, T_ARRAY);
+         CUMO_CUDA_CUDNN_CHECK_DIM_EQ((size_t)(RARRAY_LEN(out_size)), ndim);
+         for (size_t i = 0; i < ndim; ++i) {
+             int_out_size[i] = NUM2INT(rb_ary_entry(out_size, (long)i));
+         }
+     }
+     // only cover_all=false is supported
+     for (size_t i = 0; i < ndim; ++i) {
+         if (x_shape[i + 2] != cumo_cuda_cudnn_GetConvOutDim(
+                 int_out_size[i], w_shape[i + 2], int_stride[i], int_pad[i])) {
+             rb_raise(rb_eRuntimeError, "CUDA transposed convolution does not support specified output sizes");
+         }
+     }
+ }
+
+ // cover_all=true is not supported with CUDNN
+ // dilation > 1 is not supported yet
+ // x.conv_transpose(w, b: nil, stride: 1, pad: 0, out_size: nil, y: nil)
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
+     cudnnStatus_t status = 0;
+     cudnnHandle_t handle = 0;
+     dtype alpha = 1;
+     dtype beta = 0;
+
+     VALUE x=self, w, b, stride, pad, out_size, y;
+     VALUE kw_hash = Qnil;
+     ID kw_table[5] = {rb_intern("b"), rb_intern("stride"), rb_intern("pad"), rb_intern("out_size"), rb_intern("y")};
+     VALUE opts[5] = {Qundef, Qundef, Qundef, Qundef, Qundef};
+
+     size_t ndim;
+     cumo_narray_t *nx, *nw;
+     size_t *x_shape, *w_shape;
+     size_t out_channels, batch_size;
+
+     VALUE x_cont, w_cont;
+     cudnnTensorDescriptor_t x_desc = 0;
+     cudnnTensorDescriptor_t y_desc = 0;
+     cudnnTensorDescriptor_t b_desc = 0;
+     cudnnFilterDescriptor_t w_desc = 0;
+     cudnnConvolutionDescriptor_t conv_desc = 0;
+     char *x_cont_ptr, *w_cont_ptr, *y_ptr;
+
+     cudnnConvolutionBwdDataAlgoPerf_t perf_result;
+     cudnnConvolutionBwdDataAlgo_t algo;
+     size_t max_workspace_size = CUMO_CUDA_CUDNN_DEFAULT_MAX_WORKSPACE_SIZE;
+     size_t workspace_size;
+     char* workspace = 0;
+
+     int int_stride[CUMO_NA_MAX_DIMENSION];
+     int int_pad[CUMO_NA_MAX_DIMENSION];
+     int int_out_size[CUMO_NA_MAX_DIMENSION];
+
+     rb_scan_args(argc, argv, "1:", &w, &kw_hash);
+     rb_get_kwargs(kw_hash, kw_table, 0, 5, opts);
+     b = cumo_cuda_cudnn_option_value(opts[0], Qnil);
+     stride = cumo_cuda_cudnn_option_value(opts[1], Qnil);
+     pad = cumo_cuda_cudnn_option_value(opts[2], Qnil);
+     out_size = cumo_cuda_cudnn_option_value(opts[3], Qnil);
+     y = cumo_cuda_cudnn_option_value(opts[4], Qnil);
+
+     CumoGetNArray(x, nx);
+     CumoGetNArray(w, nw);
+
+     CUMO_CUDA_CUDNN_CHECK_DIM_EQ(nx->ndim, nw->ndim);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(w, cT);
+     if (nx->ndim - 2 < 2) {
+         rb_raise(cumo_na_eShapeError, "CUDNN convolution requires number of spatial "
+                 "dimensions to be greater than or equal to 2, but %d", nx->ndim - 2);
+     }
+     ndim = nx->ndim - 2; // Number of spatial dimensions
+
+     x_shape = nx->shape;
+     w_shape = nw->shape;
+     batch_size = x_shape[0];   // x_shape = (batch_size, in_channels, d_1, d_2, ..., d_N)
+     out_channels = w_shape[1]; // w.shape = (in_channels, out_channels, k_1, k_2, ..., k_N)
+     if (x_shape[1] != w_shape[0]) {
+         rb_raise(cumo_na_eShapeError, "x_shape[1]:%d does not match with w_shape[0]:%d",
+                 (int)x_shape[1], (int)w_shape[0]);
+     }
+
+     cumo_cuda_cudnn_get_int_ary(int_stride, stride, ndim, 1);
+     cumo_cuda_cudnn_get_int_ary(int_pad, pad, ndim, 0);
+     get_int_out_size(int_out_size, out_size, ndim, x_shape, w_shape, int_stride, int_pad);
+
+     // out_shape = (batch_size, out_channels, out_1, out_2, ..., out_N)
+     if (y != Qnil) {
+         CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(y, cT);
+         // TODO: shape check
+     }
+     else {
+         size_t *y_shape = ALLOCA_N(size_t, ndim + 2);
+         y_shape[0] = batch_size;
+         y_shape[1] = out_channels;
+         for (size_t i = 0; i < ndim; ++i) {
+             y_shape[i + 2] = int_out_size[i];
+         }
+         y = cumo_na_new(cT, ndim + 2, y_shape);
+     }
+
+     x_cont = cumo_na_as_contiguous_array(x);
+     w_cont = cumo_na_as_contiguous_array(w);
+
+     x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
+     w_cont_ptr = cumo_na_get_offset_pointer_for_read(w_cont);
+     y_ptr = cumo_na_get_offset_pointer_for_write(y);
+
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&y_desc, y, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     status = cumo_cuda_cudnn_CreateFilterDescriptor(&w_desc, w_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     status = cumo_cuda_cudnn_CreateConvolutionDescriptor(&conv_desc, ndim, int_stride, int_pad, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+
+     handle = cumo_cuda_cudnn_handle();
+
+     // auto tune
+     status = cumo_cuda_cudnn_FindConvolutionBackwardDataAlgorithm(
+             &perf_result,
+             handle,
+             w_desc,
+             w_cont,
+             x_desc,
+             x_cont,
+             conv_desc,
+             y_desc,
+             y,
+             max_workspace_size,
+             int_stride,
+             int_pad,
+             ndim,
+             cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     algo = perf_result.algo;
+     workspace_size = perf_result.memory;
+
+     workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+     status = cudnnConvolutionBackwardData(
+             handle,
+             (void*)&alpha,
+             w_desc,
+             (void*)w_cont_ptr,
+             x_desc,
+             (void*)x_cont_ptr,
+             conv_desc,
+             algo,
+             (void*)workspace,
+             workspace_size,
+             (void*)&beta,
+             y_desc,
+             (void*)y_ptr);
+     if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+
+     if (b != Qnil) {
+         size_t new_shape[CUMO_NA_MAX_DIMENSION];
+         VALUE b_cont;
+         char* b_cont_ptr;
+         cumo_narray_t *nb, *nb_cont;
+         size_t *b_shape;
+         int b_ndim;
+
+         CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(b, cT);
+         CumoGetNArray(b, nb);
+         new_shape[0] = 1;
+         new_shape[1] = nb->size;
+         for (size_t i = 0; i < ndim; ++i) {
+             new_shape[i + 2] = 1;
+         }
+         b_cont = cumo_na_as_contiguous_array(b);
+         b_cont_ptr = cumo_na_get_offset_pointer_for_read(b_cont);
+         CumoGetNArray(b_cont, nb_cont);
+         b_shape = nb_cont->shape;
+         b_ndim = nb_cont->ndim;
+         // reshape b
+         nb_cont->ndim = ndim + 2;
+         nb_cont->shape = new_shape;
+         status = cumo_cuda_cudnn_CreateTensorDescriptor(&b_desc, b_cont, cudnn_dtype);
+         // restore b.shape
+         nb_cont->ndim = b_ndim;
+         nb_cont->shape = b_shape;
+         if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+
+         status = cudnnAddTensor(
+                 handle,
+                 (void*)&alpha,
+                 b_desc,
+                 (void*)b_cont_ptr,
+                 (void*)&alpha,
+                 y_desc,
+                 (void*)y_ptr);
+         if (status != CUDNN_STATUS_SUCCESS) goto CONV_ERROR;
+     }
+
+ CONV_ERROR:
+     if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
+     if (y_desc) cudnnDestroyTensorDescriptor(y_desc);
+     if (b_desc) cudnnDestroyTensorDescriptor(b_desc);
+     if (w_desc) cudnnDestroyFilterDescriptor(w_desc);
+     if (conv_desc) cudnnDestroyConvolutionDescriptor(conv_desc);
+     if (workspace) cumo_cuda_runtime_free(workspace);
+     cumo_cuda_cudnn_check_status(status);
+
+     return y;
+ }
+
+ #else // CUDNN_FOUND
+ VALUE cumo_cuda_eCUDNNError;
+
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     rb_raise(cumo_cuda_eCUDNNError, "cuDNN is not available");
+ }
+ #endif // CUDNN_FOUND
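Note on the default output size used above: cumo_cuda_cudnn_GetConvTransposeOutDim and cumo_cuda_cudnn_GetConvOutDim are defined elsewhere in the gem and are not part of this diff, but for cover_all=false they presumably follow the standard convolution arithmetic. A minimal standalone sketch of that arithmetic, using hypothetical helper names, shows why get_int_out_size accepts an out_size only if a forward convolution maps it back onto the input size:

    #include <stdio.h>

    /* Forward convolution output length for cover_all=false (assumed formula):
       out = (in + 2*pad - kernel) / stride + 1 */
    static int conv_out_dim(int in, int k, int s, int p) {
        return (in + 2 * p - k) / s + 1;
    }

    /* Transposed convolution output length, used when out_size: is nil
       (assumed formula): out = stride * (in - 1) + kernel - 2*pad */
    static int conv_transpose_out_dim(int in, int k, int s, int p) {
        return s * (in - 1) + k - 2 * p;
    }

    int main(void) {
        int in = 7, k = 4, s = 2, p = 1;
        int out = conv_transpose_out_dim(in, k, s, p);   /* 14 */
        /* The round trip reproduces the input size, so this out_size would
           pass the cover_all=false check in get_int_out_size. */
        printf("out=%d round_trip=%d\n", out, conv_out_dim(out, k, s, p));  /* 14 7 */
        return 0;
    }

An out_size for which the round trip does not return the original spatial size is rejected with the rb_eRuntimeError shown above.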
@@ -345,3 +345,17 @@ static VALUE
     <%=c_iter%>(a, b, c, &g);
     return c;
 }
+
+ #undef ROW_SIZE
+ #undef COL_SIZE
+ #undef CHECK_NARRAY_TYPE
+ #undef CHECK_DIM_GE
+ #undef CHECK_DIM_EQ
+ #undef CHECK_SQUARE
+ #undef CHECK_SIZE_GE
+ #undef CHECK_NON_EMPTY
+ #undef CHECK_SIZE_EQ
+ #undef CHECK_SAME_SHAPE
+ #undef CHECK_INT_EQ
+ #undef CHECK_LEADING_GE
+ #undef COPY_OR_CAST_TO
@@ -0,0 +1,136 @@
+ #ifdef CUDNN_FOUND
+
+ <%
+   cudnn_dtype =
+     case type_name
+     when 'sfloat'
+       'CUDNN_DATA_FLOAT'
+     when 'dfloat'
+       'CUDNN_DATA_DOUBLE'
+     else
+       # CUDNN_DATA_HALF
+       raise 'not supported'
+     end
+ %>
+
+ // cover_all=true is not supported with CUDNN
+ // gx = x.pooling_backward(mode, y, gy, kernel_size, stride: kernel_size, pad: 0, gx: nil)
+ //CUDNN_POOLING_MAX
+ //CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING
+ //CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING
+ //CUDNN_POOLING_MAX_DETERMINISTIC
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
+     cudnnStatus_t status = 0;
+     cudnnHandle_t handle = 0;
+     dtype alpha = 1;
+     dtype beta = 0;
+
+     VALUE x=self, y, gy, mode, kernel_size, stride, pad, gx;
+     VALUE kw_hash = Qnil;
34
+ VALUE opts[4] = {Qundef, Qundef, Qundef};
35
+
36
+ size_t ndim;
37
+ cumo_narray_t *nx;
38
+ size_t *x_shape;
39
+
40
+ VALUE x_cont, y_cont, gy_cont;
41
+ cudnnTensorDescriptor_t x_desc = 0;
42
+ cudnnTensorDescriptor_t y_desc = 0;
43
+ cudnnPoolingDescriptor_t pool_desc = 0;
44
+ char *x_cont_ptr, *y_cont_ptr, *gy_cont_ptr, *gx_ptr;
45
+
46
+ cudnnPoolingMode_t int_mode;
47
+ int int_kernel_size[CUMO_NA_MAX_DIMENSION];
48
+ int int_stride[CUMO_NA_MAX_DIMENSION];
49
+ int int_pad[CUMO_NA_MAX_DIMENSION];
50
+
51
+ rb_scan_args(argc, argv, "4:", &mode, &y, &gy, &kernel_size, &kw_hash);
52
+ rb_get_kwargs(kw_hash, kw_table, 0, 3, opts);
53
+ stride = cumo_cuda_cudnn_option_value(opts[0], Qnil);
54
+ pad = cumo_cuda_cudnn_option_value(opts[1], Qnil);
55
+ gx = cumo_cuda_cudnn_option_value(opts[2], Qnil);
56
+
57
+ CumoGetNArray(x, nx);
58
+
59
+ CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
60
+ if (nx->ndim - 2 < 2) {
61
+ rb_raise(cumo_na_eShapeError, "cuDNN pooling requires number of spatial "
62
+ "dimensions to be greater than or equal to 2, but %d", nx->ndim - 2);
63
+ }
64
+ ndim = nx->ndim - 2; // Number of spatial dimensions
65
+
66
+ // required parameter
67
+ int_mode = (cudnnPoolingMode_t)NUM2INT(mode);
68
+ cumo_cuda_cudnn_get_int_ary(int_kernel_size, kernel_size, ndim, 0);
69
+ // default to kernel_size
70
+ if (stride == Qnil) {
71
+ memcpy(int_stride, int_kernel_size, sizeof(int) * ndim);
72
+ } else {
73
+ cumo_cuda_cudnn_get_int_ary(int_stride, stride, ndim, 0);
74
+ }
75
+ // default to 0
76
+ cumo_cuda_cudnn_get_int_ary(int_pad, pad, ndim, 0);
77
+
78
+ x_shape = nx->shape;
79
+
80
+ if (gx != Qnil) {
81
+ CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(gx, cT);
82
+ }
83
+ else {
84
+ gx = cumo_na_new(cT, ndim + 2, x_shape);
85
+ }
86
+
87
+ x_cont = cumo_na_as_contiguous_array(x);
88
+ y_cont = cumo_na_as_contiguous_array(y);
89
+ gy_cont = cumo_na_as_contiguous_array(gy);
90
+
91
+ x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
92
+ y_cont_ptr = cumo_na_get_offset_pointer_for_read(y_cont);
93
+ gy_cont_ptr = cumo_na_get_offset_pointer_for_read(gy_cont);
94
+ gx_ptr = cumo_na_get_offset_pointer_for_write(gx);
95
+
96
+ status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
97
+ if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
98
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&y_desc, y_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+     status = cumo_cuda_cudnn_CreatePoolingDescriptor(&pool_desc, int_mode, ndim, int_kernel_size, int_stride, int_pad);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+
+     handle = cumo_cuda_cudnn_handle();
+     status = cudnnPoolingBackward(
+             handle,
+             pool_desc,
+             (void*)&alpha,
+             y_desc,
+             (void*)y_cont_ptr,
+             y_desc,
+             (void*)gy_cont_ptr,
+             x_desc,
+             (void*)x_cont_ptr,
+             (void*)&beta,
+             x_desc,
+             (void*)gx_ptr);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+
+ POOLING_ERROR:
+     if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
+     if (y_desc) cudnnDestroyTensorDescriptor(y_desc);
+     if (pool_desc) cudnnDestroyPoolingDescriptor(pool_desc);
+     cumo_cuda_cudnn_check_status(status);
+
+     return gx;
+ }
+
+ #else // CUDNN_FOUND
+ VALUE cumo_cuda_eCUDNNError;
+
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     rb_raise(cumo_cuda_eCUDNNError, "cuDNN is not available");
+ }
+ #endif // CUDNN_FOUND
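For orientation, cudnnPoolingBackward consumes the forward output y together with its gradient gy and the forward input x, and writes the input gradient into gx; that is why the template keeps y around and reuses y_desc for gy and x_desc for gx (gy is expected to share y's shape, gx shares x's). A tiny host-side 1-D illustration of what the MAX mode computes (assumed CUDNN_POOLING_MAX semantics, not cumo code):

    #include <stdio.h>

    int main(void) {
        /* 1-D max pooling, kernel 2, stride 2 (non-overlapping windows). */
        const int in = 6, k = 2, s = 2;
        double x[6]  = {1, 3, 2, 2, 5, 4};
        double gy[3] = {0.1, 0.2, 0.3};   /* gradient w.r.t. the pooled output */
        double gx[6] = {0};

        /* Backward pass: each gy element is routed to the input position that
           produced the window maximum in the forward pass. */
        for (int o = 0; o < in / s; ++o) {
            int start = o * s, argmax = start;
            for (int i = start + 1; i < start + k && i < in; ++i) {
                if (x[i] > x[argmax]) argmax = i;
            }
            gx[argmax] += gy[o];
        }

        for (int i = 0; i < in; ++i) printf("%g ", gx[i]);  /* 0 0.1 0.2 0 0.3 0 */
        printf("\n");
        return 0;
    }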
@@ -0,0 +1,136 @@
+ #ifdef CUDNN_FOUND
+
+ <%
+   cudnn_dtype =
+     case type_name
+     when 'sfloat'
+       'CUDNN_DATA_FLOAT'
+     when 'dfloat'
+       'CUDNN_DATA_DOUBLE'
+     else
+       # CUDNN_DATA_HALF
+       raise 'not supported'
+     end
+ %>
+
+ // cover_all=true is not supported with CUDNN
+ // x.pooling_forward(mode, kernel_size, stride: kernel_size, pad: 0, y: nil)
+ //CUDNN_POOLING_MAX
+ //CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING
+ //CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING
+ //CUDNN_POOLING_MAX_DETERMINISTIC
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     cudnnDataType_t cudnn_dtype = <%= cudnn_dtype %>;
+     cudnnStatus_t status = 0;
+     cudnnHandle_t handle = 0;
+     dtype alpha = 1;
+     dtype beta = 0;
+
+     VALUE x=self, mode, kernel_size, stride, pad, y;
+     VALUE kw_hash = Qnil;
+     ID kw_table[3] = {rb_intern("stride"), rb_intern("pad"), rb_intern("y")};
+     VALUE opts[3] = {Qundef, Qundef, Qundef};
+
+     size_t ndim;
+     cumo_narray_t *nx;
+     size_t *x_shape;
+
+     VALUE x_cont;
+     cudnnTensorDescriptor_t x_desc = 0;
+     cudnnTensorDescriptor_t y_desc = 0;
+     cudnnPoolingDescriptor_t pool_desc = 0;
+     char *x_cont_ptr, *y_ptr;
+
+     cudnnPoolingMode_t int_mode;
+     int int_kernel_size[CUMO_NA_MAX_DIMENSION];
+     int int_stride[CUMO_NA_MAX_DIMENSION];
+     int int_pad[CUMO_NA_MAX_DIMENSION];
+
+     rb_scan_args(argc, argv, "2:", &mode, &kernel_size, &kw_hash);
+     rb_get_kwargs(kw_hash, kw_table, 0, 3, opts);
+     stride = cumo_cuda_cudnn_option_value(opts[0], Qnil);
+     pad = cumo_cuda_cudnn_option_value(opts[1], Qnil);
+     y = cumo_cuda_cudnn_option_value(opts[2], Qnil);
+
+     CumoGetNArray(x, nx);
+
+     CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(x, cT);
+     if (nx->ndim - 2 < 2) {
+         rb_raise(cumo_na_eShapeError, "CUDNN pooling requires number of spatial "
+                 "dimensions to be greater than or equal to 2, but %d", nx->ndim - 2);
+     }
+     ndim = nx->ndim - 2; // Number of spatial dimensions
+
+     // required parameter
+     int_mode = (cudnnPoolingMode_t)NUM2INT(mode);
+     cumo_cuda_cudnn_get_int_ary(int_kernel_size, kernel_size, ndim, 0);
+     // default to kernel_size
+     if (stride == Qnil) {
+         memcpy(int_stride, int_kernel_size, sizeof(int) * ndim);
+     } else {
+         cumo_cuda_cudnn_get_int_ary(int_stride, stride, ndim, 0);
+     }
+     // default to 0
+     cumo_cuda_cudnn_get_int_ary(int_pad, pad, ndim, 0);
+
+     x_shape = nx->shape;
+
+     if (y != Qnil) {
+         CUMO_CUDA_CUDNN_CHECK_NARRAY_TYPE(y, cT);
+     }
+     else {
+         size_t *y_shape = ALLOCA_N(size_t, ndim + 2);
+         // out_shape = (batch_size, num_channels, out_1, out_2, ..., out_N)
+         y_shape[0] = x_shape[0];
+         y_shape[1] = x_shape[1];
+         for (size_t i = 0; i < ndim; ++i) {
+             y_shape[i + 2] = cumo_cuda_cudnn_GetConvOutDim(
+                     x_shape[i + 2], int_kernel_size[i], int_stride[i], int_pad[i]);
+         }
+         y = cumo_na_new(cT, ndim + 2, y_shape);
+     }
+
+     x_cont = cumo_na_as_contiguous_array(x);
+
+     x_cont_ptr = cumo_na_get_offset_pointer_for_read(x_cont);
+     y_ptr = cumo_na_get_offset_pointer_for_write(y);
+
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&x_desc, x_cont, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+     status = cumo_cuda_cudnn_CreateTensorDescriptor(&y_desc, y, cudnn_dtype);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+     status = cumo_cuda_cudnn_CreatePoolingDescriptor(&pool_desc, int_mode, ndim, int_kernel_size, int_stride, int_pad);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+
+     handle = cumo_cuda_cudnn_handle();
+     status = cudnnPoolingForward(
+             handle,
+             pool_desc,
+             (void*)&alpha,
+             x_desc,
+             (void*)x_cont_ptr,
+             (void*)&beta,
+             y_desc,
+             (void*)y_ptr);
+     if (status != CUDNN_STATUS_SUCCESS) goto POOLING_ERROR;
+
+ POOLING_ERROR:
+     if (x_desc) cudnnDestroyTensorDescriptor(x_desc);
+     if (y_desc) cudnnDestroyTensorDescriptor(y_desc);
+     if (pool_desc) cudnnDestroyPoolingDescriptor(pool_desc);
+     cumo_cuda_cudnn_check_status(status);
+
+     return y;
+ }
+
+ #else // CUDNN_FOUND
+ VALUE cumo_cuda_eCUDNNError;
+
+ static VALUE
+ <%=c_func(-1)%>(int argc, VALUE argv[], VALUE self)
+ {
+     rb_raise(cumo_cuda_eCUDNNError, "cuDNN is not available");
+ }
+ #endif // CUDNN_FOUND
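The default y shape above is computed per spatial dimension with cumo_cuda_cudnn_GetConvOutDim, which is not shown in this diff; the sketch below assumes the usual cover_all=false formula (in + 2*pad - kernel) / stride + 1, with the stride defaulting to the kernel size as in the template. For example, a (1, 3, 32, 32) input with 2x2 pooling and pad 0 yields (1, 3, 16, 16):

    #include <stdio.h>

    /* Assumed output-length formula for one spatial dimension (cover_all=false). */
    static int pool_out_dim(int in, int k, int s, int p) {
        return (in + 2 * p - k) / s + 1;
    }

    int main(void) {
        int x_shape[4] = {1, 3, 32, 32};   /* (batch, channels, h, w) */
        int k = 2, p = 0;
        int s = k;                         /* stride: nil falls back to kernel_size */
        int y_shape[4] = {
            x_shape[0], x_shape[1],
            pool_out_dim(x_shape[2], k, s, p),
            pool_out_dim(x_shape[3], k, s, p),
        };
        printf("(%d, %d, %d, %d)\n", y_shape[0], y_shape[1], y_shape[2], y_shape[3]);
        return 0;
    }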