cumo 0.2.5 → 0.3.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 1b28beaea182d622d304bcb3153e56aa3280993ec079aea44c00b915d1e92b77
- data.tar.gz: 26fc0e1942a444e5f9cb4641b3e36f9985593a10de26188f0a4142e72314d82a
+ metadata.gz: 6fbd39b063f8c40636b699f956ab5ebf4905a58013fd249819845f3dc525f77a
+ data.tar.gz: 66c369f01877aa42e73dba6bcaf5a499f52084024171c7f1b5e561c1da08f7e0
  SHA512:
- metadata.gz: a678cb7965fbbc9febf6b5f2f557f8be34f28c051fc0437a87506d3a067a34778a73b75dbeb56da14fd538062a8454355efd06bb686056db5b4df7cab9c04e86
- data.tar.gz: 30ce98cae4e84ee7e9e73eae3ad76bcaca1e636462301d1afe1aa50e1f50633ed1b16756b90aaeba1a3e0870179d7e2dbee41696b175f0a454efee93e5f89591
+ metadata.gz: 6d98b07a55ead442c4edd2e2e3c648d58d26a1343938eee60ff1cf8ee3bfd9b0539c5c650e349eef5d116ef0f5cd095f077a9cd2586cddcd01023e8e7cdb225e
+ data.tar.gz: 22012ddfb97cde8ff78324c599351bd5aa16bac5747208c573def3b0f2d2f47f0e1d55dc83393685a0e54a006be08bab2c15631ed26c940ecd20cd07efb4a86d
data/CHANGELOG.md CHANGED
@@ -1,4 +1,17 @@
- # 0.2.5 (2019-03-04)
+ # 0.3.0.pre1 (2019-04-09)
+
+ Enhancements:
+
+ * Support cuDNN
+   * conv (cudnnConvolution)
+   * conv_transpose (cudnnConvolutionBackwardData)
+   * conv_grad_w (cudnnConvolutionBackwardFilter)
+   * batch_norm (cudnnBatchNormalization)
+   * batch_norm_backward (cudnnBatchNormalizationBackward)
+   * avg_pool and max_pool (cudnnPoolingForward)
+   * avg_pool_backward and max_pool_backward (cudnnPoolingBackward)
+
+ # 0.2.5 (2019-03-04)

  Enhancements:

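The cuDNN entries above surface as new GPU-backed operations. A minimal sketch of how they might be exercised from Ruby follows; only `Cumo::CUDA::CUDNN.available?` is defined verbatim in this diff, so the `conv` call and its keyword arguments are illustrative assumptions, not the confirmed API.

```
require "cumo"

if Cumo::CUDA::CUDNN.available?
  x = Cumo::SFloat.new(1, 3, 32, 32).seq   # NCHW input batch
  w = Cumo::SFloat.new(8, 3, 3, 3).rand    # out_channels x in_channels x kH x kW filter
  y = x.conv(w, stride: 1, pad: 1)         # hypothetical cuDNN-backed convolution
  p y.shape                                # [1, 8, 32, 32] under these assumptions
end
```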
data/README.md CHANGED
@@ -22,6 +22,17 @@ export PATH="$CUDA_PATH/bin:$PATH"
  export LIBRARY_PATH="$CUDA_PATH/lib64:$CUDA_PATH/lib:$LIBRARY_PATH"
  ```

+ To use cuDNN features, install cuDNN and set your environment variables as follows:
+
+ ```
+ export CUDNN_ROOT_DIR=/path/to/cudnn
+ export CPATH=$CUDNN_ROOT_DIR/include:$CPATH
+ export LD_LIBRARY_PATH=$CUDNN_ROOT_DIR/lib64:$LD_LIBRARY_PATH
+ export LIBRARY_PATH=$CUDNN_ROOT_DIR/lib64:$LIBRARY_PATH
+ ```
+
+ FYI: I use [cudnnenv](https://github.com/unnonouno/cudnnenv) to install cuDNN under my home directory, e.g. `export CUDNN_ROOT_DIR=/home/sonots/.cudnn/active/cuda`.
+
  ## Installation

  Add the following line to your Gemfile:
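After setting those variables and reinstalling the gem, a quick way to confirm the extension picked up cuDNN is the `available?` query added in this release (a sketch; it simply reports whether the extension was compiled with cuDNN found):

```
require "cumo"

puts Cumo::CUDA::CUDNN.available?  # => true if the extension was built against cuDNN
```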
@@ -216,7 +227,7 @@ bundle exec gdb -x run.gdb --args ruby test/narray_test.rb

  You may put a breakpoint by calling `cumo_debug_breakpoint()` at C source codes.

- ### Run tests only a specific line
+ ### Run tests only a specific line
  `--location` option is available as:

  ```
data/cumo.gemspec CHANGED
@@ -2,7 +2,7 @@
  lib = File.expand_path("../lib", __FILE__)
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)

- cumo_version = File.read(File.join(__dir__, "ext/cumo/include/cumo.h")).match(/CUMO_VERSION "([\d.]+)"/)[1]
+ cumo_version = File.read(File.join(__dir__, "ext/cumo/include/cumo.h")).match(/CUMO_VERSION "([^"]+)"/)[1]
  numo_narray_version = File.read(File.join(__dir__, "numo-narray-version")).strip

  Gem::Specification.new do |spec|
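The relaxed capture group is what lets a pre-release version such as `0.3.0.pre1` be read from `cumo.h`: `[\d.]+` only admits digits and dots, so it cannot match a quoted pre-release string at all. A small illustration (the `#define` line is assumed here; the actual contents of `cumo.h` are not part of this diff):

```
header = '#define CUMO_VERSION "0.3.0.pre1"'

header.match(/CUMO_VERSION "([\d.]+)"/)    # => nil -- the "pre1" suffix breaks the match
header.match(/CUMO_VERSION "([^"]+)"/)[1]  # => "0.3.0.pre1"
```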
@@ -0,0 +1,80 @@
+ #include "cumo/cuda/cudnn.h"
+
+ #include <assert.h>
+ #include <ruby.h>
+ #include "cumo/narray.h"
+ #include "cumo/template.h"
+ #include "cumo/cuda/runtime.h"
+
+ VALUE cumo_cuda_eCUDNNError;
+ VALUE cumo_cuda_mCUDNN;
+ #define eCUDNNError cumo_cuda_eCUDNNError
+ #define mCUDNN cumo_cuda_mCUDNN
+
+ #ifdef CUDNN_FOUND
+
+ void
+ cumo_cuda_cudnn_check_status(cudnnStatus_t status)
+ {
+     if (status != CUDNN_STATUS_SUCCESS) {
+         rb_raise(cumo_cuda_eCUDNNError, "%s (error=%d)", cudnnGetErrorString(status), status);
+     }
+ }
+
+ // Lazily initialize cudnn handle, and cache it
+ cudnnHandle_t
+ cumo_cuda_cudnn_handle()
+ {
+     static cudnnHandle_t *handles = 0; // handle is never destroyed
+     int device;
+     if (handles == 0) {
+         int i;
+         int device_count = cumo_cuda_runtime_get_device_count();
+         handles = malloc(sizeof(cudnnHandle_t) * device_count);
+         for (i = 0; i < device_count; ++i) {
+             handles[i] = 0;
+         }
+     }
+     device = cumo_cuda_runtime_get_device();
+     if (handles[device] == 0) {
+         cudnnCreate(&handles[device]);
+     }
+     return handles[device];
+ }
+
+ #endif // CUDNN_FOUND
+
+ /*
+   Returns availability of cuDNN.
+
+   @return [Boolean] Returns true if cuDNN is available
+ */
+ static VALUE
+ rb_cudnn_available_p()
+ {
+ #if CUDNN_FOUND
+     return Qtrue;
+ #else
+     return Qfalse;
+ #endif
+ }
+
+ void
+ Init_cumo_cuda_cudnn(void)
+ {
+     VALUE mCumo = rb_define_module("Cumo");
+     VALUE mCUDA = rb_define_module_under(mCumo, "CUDA");
+
+     /*
+       Document-module: Cumo::CUDNN
+     */
+     mCUDNN = rb_define_module_under(mCUDA, "CUDNN");
+     rb_define_const(mCUDA, "Cudnn", mCUDNN); // alias
+     eCUDNNError = rb_define_class_under(mCUDA, "CUDNNError", rb_eStandardError);
+
+     rb_define_singleton_method(mCUDNN, "available?", RUBY_METHOD_FUNC(rb_cudnn_available_p), 0);
+     rb_define_const(mCUDNN, "CUDNN_POOLING_MAX", INT2NUM(CUDNN_POOLING_MAX));
+     rb_define_const(mCUDNN, "CUDNN_POOLING_MAX_DETERMINISTIC", INT2NUM(CUDNN_POOLING_MAX_DETERMINISTIC));
+     rb_define_const(mCUDNN, "CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING", INT2NUM(CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING));
+     rb_define_const(mCUDNN, "CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING", INT2NUM(CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING));
+ }
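For orientation, the Ruby-visible surface registered by `Init_cumo_cuda_cudnn` above is small: the `Cumo::CUDA::CUDNN` module (aliased as `Cumo::CUDA::Cudnn`), the `Cumo::CUDA::CUDNNError` exception raised by `cumo_cuda_cudnn_check_status`, the `available?` query, and four pooling-mode constants. A sketch, assuming the extension was built with cuDNN:

```
require "cumo"

p Cumo::CUDA::CUDNN.available?
p Cumo::CUDA::CUDNN::CUDNN_POOLING_MAX  # integer value of the cuDNN enum
# Failing cuDNN calls surface as Cumo::CUDA::CUDNNError (a StandardError subclass).
```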
@@ -0,0 +1,572 @@
+ #ifdef CUDNN_FOUND
+
+ #include "cumo/cuda/cudnn.h"
+
+ #include <assert.h>
+ #include <ruby.h>
+ #include <cudnn.h>
+ #include "cumo/narray.h"
+ #include "cumo/template.h"
+ #include "cumo/cuda/runtime.h"
+ #include "cumo/cuda/memory_pool.h"
+
+ #include <unordered_map>
+
+ #if defined(__cplusplus)
+ extern "C" {
+ #if 0
+ } /* satisfy cc-mode */
+ #endif
+ #endif
+
+ // cover_all=true is not supported
+ size_t
+ cumo_cuda_cudnn_GetConvOutDim(
+         size_t in_dim,
+         size_t kernel_size,
+         size_t stride,
+         size_t pad) {
+     int64_t numerator;
+     assert(stride > 0);
+     // if (cover_all) {
+     //     numerator = in_dim + pad * 2 - kernel_size + stride - 1;
+     // } else {
+     numerator = in_dim + pad * 2 - kernel_size;
+     // }
+     if (numerator < 0) {
+         rb_raise(rb_eRuntimeError, "Output size should be positive.");
+     }
+     return (size_t)(numerator / stride + 1);
+ }
+
+ // cover_all=true is not supported
+ size_t
+ cumo_cuda_cudnn_GetConvTransposeOutDim(
+         size_t in_dim,
+         size_t kernel_size,
+         size_t stride,
+         size_t pad) {
+     // if (cover_all) {
+     //     return stride * (in_dim - 1) + kernel_size - stride + 1 - 2 * pad;
+     // }
+     int64_t out_size = stride * (in_dim - 1) + kernel_size - 2 * pad;
+     if (out_size < 0) {
+         rb_raise(rb_eRuntimeError, "Output size should be positive.");
+     }
+     return (size_t)out_size;
+ }
+
+ cudnnStatus_t
+ cumo_cuda_cudnn_CreateTensorDescriptor(
+         cudnnTensorDescriptor_t *desc,
+         VALUE a, cudnnDataType_t cudnn_dtype) {
+     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+     cumo_narray_t *na;
+     CumoGetNArray(a, na);
+     int ndim = (int)(na->ndim);
+     size_t *shape = na->shape;
+
+     assert(cumo_na_check_contiguous(a) == Qtrue);
+     status = cudnnCreateTensorDescriptor(desc);
+     if (status != CUDNN_STATUS_SUCCESS) return status;
+
+     if (ndim == 4) {
+         status = cudnnSetTensor4dDescriptor(
+                 *desc, CUDNN_TENSOR_NCHW, cudnn_dtype, shape[0], shape[1], shape[2], shape[3]);
+     }
+     else {
+         int int_shape[CUMO_NA_MAX_DIMENSION];
+         for (int idim = 0; idim < ndim; ++idim) {
+             int_shape[idim] = (int)(shape[idim]);
+         }
+         int int_strides[CUMO_NA_MAX_DIMENSION]; // strides divided by item size
+         int stride = 1;
+         for (int idim = ndim - 1; idim >= 0; --idim) {
+             int_strides[idim] = stride;
+             stride *= int_shape[idim];
+         }
+         status = cudnnSetTensorNdDescriptor(*desc, cudnn_dtype, ndim, int_shape, int_strides);
+     }
+     return status;
+ }
+
+ cudnnStatus_t
+ cumo_cuda_cudnn_CreateFilterDescriptor(
+         cudnnFilterDescriptor_t *desc,
+         VALUE a,
+         cudnnDataType_t cudnn_dtype) {
+     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+     cumo_narray_t *na;
+     int ndim;
+     size_t *shape;
+
+     CumoGetNArray(a, na);
+     ndim = (int)(na->ndim);
+     shape = na->shape;
+
+     assert(cumo_na_check_contiguous(a) == Qtrue);
+     status = cudnnCreateFilterDescriptor(desc);
+     if (status != CUDNN_STATUS_SUCCESS) return status;
+
+     if (ndim == 4) {
+         status = cudnnSetFilter4dDescriptor(
+                 *desc, cudnn_dtype, CUDNN_TENSOR_NCHW, shape[0], shape[1], shape[2], shape[3]);
+     } else {
+         int int_shape[CUMO_NA_MAX_DIMENSION];
+         for (int idim = 0; idim < ndim; ++idim) {
+             int_shape[idim] = (int)(shape[idim]);
+         }
+         status = cudnnSetFilterNdDescriptor(*desc, cudnn_dtype, CUDNN_TENSOR_NCHW, ndim, int_shape);
+     }
+
+     return status;
+ }
+
+ cudnnStatus_t
+ cumo_cuda_cudnn_CreateConvolutionDescriptor(
+         cudnnConvolutionDescriptor_t *desc,
+         size_t ndim,
+         int* int_stride,
+         int* int_pad,
+         cudnnDataType_t cudnn_dtype) {
+     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+     int int_dilation[CUMO_NA_MAX_DIMENSION];
+     for (size_t idim = 0; idim < ndim; ++idim) {
+         int_dilation[idim] = 1;
+     }
+
+     status = cudnnCreateConvolutionDescriptor(desc);
+     if (status != CUDNN_STATUS_SUCCESS) return status;
+
+     if (ndim == 2) {
+         status = cudnnSetConvolution2dDescriptor(
+                 *desc,
+                 int_pad[0],
+                 int_pad[1],
+                 int_stride[0],
+                 int_stride[1],
+                 int_dilation[0],
+                 int_dilation[1],
+                 CUDNN_CROSS_CORRELATION,
+                 cudnn_dtype);
+     } else {
+         status = cudnnSetConvolutionNdDescriptor(
+                 *desc,
+                 ndim,
+                 int_pad,
+                 int_stride,
+                 int_dilation,
+                 CUDNN_CROSS_CORRELATION,
+                 cudnn_dtype);
+     }
+
+     return status;
+ }
+
+ cudnnStatus_t
+ cumo_cuda_cudnn_CreatePoolingDescriptor(
+         cudnnPoolingDescriptor_t *desc,
+         cudnnPoolingMode_t mode,
+         size_t ndim,
+         int* int_kernel_size,
+         int* int_stride,
+         int* int_pad) {
+     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+
+     status = cudnnCreatePoolingDescriptor(desc);
+     if (status != CUDNN_STATUS_SUCCESS) return status;
+
+     if (ndim == 2) {
+         status = cudnnSetPooling2dDescriptor(
+                 *desc,
+                 mode,
+                 CUDNN_NOT_PROPAGATE_NAN,
+                 int_kernel_size[0],
+                 int_kernel_size[1],
+                 int_pad[0],
+                 int_pad[1],
+                 int_stride[0],
+                 int_stride[1]);
+     } else {
+         status = cudnnSetPoolingNdDescriptor(
+                 *desc,
+                 mode,
+                 CUDNN_NOT_PROPAGATE_NAN,
+                 ndim,
+                 int_kernel_size,
+                 int_pad,
+                 int_stride);
+     }
+
+     return status;
+ }
+
+ // Borrowed from boost::hash_combine
+ //
+ // TODO(sonots): hash combine in 64bit
+ static void HashCombine(std::size_t& seed, std::size_t hash_value) {
+     seed ^= hash_value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+ }
+
+ // Partially Borrowed from ChainerX
+ struct AlgoCacheKey {
+     size_t ndim;  // # of spatial dimensions
+     size_t x_shape[CUMO_NA_MAX_DIMENSION];
+     size_t w_shape[CUMO_NA_MAX_DIMENSION];
+     size_t y_shape[CUMO_NA_MAX_DIMENSION];
+     size_t pad[CUMO_NA_MAX_DIMENSION];
+     size_t stride[CUMO_NA_MAX_DIMENSION];
+     cudnnDataType_t dtype;
+     size_t max_workspace_size;
+
+     bool operator==(const AlgoCacheKey& other) const {
+         if (ndim != other.ndim) return false;
+         if (dtype != other.dtype) return false;
+         if (max_workspace_size != other.max_workspace_size) return false;
+         for (size_t idim = 0; idim < ndim + 2; ++idim) {
+             if (x_shape[idim] != other.x_shape[idim]) return false;
+         }
+         for (size_t idim = 0; idim < ndim + 2; ++idim) {
+             if (w_shape[idim] != other.w_shape[idim]) return false;
+         }
+         for (size_t idim = 0; idim < ndim + 2; ++idim) {
+             if (y_shape[idim] != other.y_shape[idim]) return false;
+         }
+         for (size_t idim = 0; idim < ndim; ++idim) {
+             if (pad[idim] != other.pad[idim]) return false;
+         }
+         for (size_t idim = 0; idim < ndim; ++idim) {
+             if (stride[idim] != other.stride[idim]) return false;
+         }
+         return true;
+     }
+
+     bool operator!=(const AlgoCacheKey& other) const { return !operator==(other); }
+ };
+
+ struct AlgoCacheKeyHash {
+     using result_type = std::size_t;
+     std::size_t operator()(const AlgoCacheKey& key) const {
+         std::size_t seed = 0;
+         size_t ndim = key.ndim;
+         HashCombine(seed, std::hash<size_t>()(key.ndim));
+         for (size_t idim = 0; idim < ndim + 2; ++idim) {
+             HashCombine(seed, std::hash<size_t>()(key.x_shape[idim]));
+         }
+         for (size_t idim = 0; idim < ndim + 2; ++idim) {
+             HashCombine(seed, std::hash<size_t>()(key.w_shape[idim]));
+         }
+         for (size_t idim = 0; idim < ndim + 2; ++idim) {
+             HashCombine(seed, std::hash<size_t>()(key.y_shape[idim]));
+         }
+         for (size_t idim = 0; idim < ndim; ++idim) {
+             HashCombine(seed, std::hash<size_t>()(key.pad[idim]));
+         }
+         for (size_t idim = 0; idim < ndim; ++idim) {
+             HashCombine(seed, std::hash<size_t>()(key.stride[idim]));
+         }
+         HashCombine(seed, std::hash<int>()((int)(key.dtype)));
+         HashCombine(seed, std::hash<size_t>()(key.max_workspace_size));
+         return seed;
+     }
+ };
+
+ using FwdAlgoCacheMap = std::unordered_map<AlgoCacheKey, std::pair<cudnnConvolutionFwdAlgo_t, size_t>, AlgoCacheKeyHash>;
+ using BwdDataAlgoCacheMap = std::unordered_map<AlgoCacheKey, std::pair<cudnnConvolutionBwdDataAlgo_t, size_t>, AlgoCacheKeyHash>;
+ using BwdFilterAlgoCacheMap = std::unordered_map<AlgoCacheKey, std::pair<cudnnConvolutionBwdFilterAlgo_t, size_t>, AlgoCacheKeyHash>;
+
+ // TODO: Another cache for another device
+ static FwdAlgoCacheMap fwd_algo_cache_map_{};
+ static BwdDataAlgoCacheMap bwd_data_algo_cache_map_{};
+ static BwdFilterAlgoCacheMap bwd_filter_algo_cache_map_{};
+
+ cudnnStatus_t
+ cumo_cuda_cudnn_FindConvolutionForwardAlgorithm(
+         cudnnConvolutionFwdAlgoPerf_t *perf_result,
+         cudnnHandle_t handle,
+         cudnnTensorDescriptor_t x_desc,
+         VALUE x,
+         cudnnFilterDescriptor_t w_desc,
+         VALUE w,
+         cudnnConvolutionDescriptor_t conv_desc,
+         cudnnTensorDescriptor_t y_desc,
+         VALUE y,
+         size_t max_workspace_size,
+         int* int_stride,
+         int* int_pad,
+         size_t ndim,
+         cudnnDataType_t cudnn_dtype)
+ {
+     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+     cumo_narray_t *nx, *nw, *ny;
+     CumoGetNArray(x, nx);
+     CumoGetNArray(w, nw);
+     CumoGetNArray(y, ny);
+
+     auto key = AlgoCacheKey{};
+     key.ndim = ndim;
+     for (size_t idim = 0; idim < ndim + 2; ++idim) {
+         key.x_shape[idim] = nx->shape[idim];
+         key.w_shape[idim] = nw->shape[idim];
+         key.y_shape[idim] = ny->shape[idim];
+     }
+     for (size_t idim = 0; idim < ndim; ++idim) {
+         key.pad[idim] = int_pad[idim];
+         key.stride[idim] = int_stride[idim];
+     }
+     key.dtype = cudnn_dtype;
+     key.max_workspace_size = max_workspace_size;
+
+     auto& algo_cache_map = fwd_algo_cache_map_;
+     // TODO: thread-safe
+     auto it = algo_cache_map.find(key);
+     if (it != algo_cache_map.end()) {
+         auto pair = it->second;
+         perf_result->algo = pair.first;
+         perf_result->memory = pair.second;
+         return CUDNN_STATUS_SUCCESS;
+     }
+
+     char* x_ptr = cumo_na_get_offset_pointer_for_read(x);
+     char* w_ptr = cumo_na_get_offset_pointer_for_read(w);
+     char* y_ptr = cumo_na_get_offset_pointer_for_read(y);
+
+     char* workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+     int returned_algo_count{};
+     status = cudnnFindConvolutionForwardAlgorithmEx(
+             handle,
+             x_desc,
+             (void*)x_ptr,
+             w_desc,
+             (void*)w_ptr,
+             conv_desc,
+             y_desc,
+             (void*)y_ptr,
+             1, // requested algo count,
+             &returned_algo_count,
+             perf_result,
+             (void*)workspace,
+             max_workspace_size);
+     cumo_cuda_runtime_free(workspace);
+     if (status != CUDNN_STATUS_SUCCESS) return status;
+     assert(returned_algo_count == 1);
+
+     // TODO: thread-safe
+     algo_cache_map[key] = {perf_result->algo, perf_result->memory};
+     return status;
+ }
+
+ cudnnStatus_t
+ cumo_cuda_cudnn_FindConvolutionBackwardDataAlgorithm(
+         cudnnConvolutionBwdDataAlgoPerf_t *perf_result,
+         cudnnHandle_t handle,
+         cudnnFilterDescriptor_t w_desc,
+         VALUE w,
+         cudnnTensorDescriptor_t x_desc,
+         VALUE x,
+         cudnnConvolutionDescriptor_t conv_desc,
+         cudnnTensorDescriptor_t y_desc,
+         VALUE y,
+         size_t max_workspace_size,
+         int* int_stride,
+         int* int_pad,
+         size_t ndim,
+         cudnnDataType_t cudnn_dtype)
+ {
+     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+     cumo_narray_t *nx, *nw, *ny;
+     CumoGetNArray(x, nx);
+     CumoGetNArray(w, nw);
+     CumoGetNArray(y, ny);
+
+     auto key = AlgoCacheKey{};
+     key.ndim = ndim;
+     for (size_t idim = 0; idim < ndim + 2; ++idim) {
+         key.x_shape[idim] = nx->shape[idim];
+         key.w_shape[idim] = nw->shape[idim];
+         key.y_shape[idim] = ny->shape[idim];
+     }
+     for (size_t idim = 0; idim < ndim; ++idim) {
+         key.pad[idim] = int_pad[idim];
+         key.stride[idim] = int_stride[idim];
+     }
+     key.dtype = cudnn_dtype;
+     key.max_workspace_size = max_workspace_size;
+
+     auto& algo_cache_map = bwd_data_algo_cache_map_;
+     // TODO: thread-safe
+     auto it = algo_cache_map.find(key);
+     if (it != algo_cache_map.end()) {
+         auto pair = it->second;
+         perf_result->algo = pair.first;
+         perf_result->memory = pair.second;
+         return CUDNN_STATUS_SUCCESS;
+     }
+
+     char* x_ptr = cumo_na_get_offset_pointer_for_read(x);
+     char* w_ptr = cumo_na_get_offset_pointer_for_read(w);
+     char* y_ptr = cumo_na_get_offset_pointer_for_read(y);
+
+     char* workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+     int returned_algo_count{};
+     status = cudnnFindConvolutionBackwardDataAlgorithmEx(
+             handle,
+             w_desc,
+             (void*)w_ptr,
+             x_desc,
+             (void*)x_ptr,
+             conv_desc,
+             y_desc,
+             (void*)y_ptr,
+             1, // requested algo count,
+             &returned_algo_count,
+             perf_result,
+             (void*)workspace,
+             max_workspace_size);
+     cumo_cuda_runtime_free(workspace);
+     if (status != CUDNN_STATUS_SUCCESS) return status;
+     assert(returned_algo_count == 1);
+
+     // TODO: thread-safe
+     algo_cache_map[key] = {perf_result->algo, perf_result->memory};
+     return status;
+ }
+
+ cudnnStatus_t
+ cumo_cuda_cudnn_FindConvolutionBackwardFilterAlgorithm(
+         cudnnConvolutionBwdFilterAlgoPerf_t *perf_result,
+         cudnnHandle_t handle,
+         cudnnTensorDescriptor_t x_desc,
+         VALUE x,
+         cudnnTensorDescriptor_t gy_desc,
+         VALUE gy,
+         cudnnConvolutionDescriptor_t conv_desc,
+         cudnnFilterDescriptor_t gw_desc,
+         VALUE gw,
+         size_t max_workspace_size,
+         int* int_stride,
+         int* int_pad,
+         size_t ndim,
+         cudnnDataType_t cudnn_dtype)
+ {
+     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+     cumo_narray_t *nx, *ngy, *ngw;
+     CumoGetNArray(x, nx);
+     CumoGetNArray(gy, ngy);
+     CumoGetNArray(gw, ngw);
+
+     auto key = AlgoCacheKey{};
+     key.ndim = ndim;
+     for (size_t idim = 0; idim < ndim + 2; ++idim) {
+         key.x_shape[idim] = nx->shape[idim];
+         key.w_shape[idim] = ngw->shape[idim];
+         key.y_shape[idim] = ngy->shape[idim];
+     }
+     for (size_t idim = 0; idim < ndim; ++idim) {
+         key.pad[idim] = int_pad[idim];
+         key.stride[idim] = int_stride[idim];
+     }
+     key.dtype = cudnn_dtype;
+     key.max_workspace_size = max_workspace_size;
+
+     auto& algo_cache_map = bwd_filter_algo_cache_map_;
+     // TODO: thread-safe
+     auto it = algo_cache_map.find(key);
+     if (it != algo_cache_map.end()) {
+         auto pair = it->second;
+         perf_result->algo = pair.first;
+         perf_result->memory = pair.second;
+         return CUDNN_STATUS_SUCCESS;
+     }
+
+     char* x_ptr = cumo_na_get_offset_pointer_for_read(x);
+     char* gy_ptr = cumo_na_get_offset_pointer_for_read(gy);
+     char* gw_ptr = cumo_na_get_offset_pointer_for_read(gw);
+
+     char* workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+     int returned_algo_count{};
+     status = cudnnFindConvolutionBackwardFilterAlgorithmEx(
+             handle,
+             x_desc,
+             (void*)x_ptr,
+             gy_desc,
+             (void*)gy_ptr,
+             conv_desc,
+             gw_desc,
+             (void*)gw_ptr,
+             1, // requested algo count,
+             &returned_algo_count,
+             perf_result,
+             (void*)workspace,
+             max_workspace_size);
+     cumo_cuda_runtime_free(workspace);
+     if (status != CUDNN_STATUS_SUCCESS) return status;
+     assert(returned_algo_count == 1);
+
+     // TODO: thread-safe
+     algo_cache_map[key] = {perf_result->algo, perf_result->memory};
+     return status;
+ }
+
+ // TODO(sonots): Support other than 4, 5 dimensional arrays by reshaping into 4-dimensional arrays as Chainer does.
+ cudnnBatchNormMode_t
+ cumo_cuda_cudnn_GetBatchNormMode(size_t ndim, int* axis) {
+     if (ndim == 1 && axis[0] == 0) { // (1, channels, (depth, )height, width)
+         return CUDNN_BATCHNORM_PER_ACTIVATION;
+     }
+     if ((ndim == 3 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3) ||
+         (ndim == 4 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3 && axis[3] == 4)) { // (1, channels, (1, )1, 1)
+         // TODO: Consider CUDNN_BATCHNORM_SPATIAL_PERSISTENT if we can afford to check for overflow, with or without blocking.
+         return CUDNN_BATCHNORM_SPATIAL;
+     }
+     rb_raise(rb_eRuntimeError, "Invalid axis for BatchNorm using cuDNN. Expected 1, 3 or 4 dimensions.");
+ }
+
+ cudnnStatus_t
+ cumo_cuda_cudnn_CreateBNTensorDescriptor(
+         cudnnTensorDescriptor_t *desc,
+         cudnnTensorDescriptor_t x_desc,
+         cudnnBatchNormMode_t mode)
+ {
+     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+     status = cudnnCreateTensorDescriptor(desc);
+     if (status != CUDNN_STATUS_SUCCESS) return status;
+
+     status = cudnnDeriveBNTensorDescriptor(*desc, x_desc, mode);
+     return status;
+ }
+
+ size_t
+ cumo_cuda_cudnn_ReduceShape(
+         size_t *reduced_shape,
+         size_t shape_ndim,
+         size_t *shape,
+         size_t axes_ndim,
+         int *axes,
+         char keepdims) {
+     assert(shape_ndim >= axes_ndim);
+     size_t i_axis = 0;
+     size_t i_shape = 0;
+     for (size_t i = 0; i < shape_ndim; ++i) {
+         if (i_axis < axes_ndim && i == (size_t)axes[i_axis]) {
+             ++i_axis;
+             if (keepdims) {
+                 reduced_shape[i_shape++] = 1;
+             }
+         } else {
+             reduced_shape[i_shape++] = shape[i];
+         }
+     }
+     assert(i_axis == axes_ndim);
+     assert(i_shape == shape_ndim - static_cast<int8_t>(!keepdims) * axes_ndim);
+     return i_shape;
+ }
+
+ #if defined(__cplusplus)
+ #if 0
+ { /* satisfy cc-mode */
+ #endif
+ } /* extern "C" { */
+ #endif
+
+ #endif // CUDNN_FOUND
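As a sanity check of `cumo_cuda_cudnn_GetConvOutDim` above: with cover_all unsupported, the output size reduces to the usual `(in + 2*pad - kernel) / stride + 1`. A small Ruby illustration of the same arithmetic:

```
def conv_out_dim(in_dim, kernel, stride, pad)
  (in_dim + 2 * pad - kernel) / stride + 1
end

conv_out_dim(32, 3, 1, 1)  # => 32 ("same" padding keeps the spatial size)
conv_out_dim(32, 2, 2, 0)  # => 16 (stride-2 downsampling)
```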