cumo 0.2.5 → 0.3.0.pre1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1b28beaea182d622d304bcb3153e56aa3280993ec079aea44c00b915d1e92b77
-  data.tar.gz: 26fc0e1942a444e5f9cb4641b3e36f9985593a10de26188f0a4142e72314d82a
+  metadata.gz: 6fbd39b063f8c40636b699f956ab5ebf4905a58013fd249819845f3dc525f77a
+  data.tar.gz: 66c369f01877aa42e73dba6bcaf5a499f52084024171c7f1b5e561c1da08f7e0
 SHA512:
-  metadata.gz: a678cb7965fbbc9febf6b5f2f557f8be34f28c051fc0437a87506d3a067a34778a73b75dbeb56da14fd538062a8454355efd06bb686056db5b4df7cab9c04e86
-  data.tar.gz: 30ce98cae4e84ee7e9e73eae3ad76bcaca1e636462301d1afe1aa50e1f50633ed1b16756b90aaeba1a3e0870179d7e2dbee41696b175f0a454efee93e5f89591
+  metadata.gz: 6d98b07a55ead442c4edd2e2e3c648d58d26a1343938eee60ff1cf8ee3bfd9b0539c5c650e349eef5d116ef0f5cd095f077a9cd2586cddcd01023e8e7cdb225e
+  data.tar.gz: 22012ddfb97cde8ff78324c599351bd5aa16bac5747208c573def3b0f2d2f47f0e1d55dc83393685a0e54a006be08bab2c15631ed26c940ecd20cd07efb4a86d
data/CHANGELOG.md CHANGED
@@ -1,4 +1,17 @@
-# 0.2.5 (2019-03-04)
+# 0.3.0.pre1 (2019-04-09)
+
+Enhancements:
+
+* Support cuDNN
+  * conv (cudnnConvolution)
+  * conv_transpose (cudnnConvolutionBackwardData)
+  * conv_grad_w (cudnnConvolutionBackwardFilter)
+  * batch_norm (cudnnBatchNormalization)
+  * batch_norm_backward (cudnnBatchNormalizationBackward)
+  * avg_pool and max_pool (cudnnPoolingForward)
+  * avg_pool_backward and max_pool_backward (cudnnPoolingBackward)
+
+# 0.2.5 (2019-03-04)
 
 Enhancements:
 
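The operations listed in the new CHANGELOG entry are exposed as NArray methods backed by the cuDNN calls named in parentheses. A minimal, hypothetical Ruby sketch of how a call might look — the receiver class (`Cumo::SFloat`) and the keyword arguments (`stride:`, `pad:`) are assumptions for illustration; only the method names come from the entry above:

```ruby
require "cumo"

# Hypothetical call shapes; only the method names (conv, max_pool, ...) are
# taken from the CHANGELOG entry above.
x = Cumo::SFloat.new(1, 3, 28, 28).rand   # NCHW input
w = Cumo::SFloat.new(16, 3, 3, 3).rand    # filters: out_ch x in_ch x kh x kw
y = x.conv(w, stride: 1, pad: 1)          # cudnnConvolution under the hood
z = y.max_pool(2, stride: 2)              # cudnnPoolingForward under the hood
```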
data/README.md CHANGED
@@ -22,6 +22,17 @@ export PATH="$CUDA_PATH/bin:$PATH"
 export LIBRARY_PATH="$CUDA_PATH/lib64:$CUDA_PATH/lib:$LIBRARY_PATH"
 ```
 
+To use cuDNN features, install cuDNN and set your environment variables as follows:
+
+```
+export CUDNN_ROOT_DIR=/path/to/cudnn
+export CPATH=$CUDNN_ROOT_DIR/include:$CPATH
+export LD_LIBRARY_PATH=$CUDNN_ROOT_DIR/lib64:$LD_LIBRARY_PATH
+export LIBRARY_PATH=$CUDNN_ROOT_DIR/lib64:$LIBRARY_PATH
+```
+
+FYI: I use [cudnnenv](https://github.com/unnonouno/cudnnenv) to install cudnn under my home directory like `export CUDNN_ROOT_DIR=/home/sonots/.cudnn/active/cuda`.
+
 ## Installation
 
 Add the following line to your Gemfile:
@@ -216,7 +227,7 @@ bundle exec gdb -x run.gdb --args ruby test/narray_test.rb
 
 You may put a breakpoint by calling `cumo_debug_breakpoint()` at C source codes.
 
-### Run tests only a specific line
+### Run tests only a specific line
 
 `--location` option is available as:
 
 ```
data/cumo.gemspec CHANGED
@@ -2,7 +2,7 @@
 lib = File.expand_path("../lib", __FILE__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 
-cumo_version = File.read(File.join(__dir__, "ext/cumo/include/cumo.h")).match(/CUMO_VERSION "([\d.]+)"/)[1]
+cumo_version = File.read(File.join(__dir__, "ext/cumo/include/cumo.h")).match(/CUMO_VERSION "([^"]+)"/)[1]
 numo_narray_version = File.read(File.join(__dir__, "numo-narray-version")).strip
 
 Gem::Specification.new do |spec|
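The gemspec hunk above widens the version-extraction regexp so that pre-release versions such as `0.3.0.pre1` can be read out of `ext/cumo/include/cumo.h`: `[\d.]+` stops at the first letter, so the old pattern fails to match at all. A quick Ruby illustration (the exact `#define` line is an assumed stand-in for what cumo.h contains):

```ruby
header = '#define CUMO_VERSION "0.3.0.pre1"'  # assumed shape of the line in cumo.h

header.match(/CUMO_VERSION "([\d.]+)"/)       # => nil ("pre1" breaks the match,
                                              #    so [1] would raise NoMethodError)
header.match(/CUMO_VERSION "([^"]+)"/)[1]     # => "0.3.0.pre1"
```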
@@ -0,0 +1,80 @@
+#include "cumo/cuda/cudnn.h"
+
+#include <assert.h>
+#include <ruby.h>
+#include "cumo/narray.h"
+#include "cumo/template.h"
+#include "cumo/cuda/runtime.h"
+
+VALUE cumo_cuda_eCUDNNError;
+VALUE cumo_cuda_mCUDNN;
+#define eCUDNNError cumo_cuda_eCUDNNError
+#define mCUDNN cumo_cuda_mCUDNN
+
+#ifdef CUDNN_FOUND
+
+void
+cumo_cuda_cudnn_check_status(cudnnStatus_t status)
+{
+    if (status != CUDNN_STATUS_SUCCESS) {
+        rb_raise(cumo_cuda_eCUDNNError, "%s (error=%d)", cudnnGetErrorString(status), status);
+    }
+}
+
+// Lazily initialize cudnn handle, and cache it
+cudnnHandle_t
+cumo_cuda_cudnn_handle()
+{
+    static cudnnHandle_t *handles = 0; // handle is never destroyed
+    int device;
+    if (handles == 0) {
+        int i;
+        int device_count = cumo_cuda_runtime_get_device_count();
+        handles = malloc(sizeof(cudnnHandle_t) * device_count);
+        for (i = 0; i < device_count; ++i) {
+            handles[i] = 0;
+        }
+    }
+    device = cumo_cuda_runtime_get_device();
+    if (handles[device] == 0) {
+        cudnnCreate(&handles[device]);
+    }
+    return handles[device];
+}
+
+#endif // CUDNN_FOUND
+
+/*
+  Returns availability of cuDNN.
+
+  @return [Boolean] Returns true if cuDNN is available
+*/
+static VALUE
+rb_cudnn_available_p()
+{
+#if CUDNN_FOUND
+    return Qtrue;
+#else
+    return Qfalse;
+#endif
+}
+
+void
+Init_cumo_cuda_cudnn(void)
+{
+    VALUE mCumo = rb_define_module("Cumo");
+    VALUE mCUDA = rb_define_module_under(mCumo, "CUDA");
+
+    /*
+      Document-module: Cumo::CUDNN
+    */
+    mCUDNN = rb_define_module_under(mCUDA, "CUDNN");
+    rb_define_const(mCUDA, "Cudnn", mCUDNN); // alias
+    eCUDNNError = rb_define_class_under(mCUDA, "CUDNNError", rb_eStandardError);
+
+    rb_define_singleton_method(mCUDNN, "available?", RUBY_METHOD_FUNC(rb_cudnn_available_p), 0);
+    rb_define_const(mCUDNN, "CUDNN_POOLING_MAX", INT2NUM(CUDNN_POOLING_MAX));
+    rb_define_const(mCUDNN, "CUDNN_POOLING_MAX_DETERMINISTIC", INT2NUM(CUDNN_POOLING_MAX_DETERMINISTIC));
+    rb_define_const(mCUDNN, "CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING", INT2NUM(CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING));
+    rb_define_const(mCUDNN, "CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING", INT2NUM(CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING));
+}
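For reference, a short Ruby sketch of the surface this new C file defines — the `Cumo::CUDA::CUDNN` module, its `available?` query, the `Cudnn` alias constant, and the exported pooling-mode constants; only `require "cumo"` as the entry point is an assumption:

```ruby
require "cumo"

if Cumo::CUDA::CUDNN.available?
  # Pooling-mode constants registered in Init_cumo_cuda_cudnn above.
  p Cumo::CUDA::CUDNN::CUDNN_POOLING_MAX
  p Cumo::CUDA::Cudnn::CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING  # via the Cudnn alias
else
  warn "cumo was built without cuDNN support (Cumo::CUDA::CUDNNError is still defined)"
end
```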
@@ -0,0 +1,572 @@
+#ifdef CUDNN_FOUND
+
+#include "cumo/cuda/cudnn.h"
+
+#include <assert.h>
+#include <ruby.h>
+#include <cudnn.h>
+#include "cumo/narray.h"
+#include "cumo/template.h"
+#include "cumo/cuda/runtime.h"
+#include "cumo/cuda/memory_pool.h"
+
+#include <unordered_map>
+
+#if defined(__cplusplus)
+extern "C" {
+#if 0
+} /* satisfy cc-mode */
+#endif
+#endif
+
+// cover_all=true is not supported
+size_t
+cumo_cuda_cudnn_GetConvOutDim(
+        size_t in_dim,
+        size_t kernel_size,
+        size_t stride,
+        size_t pad) {
+    int64_t numerator;
+    assert(stride > 0);
+    // if (cover_all) {
+    //     numerator = in_dim + pad * 2 - kernel_size + stride - 1;
+    // } else {
+    numerator = in_dim + pad * 2 - kernel_size;
+    // }
+    if (numerator < 0) {
+        rb_raise(rb_eRuntimeError, "Output size should be positive.");
+    }
+    return (size_t)(numerator / stride + 1);
+}
+
+// cover_all=true is not supported
+size_t
+cumo_cuda_cudnn_GetConvTransposeOutDim(
+        size_t in_dim,
+        size_t kernel_size,
+        size_t stride,
+        size_t pad) {
+    // if (cover_all) {
+    //     return stride * (in_dim - 1) + kernel_size - stride + 1 - 2 * pad;
+    // }
+    int64_t out_size = stride * (in_dim - 1) + kernel_size - 2 * pad;
+    if (out_size < 0) {
+        rb_raise(rb_eRuntimeError, "Output size should be positive.");
+    }
+    return (size_t)out_size;
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_CreateTensorDescriptor(
+        cudnnTensorDescriptor_t *desc,
+        VALUE a, cudnnDataType_t cudnn_dtype) {
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    cumo_narray_t *na;
+    CumoGetNArray(a, na);
+    int ndim = (int)(na->ndim);
+    size_t *shape = na->shape;
+
+    assert(cumo_na_check_contiguous(a) == Qtrue);
+    status = cudnnCreateTensorDescriptor(desc);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+
+    if (ndim == 4) {
+        status = cudnnSetTensor4dDescriptor(
+                *desc, CUDNN_TENSOR_NCHW, cudnn_dtype, shape[0], shape[1], shape[2], shape[3]);
+    }
+    else {
+        int int_shape[CUMO_NA_MAX_DIMENSION];
+        for (int idim = 0; idim < ndim; ++idim) {
+            int_shape[idim] = (int)(shape[idim]);
+        }
+        int int_strides[CUMO_NA_MAX_DIMENSION]; // strides divided by item size
+        int stride = 1;
+        for (int idim = ndim - 1; idim >= 0; --idim) {
+            int_strides[idim] = stride;
+            stride *= int_shape[idim];
+        }
+        status = cudnnSetTensorNdDescriptor(*desc, cudnn_dtype, ndim, int_shape, int_strides);
+    }
+    return status;
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_CreateFilterDescriptor(
+        cudnnFilterDescriptor_t *desc,
+        VALUE a,
+        cudnnDataType_t cudnn_dtype) {
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    cumo_narray_t *na;
+    int ndim;
+    size_t *shape;
+
+    CumoGetNArray(a, na);
+    ndim = (int)(na->ndim);
+    shape = na->shape;
+
+    assert(cumo_na_check_contiguous(a) == Qtrue);
+    status = cudnnCreateFilterDescriptor(desc);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+
+    if (ndim == 4) {
+        status = cudnnSetFilter4dDescriptor(
+                *desc, cudnn_dtype, CUDNN_TENSOR_NCHW, shape[0], shape[1], shape[2], shape[3]);
+    } else {
+        int int_shape[CUMO_NA_MAX_DIMENSION];
+        for (int idim = 0; idim < ndim; ++idim) {
+            int_shape[idim] = (int)(shape[idim]);
+        }
+        status = cudnnSetFilterNdDescriptor(*desc, cudnn_dtype, CUDNN_TENSOR_NCHW, ndim, int_shape);
+    }
+
+    return status;
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_CreateConvolutionDescriptor(
+        cudnnConvolutionDescriptor_t *desc,
+        size_t ndim,
+        int* int_stride,
+        int* int_pad,
+        cudnnDataType_t cudnn_dtype) {
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    int int_dilation[CUMO_NA_MAX_DIMENSION];
+    for (size_t idim = 0; idim < ndim; ++idim) {
+        int_dilation[idim] = 1;
+    }
+
+    status = cudnnCreateConvolutionDescriptor(desc);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+
+    if (ndim == 2) {
+        status = cudnnSetConvolution2dDescriptor(
+                *desc,
+                int_pad[0],
+                int_pad[1],
+                int_stride[0],
+                int_stride[1],
+                int_dilation[0],
+                int_dilation[1],
+                CUDNN_CROSS_CORRELATION,
+                cudnn_dtype);
+    } else {
+        status = cudnnSetConvolutionNdDescriptor(
+                *desc,
+                ndim,
+                int_pad,
+                int_stride,
+                int_dilation,
+                CUDNN_CROSS_CORRELATION,
+                cudnn_dtype);
+    }
+
+    return status;
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_CreatePoolingDescriptor(
+        cudnnPoolingDescriptor_t *desc,
+        cudnnPoolingMode_t mode,
+        size_t ndim,
+        int* int_kernel_size,
+        int* int_stride,
+        int* int_pad) {
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+
+    status = cudnnCreatePoolingDescriptor(desc);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+
+    if (ndim == 2) {
+        status = cudnnSetPooling2dDescriptor(
+                *desc,
+                mode,
+                CUDNN_NOT_PROPAGATE_NAN,
+                int_kernel_size[0],
+                int_kernel_size[1],
+                int_pad[0],
+                int_pad[1],
+                int_stride[0],
+                int_stride[1]);
+    } else {
+        status = cudnnSetPoolingNdDescriptor(
+                *desc,
+                mode,
+                CUDNN_NOT_PROPAGATE_NAN,
+                ndim,
+                int_kernel_size,
+                int_pad,
+                int_stride);
+    }
+
+    return status;
+}
+
+// Borrowed from boost::hash_combine
+//
+// TODO(sonots): hash combine in 64bit
+static void HashCombine(std::size_t& seed, std::size_t hash_value) {
+    seed ^= hash_value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+
+// Partially Borrowed from ChainerX
+struct AlgoCacheKey {
+    size_t ndim; // # of spatial dimensions
+    size_t x_shape[CUMO_NA_MAX_DIMENSION];
+    size_t w_shape[CUMO_NA_MAX_DIMENSION];
+    size_t y_shape[CUMO_NA_MAX_DIMENSION];
+    size_t pad[CUMO_NA_MAX_DIMENSION];
+    size_t stride[CUMO_NA_MAX_DIMENSION];
+    cudnnDataType_t dtype;
+    size_t max_workspace_size;
+
+    bool operator==(const AlgoCacheKey& other) const {
+        if (ndim != other.ndim) return false;
+        if (dtype != other.dtype) return false;
+        if (max_workspace_size != other.max_workspace_size) return false;
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            if (x_shape[idim] != other.x_shape[idim]) return false;
+        }
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            if (w_shape[idim] != other.w_shape[idim]) return false;
+        }
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            if (y_shape[idim] != other.y_shape[idim]) return false;
+        }
+        for (size_t idim = 0; idim < ndim; ++idim) {
+            if (pad[idim] != other.pad[idim]) return false;
+        }
+        for (size_t idim = 0; idim < ndim; ++idim) {
+            if (stride[idim] != other.stride[idim]) return false;
+        }
+        return true;
+    }
+
+    bool operator!=(const AlgoCacheKey& other) const { return !operator==(other); }
+};
+
+struct AlgoCacheKeyHash {
+    using result_type = std::size_t;
+    std::size_t operator()(const AlgoCacheKey& key) const {
+        std::size_t seed = 0;
+        size_t ndim = key.ndim;
+        HashCombine(seed, std::hash<size_t>()(key.ndim));
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            HashCombine(seed, std::hash<size_t>()(key.x_shape[idim]));
+        }
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            HashCombine(seed, std::hash<size_t>()(key.w_shape[idim]));
+        }
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            HashCombine(seed, std::hash<size_t>()(key.y_shape[idim]));
+        }
+        for (size_t idim = 0; idim < ndim; ++idim) {
+            HashCombine(seed, std::hash<size_t>()(key.pad[idim]));
+        }
+        for (size_t idim = 0; idim < ndim; ++idim) {
+            HashCombine(seed, std::hash<size_t>()(key.stride[idim]));
+        }
+        HashCombine(seed, std::hash<int>()((int)(key.dtype)));
+        HashCombine(seed, std::hash<size_t>()(key.max_workspace_size));
+        return seed;
+    }
+};
+
+using FwdAlgoCacheMap = std::unordered_map<AlgoCacheKey, std::pair<cudnnConvolutionFwdAlgo_t, size_t>, AlgoCacheKeyHash>;
+using BwdDataAlgoCacheMap = std::unordered_map<AlgoCacheKey, std::pair<cudnnConvolutionBwdDataAlgo_t, size_t>, AlgoCacheKeyHash>;
+using BwdFilterAlgoCacheMap = std::unordered_map<AlgoCacheKey, std::pair<cudnnConvolutionBwdFilterAlgo_t, size_t>, AlgoCacheKeyHash>;
+
+// TODO: Another cache for another device
+static FwdAlgoCacheMap fwd_algo_cache_map_{};
+static BwdDataAlgoCacheMap bwd_data_algo_cache_map_{};
+static BwdFilterAlgoCacheMap bwd_filter_algo_cache_map_{};
+
+cudnnStatus_t
+cumo_cuda_cudnn_FindConvolutionForwardAlgorithm(
+        cudnnConvolutionFwdAlgoPerf_t *perf_result,
+        cudnnHandle_t handle,
+        cudnnTensorDescriptor_t x_desc,
+        VALUE x,
+        cudnnFilterDescriptor_t w_desc,
+        VALUE w,
+        cudnnConvolutionDescriptor_t conv_desc,
+        cudnnTensorDescriptor_t y_desc,
+        VALUE y,
+        size_t max_workspace_size,
+        int* int_stride,
+        int* int_pad,
+        size_t ndim,
+        cudnnDataType_t cudnn_dtype)
+{
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    cumo_narray_t *nx, *nw, *ny;
+    CumoGetNArray(x, nx);
+    CumoGetNArray(w, nw);
+    CumoGetNArray(y, ny);
+
+    auto key = AlgoCacheKey{};
+    key.ndim = ndim;
+    for (size_t idim = 0; idim < ndim + 2; ++idim) {
+        key.x_shape[idim] = nx->shape[idim];
+        key.w_shape[idim] = nw->shape[idim];
+        key.y_shape[idim] = ny->shape[idim];
+    }
+    for (size_t idim = 0; idim < ndim; ++idim) {
+        key.pad[idim] = int_pad[idim];
+        key.stride[idim] = int_stride[idim];
+    }
+    key.dtype = cudnn_dtype;
+    key.max_workspace_size = max_workspace_size;
+
+    auto& algo_cache_map = fwd_algo_cache_map_;
+    // TODO: thread-safe
+    auto it = algo_cache_map.find(key);
+    if (it != algo_cache_map.end()) {
+        auto pair = it->second;
+        perf_result->algo = pair.first;
+        perf_result->memory = pair.second;
+        return CUDNN_STATUS_SUCCESS;
+    }
+
+    char* x_ptr = cumo_na_get_offset_pointer_for_read(x);
+    char* w_ptr = cumo_na_get_offset_pointer_for_read(w);
+    char* y_ptr = cumo_na_get_offset_pointer_for_read(y);
+
+    char* workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+    int returned_algo_count{};
+    status = cudnnFindConvolutionForwardAlgorithmEx(
+            handle,
+            x_desc,
+            (void*)x_ptr,
+            w_desc,
+            (void*)w_ptr,
+            conv_desc,
+            y_desc,
+            (void*)y_ptr,
+            1, // requested algo count
+            &returned_algo_count,
+            perf_result,
+            (void*)workspace,
+            max_workspace_size);
+    cumo_cuda_runtime_free(workspace);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+    assert(returned_algo_count == 1);
+
+    // TODO: thread-safe
+    algo_cache_map[key] = {perf_result->algo, perf_result->memory};
+    return status;
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_FindConvolutionBackwardDataAlgorithm(
+        cudnnConvolutionBwdDataAlgoPerf_t *perf_result,
+        cudnnHandle_t handle,
+        cudnnFilterDescriptor_t w_desc,
+        VALUE w,
+        cudnnTensorDescriptor_t x_desc,
+        VALUE x,
+        cudnnConvolutionDescriptor_t conv_desc,
+        cudnnTensorDescriptor_t y_desc,
+        VALUE y,
+        size_t max_workspace_size,
+        int* int_stride,
+        int* int_pad,
+        size_t ndim,
+        cudnnDataType_t cudnn_dtype)
+{
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    cumo_narray_t *nx, *nw, *ny;
+    CumoGetNArray(x, nx);
+    CumoGetNArray(w, nw);
+    CumoGetNArray(y, ny);
+
+    auto key = AlgoCacheKey{};
+    key.ndim = ndim;
+    for (size_t idim = 0; idim < ndim + 2; ++idim) {
+        key.x_shape[idim] = nx->shape[idim];
+        key.w_shape[idim] = nw->shape[idim];
+        key.y_shape[idim] = ny->shape[idim];
+    }
+    for (size_t idim = 0; idim < ndim; ++idim) {
+        key.pad[idim] = int_pad[idim];
+        key.stride[idim] = int_stride[idim];
+    }
+    key.dtype = cudnn_dtype;
+    key.max_workspace_size = max_workspace_size;
+
+    auto& algo_cache_map = bwd_data_algo_cache_map_;
+    // TODO: thread-safe
+    auto it = algo_cache_map.find(key);
+    if (it != algo_cache_map.end()) {
+        auto pair = it->second;
+        perf_result->algo = pair.first;
+        perf_result->memory = pair.second;
+        return CUDNN_STATUS_SUCCESS;
+    }
+
+    char* x_ptr = cumo_na_get_offset_pointer_for_read(x);
+    char* w_ptr = cumo_na_get_offset_pointer_for_read(w);
+    char* y_ptr = cumo_na_get_offset_pointer_for_read(y);
+
+    char* workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+    int returned_algo_count{};
+    status = cudnnFindConvolutionBackwardDataAlgorithmEx(
+            handle,
+            w_desc,
+            (void*)w_ptr,
+            x_desc,
+            (void*)x_ptr,
+            conv_desc,
+            y_desc,
+            (void*)y_ptr,
+            1, // requested algo count
+            &returned_algo_count,
+            perf_result,
+            (void*)workspace,
+            max_workspace_size);
+    cumo_cuda_runtime_free(workspace);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+    assert(returned_algo_count == 1);
+
+    // TODO: thread-safe
+    algo_cache_map[key] = {perf_result->algo, perf_result->memory};
+    return status;
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_FindConvolutionBackwardFilterAlgorithm(
+        cudnnConvolutionBwdFilterAlgoPerf_t *perf_result,
+        cudnnHandle_t handle,
+        cudnnTensorDescriptor_t x_desc,
+        VALUE x,
+        cudnnTensorDescriptor_t gy_desc,
+        VALUE gy,
+        cudnnConvolutionDescriptor_t conv_desc,
+        cudnnFilterDescriptor_t gw_desc,
+        VALUE gw,
+        size_t max_workspace_size,
+        int* int_stride,
+        int* int_pad,
+        size_t ndim,
+        cudnnDataType_t cudnn_dtype)
+{
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    cumo_narray_t *nx, *ngy, *ngw;
+    CumoGetNArray(x, nx);
+    CumoGetNArray(gy, ngy);
+    CumoGetNArray(gw, ngw);
+
+    auto key = AlgoCacheKey{};
+    key.ndim = ndim;
+    for (size_t idim = 0; idim < ndim + 2; ++idim) {
+        key.x_shape[idim] = nx->shape[idim];
+        key.w_shape[idim] = ngw->shape[idim];
+        key.y_shape[idim] = ngy->shape[idim];
+    }
+    for (size_t idim = 0; idim < ndim; ++idim) {
+        key.pad[idim] = int_pad[idim];
+        key.stride[idim] = int_stride[idim];
+    }
+    key.dtype = cudnn_dtype;
+    key.max_workspace_size = max_workspace_size;
+
+    auto& algo_cache_map = bwd_filter_algo_cache_map_;
+    // TODO: thread-safe
+    auto it = algo_cache_map.find(key);
+    if (it != algo_cache_map.end()) {
+        auto pair = it->second;
+        perf_result->algo = pair.first;
+        perf_result->memory = pair.second;
+        return CUDNN_STATUS_SUCCESS;
+    }
+
+    char* x_ptr = cumo_na_get_offset_pointer_for_read(x);
+    char* gy_ptr = cumo_na_get_offset_pointer_for_read(gy);
+    char* gw_ptr = cumo_na_get_offset_pointer_for_read(gw);
+
+    char* workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+    int returned_algo_count{};
+    status = cudnnFindConvolutionBackwardFilterAlgorithmEx(
+            handle,
+            x_desc,
+            (void*)x_ptr,
+            gy_desc,
+            (void*)gy_ptr,
+            conv_desc,
+            gw_desc,
+            (void*)gw_ptr,
+            1, // requested algo count
+            &returned_algo_count,
+            perf_result,
+            (void*)workspace,
+            max_workspace_size);
+    cumo_cuda_runtime_free(workspace);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+    assert(returned_algo_count == 1);
+
+    // TODO: thread-safe
+    algo_cache_map[key] = {perf_result->algo, perf_result->memory};
+    return status;
+}
+
+// TODO(sonots): Support other than 4, 5 dimensional arrays by reshaping into 4-dimensional arrays as Chainer does.
+cudnnBatchNormMode_t
+cumo_cuda_cudnn_GetBatchNormMode(size_t ndim, int* axis) {
+    if (ndim == 1 && axis[0] == 0) { // (1, channels, (depth, )height, width)
+        return CUDNN_BATCHNORM_PER_ACTIVATION;
+    }
+    if ((ndim == 3 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3) ||
+        (ndim == 4 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3 && axis[3] == 4)) { // (1, channels, (1, )1, 1)
+        // TODO: Consider CUDNN_BATCHNORM_SPATIAL_PERSISTENT if we can afford to check for overflow, with or without blocking.
+        return CUDNN_BATCHNORM_SPATIAL;
+    }
+    rb_raise(rb_eRuntimeError, "Invalid axis for BatchNorm using cuDNN. Expected 1, 3 or 4 dimensions.");
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_CreateBNTensorDescriptor(
+        cudnnTensorDescriptor_t *desc,
+        cudnnTensorDescriptor_t x_desc,
+        cudnnBatchNormMode_t mode)
+{
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    status = cudnnCreateTensorDescriptor(desc);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+
+    status = cudnnDeriveBNTensorDescriptor(*desc, x_desc, mode);
+    return status;
+}
+
+size_t
+cumo_cuda_cudnn_ReduceShape(
+        size_t *reduced_shape,
+        size_t shape_ndim,
+        size_t *shape,
+        size_t axes_ndim,
+        int *axes,
+        char keepdims) {
+    assert(shape_ndim >= axes_ndim);
+    size_t i_axis = 0;
+    size_t i_shape = 0;
+    for (size_t i = 0; i < shape_ndim; ++i) {
+        if (i_axis < axes_ndim && i == (size_t)axes[i_axis]) {
+            ++i_axis;
+            if (keepdims) {
+                reduced_shape[i_shape++] = 1;
+            }
+        } else {
+            reduced_shape[i_shape++] = shape[i];
+        }
+    }
+    assert(i_axis == axes_ndim);
+    assert(i_shape == shape_ndim - static_cast<int8_t>(!keepdims) * axes_ndim);
+    return i_shape;
+}
+
+#if defined(__cplusplus)
+#if 0
+{ /* satisfy cc-mode */
+#endif
+} /* extern "C" { */
+#endif
+
+#endif // CUDNN_FOUND