cumo 0.2.5 → 0.3.0.pre1
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -1
- data/README.md +12 -1
- data/cumo.gemspec +1 -1
- data/ext/cumo/cuda/cudnn.c +80 -0
- data/ext/cumo/cuda/cudnn_impl.cpp +572 -0
- data/ext/cumo/cuda/runtime.c +1 -0
- data/ext/cumo/cumo.c +5 -0
- data/ext/cumo/extconf.rb +8 -2
- data/ext/cumo/include/cumo.h +2 -2
- data/ext/cumo/include/cumo/cuda/cudnn.h +205 -0
- data/ext/cumo/include/cumo/hash_combine.hpp +17 -0
- data/ext/cumo/include/cumo/intern.h +5 -0
- data/ext/cumo/include/cumo/types/dfloat.h +1 -0
- data/ext/cumo/include/cumo/types/sfloat.h +1 -0
- data/ext/cumo/narray/gen/spec.rb +21 -0
- data/ext/cumo/narray/gen/tmpl/batch_norm.c +197 -0
- data/ext/cumo/narray/gen/tmpl/batch_norm_backward.c +191 -0
- data/ext/cumo/narray/gen/tmpl/conv.c +216 -0
- data/ext/cumo/narray/gen/tmpl/conv_grad_w.c +183 -0
- data/ext/cumo/narray/gen/tmpl/conv_transpose.c +244 -0
- data/ext/cumo/narray/gen/tmpl/gemm.c +14 -0
- data/ext/cumo/narray/gen/tmpl/pooling_backward.c +136 -0
- data/ext/cumo/narray/gen/tmpl/pooling_forward.c +136 -0
- data/ext/cumo/narray/narray.c +29 -0
- data/lib/cumo/cuda.rb +1 -0
- data/lib/cumo/cuda/cudnn.rb +88 -0
- metadata +18 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6fbd39b063f8c40636b699f956ab5ebf4905a58013fd249819845f3dc525f77a
+  data.tar.gz: 66c369f01877aa42e73dba6bcaf5a499f52084024171c7f1b5e561c1da08f7e0
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6d98b07a55ead442c4edd2e2e3c648d58d26a1343938eee60ff1cf8ee3bfd9b0539c5c650e349eef5d116ef0f5cd095f077a9cd2586cddcd01023e8e7cdb225e
+  data.tar.gz: 22012ddfb97cde8ff78324c599351bd5aa16bac5747208c573def3b0f2d2f47f0e1d55dc83393685a0e54a006be08bab2c15631ed26c940ecd20cd07efb4a86d
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,17 @@
-# 0.2.5 (2019-03-04)
+# 0.3.0.pre1 (2019-04-09)
+
+Enhancements:
+
+* Support cuDNN
+  * conv (cudnnConvolution)
+  * conv\_transpose (cudnnConvolutionBackwardData)
+  * conv\_grad\_w (cudnnConvolutionBackwardFilter)
+  * batch\_norm (cudnnBatchNormalization)
+  * batch\_norm\_backward (cudnnBatchNormalizationBackward)
+  * avg\_pool and max\_pool (cudnnPoolingForward)
+  * avg\_pool\_backward and max\_pool\_backward (cudnnPoolingBackward)
+
+# 0.2.5 (2019-03-04)
 
 Enhancements:
 
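For context, a minimal usage sketch of the cuDNN-backed methods listed in the changelog hunk above. Method names come from the changelog; everything else (that they are instance methods on `Cumo::SFloat`, the `stride:`/`pad:` keywords, the resulting shape) is an illustrative assumption, not a confirmed signature.

```ruby
# Hypothetical sketch only: argument names and defaults are assumptions.
require "cumo"

if Cumo::CUDA::CUDNN.available?
  x = Cumo::SFloat.new(1, 3, 32, 32).rand   # NCHW input
  w = Cumo::SFloat.new(8, 3, 3, 3).rand     # filters: out_ch x in_ch x kh x kw
  y = x.conv(w, stride: 1, pad: 1)          # backed by cudnnConvolution
  p y.shape                                 # => [1, 8, 32, 32] if the assumed signature holds
end
```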
data/README.md
CHANGED
@@ -22,6 +22,17 @@ export PATH="$CUDA_PATH/bin:$PATH"
 export LIBRARY_PATH="$CUDA_PATH/lib64:$CUDA_PATH/lib:$LIBRARY_PATH"
 ```
 
+To use cuDNN features, install cuDNN and set your environment variables as follows:
+
+```
+export CUDNN_ROOT_DIR=/path/to/cudnn
+export CPATH=$CUDNN_ROOT_DIR/include:$CPATH
+export LD_LIBRARY_PATH=$CUDNN_ROOT_DIR/lib64:$LD_LIBRARY_PATH
+export LIBRARY_PATH=$CUDNN_ROOT_DIR/lib64:$LIBRARY_PATH
+```
+
+FYI: I use [cudnnenv](https://github.com/unnonouno/cudnnenv) to install cudnn under my home directory like `export CUDNN_ROOT_DIR=/home/sonots/.cudnn/active/cuda`.
+
 ## Installation
 
 Add the following line to your Gemfile:
@@ -216,7 +227,7 @@ bundle exec gdb -x run.gdb --args ruby test/narray_test.rb
 
 You may put a breakpoint by calling `cumo_debug_breakpoint()` at C source codes.
 
-### Run tests only a specific line
+### Run tests only a specific line
 `--location` option is available as:
 
 ```
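A quick way to confirm that the cuDNN setup described in the README hunk above was picked up when the extension was built (illustrative; `Cumo::CUDA::CUDNN.available?` is defined in the new cudnn.c shown further below):

```ruby
require "cumo"
puts Cumo::CUDA::CUDNN.available?  # => true when cumo was compiled against cuDNN
```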
data/cumo.gemspec
CHANGED
@@ -2,7 +2,7 @@
 lib = File.expand_path("../lib", __FILE__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 
-cumo_version = File.read(File.join(__dir__, "ext/cumo/include/cumo.h")).match(/CUMO_VERSION "([
+cumo_version = File.read(File.join(__dir__, "ext/cumo/include/cumo.h")).match(/CUMO_VERSION "([^"]+)"/)[1]
 numo_narray_version = File.read(File.join(__dir__, "numo-narray-version")).strip
 
 Gem::Specification.new do |spec|
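The regex in the gemspec hunk above extracts the version string from `ext/cumo/include/cumo.h`. A small illustration (the `#define` line below is an assumption for the example, not shown in this diff):

```ruby
line = '#define CUMO_VERSION "0.3.0.pre1"'   # assumed contents of cumo.h
p line.match(/CUMO_VERSION "([^"]+)"/)[1]    # => "0.3.0.pre1"
```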
data/ext/cumo/cuda/cudnn.c
ADDED
@@ -0,0 +1,80 @@
+#include "cumo/cuda/cudnn.h"
+
+#include <assert.h>
+#include <ruby.h>
+#include "cumo/narray.h"
+#include "cumo/template.h"
+#include "cumo/cuda/runtime.h"
+
+VALUE cumo_cuda_eCUDNNError;
+VALUE cumo_cuda_mCUDNN;
+#define eCUDNNError cumo_cuda_eCUDNNError
+#define mCUDNN cumo_cuda_mCUDNN
+
+#ifdef CUDNN_FOUND
+
+void
+cumo_cuda_cudnn_check_status(cudnnStatus_t status)
+{
+    if (status != CUDNN_STATUS_SUCCESS) {
+        rb_raise(cumo_cuda_eCUDNNError, "%s (error=%d)", cudnnGetErrorString(status), status);
+    }
+}
+
+// Lazily initialize cudnn handle, and cache it
+cudnnHandle_t
+cumo_cuda_cudnn_handle()
+{
+    static cudnnHandle_t *handles = 0; // handle is never destroyed
+    int device;
+    if (handles == 0) {
+        int i;
+        int device_count = cumo_cuda_runtime_get_device_count();
+        handles = malloc(sizeof(cudnnHandle_t) * device_count);
+        for (i = 0; i < device_count; ++i) {
+            handles[i] = 0;
+        }
+    }
+    device = cumo_cuda_runtime_get_device();
+    if (handles[device] == 0) {
+        cudnnCreate(&handles[device]);
+    }
+    return handles[device];
+}
+
+#endif // CUDNN_FOUND
+
+/*
+  Returns availability of cuDNN.
+
+  @return [Boolean] Returns true if cuDNN is available
+*/
+static VALUE
+rb_cudnn_available_p()
+{
+#if CUDNN_FOUND
+    return Qtrue;
+#else
+    return Qfalse;
+#endif
+}
+
+void
+Init_cumo_cuda_cudnn(void)
+{
+    VALUE mCumo = rb_define_module("Cumo");
+    VALUE mCUDA = rb_define_module_under(mCumo, "CUDA");
+
+    /*
+      Document-module: Cumo::CUDNN
+    */
+    mCUDNN = rb_define_module_under(mCUDA, "CUDNN");
+    rb_define_const(mCUDA, "Cudnn", mCUDNN); // alias
+    eCUDNNError = rb_define_class_under(mCUDA, "CUDNNError", rb_eStandardError);
+
+    rb_define_singleton_method(mCUDNN, "available?", RUBY_METHOD_FUNC(rb_cudnn_available_p), 0);
+    rb_define_const(mCUDNN, "CUDNN_POOLING_MAX", INT2NUM(CUDNN_POOLING_MAX));
+    rb_define_const(mCUDNN, "CUDNN_POOLING_MAX_DETERMINISTIC", INT2NUM(CUDNN_POOLING_MAX_DETERMINISTIC));
+    rb_define_const(mCUDNN, "CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING", INT2NUM(CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING));
+    rb_define_const(mCUDNN, "CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING", INT2NUM(CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING));
+}
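Illustrative only: the module, error class, pooling-mode constants, and `available?` singleton method registered in `Init_cumo_cuda_cudnn` above are what the Ruby side sees. A small sketch, assuming cumo was built with cuDNN available:

```ruby
require "cumo"

p Cumo::CUDA::CUDNN                        # module (also aliased as Cumo::CUDA::Cudnn)
p Cumo::CUDA::CUDNNError < StandardError   # => true; raised when a cuDNN call fails
p Cumo::CUDA::CUDNN::CUDNN_POOLING_MAX     # integer cudnnPoolingMode_t value
```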
data/ext/cumo/cuda/cudnn_impl.cpp
ADDED
@@ -0,0 +1,572 @@
+#ifdef CUDNN_FOUND
+
+#include "cumo/cuda/cudnn.h"
+
+#include <assert.h>
+#include <ruby.h>
+#include <cudnn.h>
+#include "cumo/narray.h"
+#include "cumo/template.h"
+#include "cumo/cuda/runtime.h"
+#include "cumo/cuda/memory_pool.h"
+
+#include <unordered_map>
+
+#if defined(__cplusplus)
+extern "C" {
+#if 0
+} /* satisfy cc-mode */
+#endif
+#endif
+
+// cover_all=true is not supported
+size_t
+cumo_cuda_cudnn_GetConvOutDim(
+        size_t in_dim,
+        size_t kernel_size,
+        size_t stride,
+        size_t pad) {
+    int64_t numerator;
+    assert(stride > 0);
+    // if (cover_all) {
+    //     numerator = in_dim + pad * 2 - kernel_size + stride - 1;
+    // } else {
+    numerator = in_dim + pad * 2 - kernel_size;
+    // }
+    if (numerator < 0) {
+        rb_raise(rb_eRuntimeError, "Output size should be positive.");
+    }
+    return (size_t)(numerator / stride + 1);
+}
+
+// cover_all=true is not supported
+size_t
+cumo_cuda_cudnn_GetConvTransposeOutDim(
+        size_t in_dim,
+        size_t kernel_size,
+        size_t stride,
+        size_t pad) {
+    // if (cover_all) {
+    //     return stride * (in_dim - 1) + kernel_size - stride + 1 - 2 * pad;
+    // }
+    int64_t out_size = stride * (in_dim - 1) + kernel_size - 2 * pad;
+    if (out_size < 0) {
+        rb_raise(rb_eRuntimeError, "Output size should be positive.");
+    }
+    return (size_t)out_size;
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_CreateTensorDescriptor(
+        cudnnTensorDescriptor_t *desc,
+        VALUE a, cudnnDataType_t cudnn_dtype) {
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    cumo_narray_t *na;
+    CumoGetNArray(a, na);
+    int ndim = (int)(na->ndim);
+    size_t *shape = na->shape;
+
+    assert(cumo_na_check_contiguous(a) == Qtrue);
+    status = cudnnCreateTensorDescriptor(desc);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+
+    if (ndim == 4) {
+        status = cudnnSetTensor4dDescriptor(
+                *desc, CUDNN_TENSOR_NCHW, cudnn_dtype, shape[0], shape[1], shape[2], shape[3]);
+    }
+    else {
+        int int_shape[CUMO_NA_MAX_DIMENSION];
+        for (int idim = 0; idim < ndim; ++idim) {
+            int_shape[idim] = (int)(shape[idim]);
+        }
+        int int_strides[CUMO_NA_MAX_DIMENSION]; // strides divided by item size
+        int stride = 1;
+        for (int idim = ndim - 1; idim >= 0; --idim) {
+            int_strides[idim] = stride;
+            stride *= int_shape[idim];
+        }
+        status = cudnnSetTensorNdDescriptor(*desc, cudnn_dtype, ndim, int_shape, int_strides);
+    }
+    return status;
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_CreateFilterDescriptor(
+        cudnnFilterDescriptor_t *desc,
+        VALUE a,
+        cudnnDataType_t cudnn_dtype) {
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    cumo_narray_t *na;
+    int ndim;
+    size_t *shape;
+
+    CumoGetNArray(a, na);
+    ndim = (int)(na->ndim);
+    shape = na->shape;
+
+    assert(cumo_na_check_contiguous(a) == Qtrue);
+    status = cudnnCreateFilterDescriptor(desc);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+
+    if (ndim == 4) {
+        status = cudnnSetFilter4dDescriptor(
+                *desc, cudnn_dtype, CUDNN_TENSOR_NCHW, shape[0], shape[1], shape[2], shape[3]);
+    } else {
+        int int_shape[CUMO_NA_MAX_DIMENSION];
+        for (int idim = 0; idim < ndim; ++idim) {
+            int_shape[idim] = (int)(shape[idim]);
+        }
+        status = cudnnSetFilterNdDescriptor(*desc, cudnn_dtype, CUDNN_TENSOR_NCHW, ndim, int_shape);
+    }
+
+    return status;
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_CreateConvolutionDescriptor(
+        cudnnConvolutionDescriptor_t *desc,
+        size_t ndim,
+        int* int_stride,
+        int* int_pad,
+        cudnnDataType_t cudnn_dtype) {
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    int int_dilation[CUMO_NA_MAX_DIMENSION];
+    for (size_t idim = 0; idim < ndim; ++idim) {
+        int_dilation[idim] = 1;
+    }
+
+    status = cudnnCreateConvolutionDescriptor(desc);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+
+    if (ndim == 2) {
+        status = cudnnSetConvolution2dDescriptor(
+                *desc,
+                int_pad[0],
+                int_pad[1],
+                int_stride[0],
+                int_stride[1],
+                int_dilation[0],
+                int_dilation[1],
+                CUDNN_CROSS_CORRELATION,
+                cudnn_dtype);
+    } else {
+        status = cudnnSetConvolutionNdDescriptor(
+                *desc,
+                ndim,
+                int_pad,
+                int_stride,
+                int_dilation,
+                CUDNN_CROSS_CORRELATION,
+                cudnn_dtype);
+    }
+
+    return status;
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_CreatePoolingDescriptor(
+        cudnnPoolingDescriptor_t *desc,
+        cudnnPoolingMode_t mode,
+        size_t ndim,
+        int* int_kernel_size,
+        int* int_stride,
+        int* int_pad) {
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+
+    status = cudnnCreatePoolingDescriptor(desc);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+
+    if (ndim == 2) {
+        status = cudnnSetPooling2dDescriptor(
+                *desc,
+                mode,
+                CUDNN_NOT_PROPAGATE_NAN,
+                int_kernel_size[0],
+                int_kernel_size[1],
+                int_pad[0],
+                int_pad[1],
+                int_stride[0],
+                int_stride[1]);
+    } else {
+        status = cudnnSetPoolingNdDescriptor(
+                *desc,
+                mode,
+                CUDNN_NOT_PROPAGATE_NAN,
+                ndim,
+                int_kernel_size,
+                int_pad,
+                int_stride);
+    }
+
+    return status;
+}
+
+// Borrowed from boost::hash_combine
+//
+// TODO(sonots): hash combine in 64bit
+static void HashCombine(std::size_t& seed, std::size_t hash_value) {
+    seed ^= hash_value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+
+// Partially Borrowed from ChainerX
+struct AlgoCacheKey {
+    size_t ndim;  // # of spatial dimensions
+    size_t x_shape[CUMO_NA_MAX_DIMENSION];
+    size_t w_shape[CUMO_NA_MAX_DIMENSION];
+    size_t y_shape[CUMO_NA_MAX_DIMENSION];
+    size_t pad[CUMO_NA_MAX_DIMENSION];
+    size_t stride[CUMO_NA_MAX_DIMENSION];
+    cudnnDataType_t dtype;
+    size_t max_workspace_size;
+
+    bool operator==(const AlgoCacheKey& other) const {
+        if (ndim != other.ndim) return false;
+        if (dtype != other.dtype) return false;
+        if (max_workspace_size != other.max_workspace_size) return false;
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            if (x_shape[idim] != other.x_shape[idim]) return false;
+        }
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            if (w_shape[idim] != other.w_shape[idim]) return false;
+        }
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            if (y_shape[idim] != other.y_shape[idim]) return false;
+        }
+        for (size_t idim = 0; idim < ndim; ++idim) {
+            if (pad[idim] != other.pad[idim]) return false;
+        }
+        for (size_t idim = 0; idim < ndim; ++idim) {
+            if (stride[idim] != other.stride[idim]) return false;
+        }
+        return true;
+    }
+
+    bool operator!=(const AlgoCacheKey& other) const { return !operator==(other); }
+};
+
+struct AlgoCacheKeyHash {
+    using result_type = std::size_t;
+    std::size_t operator()(const AlgoCacheKey& key) const {
+        std::size_t seed = 0;
+        size_t ndim = key.ndim;
+        HashCombine(seed, std::hash<size_t>()(key.ndim));
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            HashCombine(seed, std::hash<size_t>()(key.x_shape[idim]));
+        }
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            HashCombine(seed, std::hash<size_t>()(key.w_shape[idim]));
+        }
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            HashCombine(seed, std::hash<size_t>()(key.y_shape[idim]));
+        }
+        for (size_t idim = 0; idim < ndim; ++idim) {
+            HashCombine(seed, std::hash<size_t>()(key.pad[idim]));
+        }
+        for (size_t idim = 0; idim < ndim; ++idim) {
+            HashCombine(seed, std::hash<size_t>()(key.stride[idim]));
+        }
+        HashCombine(seed, std::hash<int>()((int)(key.dtype)));
+        HashCombine(seed, std::hash<size_t>()(key.max_workspace_size));
+        return seed;
+    }
+};
+
+using FwdAlgoCacheMap = std::unordered_map<AlgoCacheKey, std::pair<cudnnConvolutionFwdAlgo_t, size_t>, AlgoCacheKeyHash>;
+using BwdDataAlgoCacheMap = std::unordered_map<AlgoCacheKey, std::pair<cudnnConvolutionBwdDataAlgo_t, size_t>, AlgoCacheKeyHash>;
+using BwdFilterAlgoCacheMap = std::unordered_map<AlgoCacheKey, std::pair<cudnnConvolutionBwdFilterAlgo_t, size_t>, AlgoCacheKeyHash>;
+
+// TODO: Another cache for another device
+static FwdAlgoCacheMap fwd_algo_cache_map_{};
+static BwdDataAlgoCacheMap bwd_data_algo_cache_map_{};
+static BwdFilterAlgoCacheMap bwd_filter_algo_cache_map_{};
282
|
+
|
283
|
+
cudnnStatus_t
|
284
|
+
cumo_cuda_cudnn_FindConvolutionForwardAlgorithm(
|
285
|
+
cudnnConvolutionFwdAlgoPerf_t *perf_result,
|
286
|
+
cudnnHandle_t handle,
|
287
|
+
cudnnTensorDescriptor_t x_desc,
|
288
|
+
VALUE x,
|
289
|
+
cudnnFilterDescriptor_t w_desc,
|
290
|
+
VALUE w,
|
291
|
+
cudnnConvolutionDescriptor_t conv_desc,
|
292
|
+
cudnnTensorDescriptor_t y_desc,
|
293
|
+
VALUE y,
|
294
|
+
size_t max_workspace_size,
|
295
|
+
int* int_stride,
|
296
|
+
int* int_pad,
|
297
|
+
size_t ndim,
|
298
|
+
cudnnDataType_t cudnn_dtype)
|
299
|
+
{
|
300
|
+
cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
|
301
|
+
cumo_narray_t *nx, *nw, *ny;
|
302
|
+
CumoGetNArray(x, nx);
|
303
|
+
CumoGetNArray(w, nw);
|
304
|
+
CumoGetNArray(y, ny);
|
305
|
+
|
306
|
+
auto key = AlgoCacheKey{};
|
307
|
+
key.ndim = ndim;
|
308
|
+
for (size_t idim = 0; idim < ndim + 2; ++idim) {
|
309
|
+
key.x_shape[idim] = nx->shape[idim];
|
310
|
+
key.w_shape[idim] = nw->shape[idim];
|
311
|
+
key.y_shape[idim] = ny->shape[idim];
|
312
|
+
}
|
313
|
+
for (size_t idim = 0; idim < ndim; ++idim) {
|
314
|
+
key.pad[idim]= int_pad[idim];
|
315
|
+
key.stride[idim]= int_stride[idim];
|
316
|
+
}
|
317
|
+
key.dtype = cudnn_dtype;
|
318
|
+
key.max_workspace_size = max_workspace_size;
|
319
|
+
|
320
|
+
auto& algo_cache_map = fwd_algo_cache_map_;
|
321
|
+
// TODO: thread-safe
|
322
|
+
auto it = algo_cache_map.find(key);
|
323
|
+
if (it != algo_cache_map.end()) {
|
324
|
+
auto pair = it->second;
|
325
|
+
perf_result->algo = pair.first;
|
326
|
+
perf_result->memory = pair.second;
|
327
|
+
return CUDNN_STATUS_SUCCESS;
|
328
|
+
}
|
329
|
+
|
330
|
+
char* x_ptr = cumo_na_get_offset_pointer_for_read(x);
|
331
|
+
char* w_ptr = cumo_na_get_offset_pointer_for_read(w);
|
332
|
+
char* y_ptr = cumo_na_get_offset_pointer_for_read(y);
|
333
|
+
|
334
|
+
char* workspace = cumo_cuda_runtime_malloc(max_workspace_size);
|
335
|
+
int returned_algo_count{};
|
336
|
+
status = cudnnFindConvolutionForwardAlgorithmEx(
|
337
|
+
handle,
|
338
|
+
x_desc,
|
339
|
+
(void*)x_ptr,
|
340
|
+
w_desc,
|
341
|
+
(void*)w_ptr,
|
342
|
+
conv_desc,
|
343
|
+
y_desc,
|
344
|
+
(void*)y_ptr,
|
345
|
+
1, // requested algo count,
|
346
|
+
&returned_algo_count,
|
347
|
+
perf_result,
|
348
|
+
(void*)workspace,
|
349
|
+
max_workspace_size);
|
350
|
+
cumo_cuda_runtime_free(workspace);
|
351
|
+
if (status != CUDNN_STATUS_SUCCESS) return status;
|
352
|
+
assert(returned_algo_count == 1);
|
353
|
+
|
354
|
+
// TODO: thread-safe
|
355
|
+
algo_cache_map[key] = {perf_result->algo, perf_result->memory};
|
356
|
+
return status;
|
357
|
+
}
|
358
|
+
|
359
|
+
cudnnStatus_t
|
360
|
+
cumo_cuda_cudnn_FindConvolutionBackwardDataAlgorithm(
|
361
|
+
cudnnConvolutionBwdDataAlgoPerf_t *perf_result,
|
362
|
+
cudnnHandle_t handle,
|
363
|
+
cudnnFilterDescriptor_t w_desc,
|
364
|
+
VALUE w,
|
365
|
+
cudnnTensorDescriptor_t x_desc,
|
366
|
+
VALUE x,
|
367
|
+
cudnnConvolutionDescriptor_t conv_desc,
|
368
|
+
cudnnTensorDescriptor_t y_desc,
|
369
|
+
VALUE y,
|
370
|
+
size_t max_workspace_size,
|
371
|
+
int* int_stride,
|
372
|
+
int* int_pad,
|
373
|
+
size_t ndim,
|
374
|
+
cudnnDataType_t cudnn_dtype)
|
375
|
+
{
|
376
|
+
cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
|
377
|
+
cumo_narray_t *nx, *nw, *ny;
|
378
|
+
CumoGetNArray(x, nx);
|
379
|
+
CumoGetNArray(w, nw);
|
380
|
+
CumoGetNArray(y, ny);
|
381
|
+
|
382
|
+
auto key = AlgoCacheKey{};
|
383
|
+
key.ndim = ndim;
|
384
|
+
for (size_t idim = 0; idim < ndim + 2; ++idim) {
|
385
|
+
key.x_shape[idim] = nx->shape[idim];
|
386
|
+
key.w_shape[idim] = nw->shape[idim];
|
387
|
+
key.y_shape[idim] = ny->shape[idim];
|
388
|
+
}
|
389
|
+
for (size_t idim = 0; idim < ndim; ++idim) {
|
390
|
+
key.pad[idim]= int_pad[idim];
|
391
|
+
key.stride[idim]= int_stride[idim];
|
392
|
+
}
|
393
|
+
key.dtype = cudnn_dtype;
|
394
|
+
key.max_workspace_size = max_workspace_size;
|
395
|
+
|
396
|
+
auto& algo_cache_map = bwd_data_algo_cache_map_;
|
397
|
+
// TODO: thread-safe
|
398
|
+
auto it = algo_cache_map.find(key);
|
399
|
+
if (it != algo_cache_map.end()) {
|
400
|
+
auto pair = it->second;
|
401
|
+
perf_result->algo = pair.first;
|
402
|
+
perf_result->memory = pair.second;
|
403
|
+
return CUDNN_STATUS_SUCCESS;
|
404
|
+
}
|
405
|
+
|
406
|
+
char* x_ptr = cumo_na_get_offset_pointer_for_read(x);
|
407
|
+
char* w_ptr = cumo_na_get_offset_pointer_for_read(w);
|
408
|
+
char* y_ptr = cumo_na_get_offset_pointer_for_read(y);
|
409
|
+
|
410
|
+
char* workspace = cumo_cuda_runtime_malloc(max_workspace_size);
|
411
|
+
int returned_algo_count{};
|
412
|
+
status = cudnnFindConvolutionBackwardDataAlgorithmEx(
|
413
|
+
handle,
|
414
|
+
w_desc,
|
415
|
+
(void*)w_ptr,
|
416
|
+
x_desc,
|
417
|
+
(void*)x_ptr,
|
418
|
+
conv_desc,
|
419
|
+
y_desc,
|
420
|
+
(void*)y_ptr,
|
421
|
+
1, // requested algo count,
|
422
|
+
&returned_algo_count,
|
423
|
+
perf_result,
|
424
|
+
(void*)workspace,
|
425
|
+
max_workspace_size);
|
426
|
+
cumo_cuda_runtime_free(workspace);
|
427
|
+
if (status != CUDNN_STATUS_SUCCESS) return status;
|
428
|
+
assert(returned_algo_count == 1);
|
429
|
+
|
430
|
+
// TODO: thread-safe
|
431
|
+
algo_cache_map[key] = {perf_result->algo, perf_result->memory};
|
432
|
+
return status;
|
433
|
+
}
|
434
|
+
|
435
|
+
cudnnStatus_t
|
436
|
+
cumo_cuda_cudnn_FindConvolutionBackwardFilterAlgorithm(
|
437
|
+
cudnnConvolutionBwdFilterAlgoPerf_t *perf_result,
|
438
|
+
cudnnHandle_t handle,
|
439
|
+
cudnnTensorDescriptor_t x_desc,
|
440
|
+
VALUE x,
|
441
|
+
cudnnTensorDescriptor_t gy_desc,
|
442
|
+
VALUE gy,
|
443
|
+
cudnnConvolutionDescriptor_t conv_desc,
|
444
|
+
cudnnFilterDescriptor_t gw_desc,
|
445
|
+
VALUE gw,
|
446
|
+
size_t max_workspace_size,
|
447
|
+
int* int_stride,
|
448
|
+
int* int_pad,
|
449
|
+
size_t ndim,
|
450
|
+
cudnnDataType_t cudnn_dtype)
|
451
|
+
{
|
452
|
+
cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
|
453
|
+
cumo_narray_t *nx, *ngy, *ngw;
|
454
|
+
CumoGetNArray(x, nx);
|
455
|
+
CumoGetNArray(gy, ngy);
|
456
|
+
CumoGetNArray(gw, ngw);
|
457
|
+
|
458
|
+
auto key = AlgoCacheKey{};
|
459
|
+
key.ndim = ndim;
|
460
|
+
for (size_t idim = 0; idim < ndim + 2; ++idim) {
|
461
|
+
key.x_shape[idim] = nx->shape[idim];
|
462
|
+
key.w_shape[idim] = ngw->shape[idim];
|
463
|
+
key.y_shape[idim] = ngy->shape[idim];
|
464
|
+
}
|
465
|
+
for (size_t idim = 0; idim < ndim; ++idim) {
|
466
|
+
key.pad[idim]= int_pad[idim];
|
467
|
+
key.stride[idim]= int_stride[idim];
|
468
|
+
}
|
469
|
+
key.dtype = cudnn_dtype;
|
470
|
+
key.max_workspace_size = max_workspace_size;
|
471
|
+
|
472
|
+
auto& algo_cache_map = bwd_filter_algo_cache_map_;
|
473
|
+
// TODO: thread-safe
|
474
|
+
auto it = algo_cache_map.find(key);
|
475
|
+
if (it != algo_cache_map.end()) {
|
476
|
+
auto pair = it->second;
|
477
|
+
perf_result->algo = pair.first;
|
478
|
+
perf_result->memory = pair.second;
|
479
|
+
return CUDNN_STATUS_SUCCESS;
|
480
|
+
}
|
481
|
+
|
482
|
+
char* x_ptr = cumo_na_get_offset_pointer_for_read(x);
|
483
|
+
char* gy_ptr = cumo_na_get_offset_pointer_for_read(gy);
|
484
|
+
char* gw_ptr = cumo_na_get_offset_pointer_for_read(gw);
|
485
|
+
|
486
|
+
char* workspace = cumo_cuda_runtime_malloc(max_workspace_size);
|
487
|
+
int returned_algo_count{};
|
488
|
+
status = cudnnFindConvolutionBackwardFilterAlgorithmEx(
|
489
|
+
handle,
|
490
|
+
x_desc,
|
491
|
+
(void*)x_ptr,
|
492
|
+
gy_desc,
|
493
|
+
(void*)gy_ptr,
|
494
|
+
conv_desc,
|
495
|
+
gw_desc,
|
496
|
+
(void*)gw_ptr,
|
497
|
+
1, // requested algo count,
|
498
|
+
&returned_algo_count,
|
499
|
+
perf_result,
|
500
|
+
(void*)workspace,
|
501
|
+
max_workspace_size);
|
502
|
+
cumo_cuda_runtime_free(workspace);
|
503
|
+
if (status != CUDNN_STATUS_SUCCESS) return status;
|
504
|
+
assert(returned_algo_count == 1);
|
505
|
+
|
506
|
+
// TODO: thread-safe
|
507
|
+
algo_cache_map[key] = {perf_result->algo, perf_result->memory};
|
508
|
+
return status;
|
509
|
+
}
|
510
|
+
|
511
|
+
// TODO(sonots): Support other than 4, 5 dimensional arrays by reshaping into 4-dimensional arrays as Chainer does.
|
512
|
+
cudnnBatchNormMode_t
|
513
|
+
cumo_cuda_cudnn_GetBatchNormMode(size_t ndim, int* axis) {
|
514
|
+
if (ndim == 1 && axis[0] == 0) { // (1, channels, (depth, )height, width)
|
515
|
+
return CUDNN_BATCHNORM_PER_ACTIVATION;
|
516
|
+
}
|
517
|
+
if ((ndim == 3 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3) ||
|
518
|
+
(ndim == 4 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3 && axis[3] == 4)) { // (1, channels, (1, )1, 1)
|
519
|
+
// TODO: Consider CUDNN_BATCHNORM_SPATIAL_PERSISTENT if we can afford to check for overflow, with or without blocking.
|
520
|
+
return CUDNN_BATCHNORM_SPATIAL;
|
521
|
+
}
|
522
|
+
rb_raise(rb_eRuntimeError, "Invalid axis for BatchNorm using cuDNN. Expected 1, 3 or 4 dimensions.");
|
523
|
+
}
|
524
|
+
|
525
|
+
cudnnStatus_t
|
526
|
+
cumo_cuda_cudnn_CreateBNTensorDescriptor(
|
527
|
+
cudnnTensorDescriptor_t *desc,
|
528
|
+
cudnnTensorDescriptor_t x_desc,
|
529
|
+
cudnnBatchNormMode_t mode)
|
530
|
+
{
|
531
|
+
cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
|
532
|
+
status = cudnnCreateTensorDescriptor(desc);
|
533
|
+
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+
+    status = cudnnDeriveBNTensorDescriptor(*desc, x_desc, mode);
+    return status;
+}
+
+size_t
+cumo_cuda_cudnn_ReduceShape(
+        size_t *reduced_shape,
+        size_t shape_ndim,
+        size_t *shape,
+        size_t axes_ndim,
+        int *axes,
+        char keepdims) {
+    assert(shape_ndim >= axes_ndim);
+    size_t i_axis = 0;
+    size_t i_shape = 0;
+    for (size_t i = 0; i < shape_ndim; ++i) {
+        if (i_axis < axes_ndim && i == (size_t)axes[i_axis]) {
+            ++i_axis;
+            if (keepdims) {
+                reduced_shape[i_shape++] = 1;
+            }
+        } else {
+            reduced_shape[i_shape++] = shape[i];
+        }
+    }
+    assert(i_axis == axes_ndim);
+    assert(i_shape == shape_ndim - static_cast<int8_t>(!keepdims) * axes_ndim);
+    return i_shape;
+}
+
+#if defined(__cplusplus)
+#if 0
+{ /* satisfy cc-mode */
+#endif
+} /* extern "C" { */
+#endif
+
+#endif // CUDNN_FOUND
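Two reference notes on the helpers added in cudnn_impl.cpp above. First, the output sizes computed by `cumo_cuda_cudnn_GetConvOutDim` and `cumo_cuda_cudnn_GetConvTransposeOutDim` (with `cover_all` unsupported) are:

$$\text{out}_{\text{conv}} = \left\lfloor \frac{\text{in} + 2\,\text{pad} - \text{kernel}}{\text{stride}} \right\rfloor + 1, \qquad \text{out}_{\text{conv\_transpose}} = \text{stride}\,(\text{in} - 1) + \text{kernel} - 2\,\text{pad}$$

For example, in = 32, kernel = 3, stride = 1, pad = 1 gives 32 in both directions, so the forward and transpose sizes are consistent for these settings.

Second, a sketch of what `cumo_cuda_cudnn_ReduceShape` computes, written in Ruby purely for illustration (this function is not part of the gem's Ruby API):

```ruby
# Drop the reduced axes, or keep them as size-1 dimensions when keepdims is true.
def reduce_shape(shape, axes, keepdims)
  shape.each_with_index.flat_map do |dim, i|
    axes.include?(i) ? (keepdims ? [1] : []) : [dim]
  end
end

p reduce_shape([2, 3, 4, 5], [0, 2, 3], true)   # => [1, 3, 1, 1]
p reduce_shape([2, 3, 4, 5], [0, 2, 3], false)  # => [3]
```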