cumo 0.2.5 → 0.3.0.pre1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1b28beaea182d622d304bcb3153e56aa3280993ec079aea44c00b915d1e92b77
-  data.tar.gz: 26fc0e1942a444e5f9cb4641b3e36f9985593a10de26188f0a4142e72314d82a
+  metadata.gz: 6fbd39b063f8c40636b699f956ab5ebf4905a58013fd249819845f3dc525f77a
+  data.tar.gz: 66c369f01877aa42e73dba6bcaf5a499f52084024171c7f1b5e561c1da08f7e0
 SHA512:
-  metadata.gz: a678cb7965fbbc9febf6b5f2f557f8be34f28c051fc0437a87506d3a067a34778a73b75dbeb56da14fd538062a8454355efd06bb686056db5b4df7cab9c04e86
-  data.tar.gz: 30ce98cae4e84ee7e9e73eae3ad76bcaca1e636462301d1afe1aa50e1f50633ed1b16756b90aaeba1a3e0870179d7e2dbee41696b175f0a454efee93e5f89591
+  metadata.gz: 6d98b07a55ead442c4edd2e2e3c648d58d26a1343938eee60ff1cf8ee3bfd9b0539c5c650e349eef5d116ef0f5cd095f077a9cd2586cddcd01023e8e7cdb225e
+  data.tar.gz: 22012ddfb97cde8ff78324c599351bd5aa16bac5747208c573def3b0f2d2f47f0e1d55dc83393685a0e54a006be08bab2c15631ed26c940ecd20cd07efb4a86d
data/CHANGELOG.md CHANGED
@@ -1,4 +1,17 @@
-# 0.2.5 (2019-03-04)
+# 0.3.0.pre1 (2019-04-09)
+
+Enhancements:
+
+* Support cuDNN
+  * conv (cudnnConvolution)
+  * conv_transpose (cudnnConvolutionBackwardData)
+  * conv_grad_w (cudnnConvolutionBackwardFilter)
+  * batch_norm (cudnnBatchNormalization)
+  * batch_norm_backward (cudnnBatchNormalizationBackward)
+  * avg_pool and max_pool (cudnnPoolingForward)
+  * avg_pool_backward and max_pool_backward (cudnnPoolingBackward)
+
+# 0.2.5 (2019-03-04)
 
 Enhancements:
 
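The operations listed in the new CHANGELOG entry are exposed as NArray methods backed by the cuDNN calls named in parentheses. A minimal, hypothetical Ruby sketch of how a call might look — the receiver class (`Cumo::SFloat`) and the keyword arguments (`stride:`, `pad:`) are assumptions for illustration; only the method names come from the entry above:

```ruby
require "cumo"

# Hypothetical call shapes; only the method names (conv, max_pool, ...) are
# taken from the CHANGELOG entry above.
x = Cumo::SFloat.new(1, 3, 28, 28).rand   # NCHW input
w = Cumo::SFloat.new(16, 3, 3, 3).rand    # filters: out_ch x in_ch x kh x kw
y = x.conv(w, stride: 1, pad: 1)          # cudnnConvolution under the hood
z = y.max_pool(2, stride: 2)              # cudnnPoolingForward under the hood
```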
data/README.md CHANGED
@@ -22,6 +22,17 @@ export PATH="$CUDA_PATH/bin:$PATH"
 export LIBRARY_PATH="$CUDA_PATH/lib64:$CUDA_PATH/lib:$LIBRARY_PATH"
 ```
 
+To use cuDNN features, install cuDNN and set your environment variables as follows:
+
+```
+export CUDNN_ROOT_DIR=/path/to/cudnn
+export CPATH=$CUDNN_ROOT_DIR/include:$CPATH
+export LD_LIBRARY_PATH=$CUDNN_ROOT_DIR/lib64:$LD_LIBRARY_PATH
+export LIBRARY_PATH=$CUDNN_ROOT_DIR/lib64:$LIBRARY_PATH
+```
+
+FYI: I use [cudnnenv](https://github.com/unnonouno/cudnnenv) to install cudnn under my home directory like `export CUDNN_ROOT_DIR=/home/sonots/.cudnn/active/cuda`.
+
 ## Installation
 
 Add the following line to your Gemfile:
@@ -216,7 +227,7 @@ bundle exec gdb -x run.gdb --args ruby test/narray_test.rb
 
 You may put a breakpoint by calling `cumo_debug_breakpoint()` at C source codes.
 
-### Run tests only a specific line
+### Run tests only a specific line
 
 `--location` option is available as:
 
 ```
data/cumo.gemspec CHANGED
@@ -2,7 +2,7 @@
 lib = File.expand_path("../lib", __FILE__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 
-cumo_version = File.read(File.join(__dir__, "ext/cumo/include/cumo.h")).match(/CUMO_VERSION "([\d.]+)"/)[1]
+cumo_version = File.read(File.join(__dir__, "ext/cumo/include/cumo.h")).match(/CUMO_VERSION "([^"]+)"/)[1]
 numo_narray_version = File.read(File.join(__dir__, "numo-narray-version")).strip
 
 Gem::Specification.new do |spec|
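The gemspec hunk above widens the version-extraction regexp so that pre-release versions such as `0.3.0.pre1` can be read out of `ext/cumo/include/cumo.h`: `[\d.]+` stops at the first letter, so the old pattern fails to match at all. A quick Ruby illustration (the exact `#define` line is an assumed stand-in for what cumo.h contains):

```ruby
header = '#define CUMO_VERSION "0.3.0.pre1"'  # assumed shape of the line in cumo.h

header.match(/CUMO_VERSION "([\d.]+)"/)       # => nil ("pre1" breaks the match,
                                              #    so [1] would raise NoMethodError)
header.match(/CUMO_VERSION "([^"]+)"/)[1]     # => "0.3.0.pre1"
```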
@@ -0,0 +1,80 @@
+#include "cumo/cuda/cudnn.h"
+
+#include <assert.h>
+#include <ruby.h>
+#include "cumo/narray.h"
+#include "cumo/template.h"
+#include "cumo/cuda/runtime.h"
+
+VALUE cumo_cuda_eCUDNNError;
+VALUE cumo_cuda_mCUDNN;
+#define eCUDNNError cumo_cuda_eCUDNNError
+#define mCUDNN cumo_cuda_mCUDNN
+
+#ifdef CUDNN_FOUND
+
+void
+cumo_cuda_cudnn_check_status(cudnnStatus_t status)
+{
+    if (status != CUDNN_STATUS_SUCCESS) {
+        rb_raise(cumo_cuda_eCUDNNError, "%s (error=%d)", cudnnGetErrorString(status), status);
+    }
+}
+
+// Lazily initialize cudnn handle, and cache it
+cudnnHandle_t
+cumo_cuda_cudnn_handle()
+{
+    static cudnnHandle_t *handles = 0; // handle is never destroyed
+    int device;
+    if (handles == 0) {
+        int i;
+        int device_count = cumo_cuda_runtime_get_device_count();
+        handles = malloc(sizeof(cudnnHandle_t) * device_count);
+        for (i = 0; i < device_count; ++i) {
+            handles[i] = 0;
+        }
+    }
+    device = cumo_cuda_runtime_get_device();
+    if (handles[device] == 0) {
+        cudnnCreate(&handles[device]);
+    }
+    return handles[device];
+}
+
+#endif // CUDNN_FOUND
+
+/*
+  Returns availability of cuDNN.
+
+  @return [Boolean] Returns true if cuDNN is available
+*/
+static VALUE
+rb_cudnn_available_p()
+{
+#if CUDNN_FOUND
+    return Qtrue;
+#else
+    return Qfalse;
+#endif
+}
+
+void
+Init_cumo_cuda_cudnn(void)
+{
+    VALUE mCumo = rb_define_module("Cumo");
+    VALUE mCUDA = rb_define_module_under(mCumo, "CUDA");
+
+    /*
+      Document-module: Cumo::CUDNN
+    */
+    mCUDNN = rb_define_module_under(mCUDA, "CUDNN");
+    rb_define_const(mCUDA, "Cudnn", mCUDNN); // alias
+    eCUDNNError = rb_define_class_under(mCUDA, "CUDNNError", rb_eStandardError);
+
+    rb_define_singleton_method(mCUDNN, "available?", RUBY_METHOD_FUNC(rb_cudnn_available_p), 0);
+    rb_define_const(mCUDNN, "CUDNN_POOLING_MAX", INT2NUM(CUDNN_POOLING_MAX));
+    rb_define_const(mCUDNN, "CUDNN_POOLING_MAX_DETERMINISTIC", INT2NUM(CUDNN_POOLING_MAX_DETERMINISTIC));
+    rb_define_const(mCUDNN, "CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING", INT2NUM(CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING));
+    rb_define_const(mCUDNN, "CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING", INT2NUM(CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING));
+}
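For reference, a short Ruby sketch of the surface this new C file defines — the `Cumo::CUDA::CUDNN` module, its `available?` query, the `Cudnn` alias constant, and the exported pooling-mode constants; only `require "cumo"` as the entry point is an assumption:

```ruby
require "cumo"

if Cumo::CUDA::CUDNN.available?
  # Pooling-mode constants registered in Init_cumo_cuda_cudnn above.
  p Cumo::CUDA::CUDNN::CUDNN_POOLING_MAX
  p Cumo::CUDA::Cudnn::CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING  # via the Cudnn alias
else
  warn "cumo was built without cuDNN support (Cumo::CUDA::CUDNNError is still defined)"
end
```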
@@ -0,0 +1,572 @@
+#ifdef CUDNN_FOUND
+
+#include "cumo/cuda/cudnn.h"
+
+#include <assert.h>
+#include <ruby.h>
+#include <cudnn.h>
+#include "cumo/narray.h"
+#include "cumo/template.h"
+#include "cumo/cuda/runtime.h"
+#include "cumo/cuda/memory_pool.h"
+
+#include <unordered_map>
+
+#if defined(__cplusplus)
+extern "C" {
+#if 0
+} /* satisfy cc-mode */
+#endif
+#endif
+
+// cover_all=true is not supported
+size_t
+cumo_cuda_cudnn_GetConvOutDim(
+        size_t in_dim,
+        size_t kernel_size,
+        size_t stride,
+        size_t pad) {
+    int64_t numerator;
+    assert(stride > 0);
+    // if (cover_all) {
+    //     numerator = in_dim + pad * 2 - kernel_size + stride - 1;
+    // } else {
+    numerator = in_dim + pad * 2 - kernel_size;
+    // }
+    if (numerator < 0) {
+        rb_raise(rb_eRuntimeError, "Output size should be positive.");
+    }
+    return (size_t)(numerator / stride + 1);
+}
+
+// cover_all=true is not supported
+size_t
+cumo_cuda_cudnn_GetConvTransposeOutDim(
+        size_t in_dim,
+        size_t kernel_size,
+        size_t stride,
+        size_t pad) {
+    // if (cover_all) {
+    //     return stride * (in_dim - 1) + kernel_size - stride + 1 - 2 * pad;
+    // }
+    int64_t out_size = stride * (in_dim - 1) + kernel_size - 2 * pad;
+    if (out_size < 0) {
+        rb_raise(rb_eRuntimeError, "Output size should be positive.");
+    }
+    return (size_t)out_size;
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_CreateTensorDescriptor(
+        cudnnTensorDescriptor_t *desc,
+        VALUE a, cudnnDataType_t cudnn_dtype) {
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    cumo_narray_t *na;
+    CumoGetNArray(a, na);
+    int ndim = (int)(na->ndim);
+    size_t *shape = na->shape;
+
+    assert(cumo_na_check_contiguous(a) == Qtrue);
+    status = cudnnCreateTensorDescriptor(desc);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+
+    if (ndim == 4) {
+        status = cudnnSetTensor4dDescriptor(
+                *desc, CUDNN_TENSOR_NCHW, cudnn_dtype, shape[0], shape[1], shape[2], shape[3]);
+    }
+    else {
+        int int_shape[CUMO_NA_MAX_DIMENSION];
+        for (int idim = 0; idim < ndim; ++idim) {
+            int_shape[idim] = (int)(shape[idim]);
+        }
+        int int_strides[CUMO_NA_MAX_DIMENSION]; // strides divided by item size
+        int stride = 1;
+        for (int idim = ndim - 1; idim >= 0; --idim) {
+            int_strides[idim] = stride;
+            stride *= int_shape[idim];
+        }
+        status = cudnnSetTensorNdDescriptor(*desc, cudnn_dtype, ndim, int_shape, int_strides);
+    }
+    return status;
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_CreateFilterDescriptor(
+        cudnnFilterDescriptor_t *desc,
+        VALUE a,
+        cudnnDataType_t cudnn_dtype) {
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    cumo_narray_t *na;
+    int ndim;
+    size_t *shape;
+
+    CumoGetNArray(a, na);
+    ndim = (int)(na->ndim);
+    shape = na->shape;
+
+    assert(cumo_na_check_contiguous(a) == Qtrue);
+    status = cudnnCreateFilterDescriptor(desc);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+
+    if (ndim == 4) {
+        status = cudnnSetFilter4dDescriptor(
+                *desc, cudnn_dtype, CUDNN_TENSOR_NCHW, shape[0], shape[1], shape[2], shape[3]);
+    } else {
+        int int_shape[CUMO_NA_MAX_DIMENSION];
+        for (int idim = 0; idim < ndim; ++idim) {
+            int_shape[idim] = (int)(shape[idim]);
+        }
+        status = cudnnSetFilterNdDescriptor(*desc, cudnn_dtype, CUDNN_TENSOR_NCHW, ndim, int_shape);
+    }
+
+    return status;
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_CreateConvolutionDescriptor(
+        cudnnConvolutionDescriptor_t *desc,
+        size_t ndim,
+        int* int_stride,
+        int* int_pad,
+        cudnnDataType_t cudnn_dtype) {
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    int int_dilation[CUMO_NA_MAX_DIMENSION];
+    for (size_t idim = 0; idim < ndim; ++idim) {
+        int_dilation[idim] = 1;
+    }
+
+    status = cudnnCreateConvolutionDescriptor(desc);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+
+    if (ndim == 2) {
+        status = cudnnSetConvolution2dDescriptor(
+                *desc,
+                int_pad[0],
+                int_pad[1],
+                int_stride[0],
+                int_stride[1],
+                int_dilation[0],
+                int_dilation[1],
+                CUDNN_CROSS_CORRELATION,
+                cudnn_dtype);
+    } else {
+        status = cudnnSetConvolutionNdDescriptor(
+                *desc,
+                ndim,
+                int_pad,
+                int_stride,
+                int_dilation,
+                CUDNN_CROSS_CORRELATION,
+                cudnn_dtype);
+    }
+
+    return status;
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_CreatePoolingDescriptor(
+        cudnnPoolingDescriptor_t *desc,
+        cudnnPoolingMode_t mode,
+        size_t ndim,
+        int* int_kernel_size,
+        int* int_stride,
+        int* int_pad) {
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+
+    status = cudnnCreatePoolingDescriptor(desc);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+
+    if (ndim == 2) {
+        status = cudnnSetPooling2dDescriptor(
+                *desc,
+                mode,
+                CUDNN_NOT_PROPAGATE_NAN,
+                int_kernel_size[0],
+                int_kernel_size[1],
+                int_pad[0],
+                int_pad[1],
+                int_stride[0],
+                int_stride[1]);
+    } else {
+        status = cudnnSetPoolingNdDescriptor(
+                *desc,
+                mode,
+                CUDNN_NOT_PROPAGATE_NAN,
+                ndim,
+                int_kernel_size,
+                int_pad,
+                int_stride);
+    }
+
+    return status;
+}
+
+// Borrowed from boost::hash_combine
+//
+// TODO(sonots): hash combine in 64bit
+static void HashCombine(std::size_t& seed, std::size_t hash_value) {
+    seed ^= hash_value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+
+// Partially Borrowed from ChainerX
+struct AlgoCacheKey {
+    size_t ndim; // # of spatial dimensions
+    size_t x_shape[CUMO_NA_MAX_DIMENSION];
+    size_t w_shape[CUMO_NA_MAX_DIMENSION];
+    size_t y_shape[CUMO_NA_MAX_DIMENSION];
+    size_t pad[CUMO_NA_MAX_DIMENSION];
+    size_t stride[CUMO_NA_MAX_DIMENSION];
+    cudnnDataType_t dtype;
+    size_t max_workspace_size;
+
+    bool operator==(const AlgoCacheKey& other) const {
+        if (ndim != other.ndim) return false;
+        if (dtype != other.dtype) return false;
+        if (max_workspace_size != other.max_workspace_size) return false;
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            if (x_shape[idim] != other.x_shape[idim]) return false;
+        }
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            if (w_shape[idim] != other.w_shape[idim]) return false;
+        }
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            if (y_shape[idim] != other.y_shape[idim]) return false;
+        }
+        for (size_t idim = 0; idim < ndim; ++idim) {
+            if (pad[idim] != other.pad[idim]) return false;
+        }
+        for (size_t idim = 0; idim < ndim; ++idim) {
+            if (stride[idim] != other.stride[idim]) return false;
+        }
+        return true;
+    }
+
+    bool operator!=(const AlgoCacheKey& other) const { return !operator==(other); }
+};
+
+struct AlgoCacheKeyHash {
+    using result_type = std::size_t;
+    std::size_t operator()(const AlgoCacheKey& key) const {
+        std::size_t seed = 0;
+        size_t ndim = key.ndim;
+        HashCombine(seed, std::hash<size_t>()(key.ndim));
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            HashCombine(seed, std::hash<size_t>()(key.x_shape[idim]));
+        }
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            HashCombine(seed, std::hash<size_t>()(key.w_shape[idim]));
+        }
+        for (size_t idim = 0; idim < ndim + 2; ++idim) {
+            HashCombine(seed, std::hash<size_t>()(key.y_shape[idim]));
+        }
+        for (size_t idim = 0; idim < ndim; ++idim) {
+            HashCombine(seed, std::hash<size_t>()(key.pad[idim]));
+        }
+        for (size_t idim = 0; idim < ndim; ++idim) {
+            HashCombine(seed, std::hash<size_t>()(key.stride[idim]));
+        }
+        HashCombine(seed, std::hash<int>()((int)(key.dtype)));
+        HashCombine(seed, std::hash<size_t>()(key.max_workspace_size));
+        return seed;
+    }
+};
+
+using FwdAlgoCacheMap = std::unordered_map<AlgoCacheKey, std::pair<cudnnConvolutionFwdAlgo_t, size_t>, AlgoCacheKeyHash>;
+using BwdDataAlgoCacheMap = std::unordered_map<AlgoCacheKey, std::pair<cudnnConvolutionBwdDataAlgo_t, size_t>, AlgoCacheKeyHash>;
+using BwdFilterAlgoCacheMap = std::unordered_map<AlgoCacheKey, std::pair<cudnnConvolutionBwdFilterAlgo_t, size_t>, AlgoCacheKeyHash>;
+
+// TODO: Another cache for another device
+static FwdAlgoCacheMap fwd_algo_cache_map_{};
+static BwdDataAlgoCacheMap bwd_data_algo_cache_map_{};
+static BwdFilterAlgoCacheMap bwd_filter_algo_cache_map_{};
+
+cudnnStatus_t
+cumo_cuda_cudnn_FindConvolutionForwardAlgorithm(
+        cudnnConvolutionFwdAlgoPerf_t *perf_result,
+        cudnnHandle_t handle,
+        cudnnTensorDescriptor_t x_desc,
+        VALUE x,
+        cudnnFilterDescriptor_t w_desc,
+        VALUE w,
+        cudnnConvolutionDescriptor_t conv_desc,
+        cudnnTensorDescriptor_t y_desc,
+        VALUE y,
+        size_t max_workspace_size,
+        int* int_stride,
+        int* int_pad,
+        size_t ndim,
+        cudnnDataType_t cudnn_dtype)
+{
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    cumo_narray_t *nx, *nw, *ny;
+    CumoGetNArray(x, nx);
+    CumoGetNArray(w, nw);
+    CumoGetNArray(y, ny);
+
+    auto key = AlgoCacheKey{};
+    key.ndim = ndim;
+    for (size_t idim = 0; idim < ndim + 2; ++idim) {
+        key.x_shape[idim] = nx->shape[idim];
+        key.w_shape[idim] = nw->shape[idim];
+        key.y_shape[idim] = ny->shape[idim];
+    }
+    for (size_t idim = 0; idim < ndim; ++idim) {
+        key.pad[idim] = int_pad[idim];
+        key.stride[idim] = int_stride[idim];
+    }
+    key.dtype = cudnn_dtype;
+    key.max_workspace_size = max_workspace_size;
+
+    auto& algo_cache_map = fwd_algo_cache_map_;
+    // TODO: thread-safe
+    auto it = algo_cache_map.find(key);
+    if (it != algo_cache_map.end()) {
+        auto pair = it->second;
+        perf_result->algo = pair.first;
+        perf_result->memory = pair.second;
+        return CUDNN_STATUS_SUCCESS;
+    }
+
+    char* x_ptr = cumo_na_get_offset_pointer_for_read(x);
+    char* w_ptr = cumo_na_get_offset_pointer_for_read(w);
+    char* y_ptr = cumo_na_get_offset_pointer_for_read(y);
+
+    char* workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+    int returned_algo_count{};
+    status = cudnnFindConvolutionForwardAlgorithmEx(
+            handle,
+            x_desc,
+            (void*)x_ptr,
+            w_desc,
+            (void*)w_ptr,
+            conv_desc,
+            y_desc,
+            (void*)y_ptr,
+            1, // requested algo count
+            &returned_algo_count,
+            perf_result,
+            (void*)workspace,
+            max_workspace_size);
+    cumo_cuda_runtime_free(workspace);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+    assert(returned_algo_count == 1);
+
+    // TODO: thread-safe
+    algo_cache_map[key] = {perf_result->algo, perf_result->memory};
+    return status;
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_FindConvolutionBackwardDataAlgorithm(
+        cudnnConvolutionBwdDataAlgoPerf_t *perf_result,
+        cudnnHandle_t handle,
+        cudnnFilterDescriptor_t w_desc,
+        VALUE w,
+        cudnnTensorDescriptor_t x_desc,
+        VALUE x,
+        cudnnConvolutionDescriptor_t conv_desc,
+        cudnnTensorDescriptor_t y_desc,
+        VALUE y,
+        size_t max_workspace_size,
+        int* int_stride,
+        int* int_pad,
+        size_t ndim,
+        cudnnDataType_t cudnn_dtype)
+{
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    cumo_narray_t *nx, *nw, *ny;
+    CumoGetNArray(x, nx);
+    CumoGetNArray(w, nw);
+    CumoGetNArray(y, ny);
+
+    auto key = AlgoCacheKey{};
+    key.ndim = ndim;
+    for (size_t idim = 0; idim < ndim + 2; ++idim) {
+        key.x_shape[idim] = nx->shape[idim];
+        key.w_shape[idim] = nw->shape[idim];
+        key.y_shape[idim] = ny->shape[idim];
+    }
+    for (size_t idim = 0; idim < ndim; ++idim) {
+        key.pad[idim] = int_pad[idim];
+        key.stride[idim] = int_stride[idim];
+    }
+    key.dtype = cudnn_dtype;
+    key.max_workspace_size = max_workspace_size;
+
+    auto& algo_cache_map = bwd_data_algo_cache_map_;
+    // TODO: thread-safe
+    auto it = algo_cache_map.find(key);
+    if (it != algo_cache_map.end()) {
+        auto pair = it->second;
+        perf_result->algo = pair.first;
+        perf_result->memory = pair.second;
+        return CUDNN_STATUS_SUCCESS;
+    }
+
+    char* x_ptr = cumo_na_get_offset_pointer_for_read(x);
+    char* w_ptr = cumo_na_get_offset_pointer_for_read(w);
+    char* y_ptr = cumo_na_get_offset_pointer_for_read(y);
+
+    char* workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+    int returned_algo_count{};
+    status = cudnnFindConvolutionBackwardDataAlgorithmEx(
+            handle,
+            w_desc,
+            (void*)w_ptr,
+            x_desc,
+            (void*)x_ptr,
+            conv_desc,
+            y_desc,
+            (void*)y_ptr,
+            1, // requested algo count
+            &returned_algo_count,
+            perf_result,
+            (void*)workspace,
+            max_workspace_size);
+    cumo_cuda_runtime_free(workspace);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+    assert(returned_algo_count == 1);
+
+    // TODO: thread-safe
+    algo_cache_map[key] = {perf_result->algo, perf_result->memory};
+    return status;
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_FindConvolutionBackwardFilterAlgorithm(
+        cudnnConvolutionBwdFilterAlgoPerf_t *perf_result,
+        cudnnHandle_t handle,
+        cudnnTensorDescriptor_t x_desc,
+        VALUE x,
+        cudnnTensorDescriptor_t gy_desc,
+        VALUE gy,
+        cudnnConvolutionDescriptor_t conv_desc,
+        cudnnFilterDescriptor_t gw_desc,
+        VALUE gw,
+        size_t max_workspace_size,
+        int* int_stride,
+        int* int_pad,
+        size_t ndim,
+        cudnnDataType_t cudnn_dtype)
+{
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    cumo_narray_t *nx, *ngy, *ngw;
+    CumoGetNArray(x, nx);
+    CumoGetNArray(gy, ngy);
+    CumoGetNArray(gw, ngw);
+
+    auto key = AlgoCacheKey{};
+    key.ndim = ndim;
+    for (size_t idim = 0; idim < ndim + 2; ++idim) {
+        key.x_shape[idim] = nx->shape[idim];
+        key.w_shape[idim] = ngw->shape[idim];
+        key.y_shape[idim] = ngy->shape[idim];
+    }
+    for (size_t idim = 0; idim < ndim; ++idim) {
+        key.pad[idim] = int_pad[idim];
+        key.stride[idim] = int_stride[idim];
+    }
+    key.dtype = cudnn_dtype;
+    key.max_workspace_size = max_workspace_size;
+
+    auto& algo_cache_map = bwd_filter_algo_cache_map_;
+    // TODO: thread-safe
+    auto it = algo_cache_map.find(key);
+    if (it != algo_cache_map.end()) {
+        auto pair = it->second;
+        perf_result->algo = pair.first;
+        perf_result->memory = pair.second;
+        return CUDNN_STATUS_SUCCESS;
+    }
+
+    char* x_ptr = cumo_na_get_offset_pointer_for_read(x);
+    char* gy_ptr = cumo_na_get_offset_pointer_for_read(gy);
+    char* gw_ptr = cumo_na_get_offset_pointer_for_read(gw);
+
+    char* workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+    int returned_algo_count{};
+    status = cudnnFindConvolutionBackwardFilterAlgorithmEx(
+            handle,
+            x_desc,
+            (void*)x_ptr,
+            gy_desc,
+            (void*)gy_ptr,
+            conv_desc,
+            gw_desc,
+            (void*)gw_ptr,
+            1, // requested algo count
+            &returned_algo_count,
+            perf_result,
+            (void*)workspace,
+            max_workspace_size);
+    cumo_cuda_runtime_free(workspace);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+    assert(returned_algo_count == 1);
+
+    // TODO: thread-safe
+    algo_cache_map[key] = {perf_result->algo, perf_result->memory};
+    return status;
+}
+
+// TODO(sonots): Support other than 4, 5 dimensional arrays by reshaping into 4-dimensional arrays as Chainer does.
+cudnnBatchNormMode_t
+cumo_cuda_cudnn_GetBatchNormMode(size_t ndim, int* axis) {
+    if (ndim == 1 && axis[0] == 0) { // (1, channels, (depth, )height, width)
+        return CUDNN_BATCHNORM_PER_ACTIVATION;
+    }
+    if ((ndim == 3 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3) ||
+        (ndim == 4 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3 && axis[3] == 4)) { // (1, channels, (1, )1, 1)
+        // TODO: Consider CUDNN_BATCHNORM_SPATIAL_PERSISTENT if we can afford to check for overflow, with or without blocking.
+        return CUDNN_BATCHNORM_SPATIAL;
+    }
+    rb_raise(rb_eRuntimeError, "Invalid axis for BatchNorm using cuDNN. Expected 1, 3 or 4 dimensions.");
+}
+
+cudnnStatus_t
+cumo_cuda_cudnn_CreateBNTensorDescriptor(
+        cudnnTensorDescriptor_t *desc,
+        cudnnTensorDescriptor_t x_desc,
+        cudnnBatchNormMode_t mode)
+{
+    cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+    status = cudnnCreateTensorDescriptor(desc);
+    if (status != CUDNN_STATUS_SUCCESS) return status;
+
+    status = cudnnDeriveBNTensorDescriptor(*desc, x_desc, mode);
+    return status;
+}
+
+size_t
+cumo_cuda_cudnn_ReduceShape(
+        size_t *reduced_shape,
+        size_t shape_ndim,
+        size_t *shape,
+        size_t axes_ndim,
+        int *axes,
+        char keepdims) {
+    assert(shape_ndim >= axes_ndim);
+    size_t i_axis = 0;
+    size_t i_shape = 0;
+    for (size_t i = 0; i < shape_ndim; ++i) {
+        if (i_axis < axes_ndim && i == (size_t)axes[i_axis]) {
+            ++i_axis;
+            if (keepdims) {
+                reduced_shape[i_shape++] = 1;
+            }
+        } else {
+            reduced_shape[i_shape++] = shape[i];
+        }
+    }
+    assert(i_axis == axes_ndim);
+    assert(i_shape == shape_ndim - static_cast<int8_t>(!keepdims) * axes_ndim);
+    return i_shape;
+}
+
+#if defined(__cplusplus)
+#if 0
+{ /* satisfy cc-mode */
+#endif
+} /* extern "C" { */
+#endif
+
+#endif // CUDNN_FOUND