cumo 0.2.5 → 0.3.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 1b28beaea182d622d304bcb3153e56aa3280993ec079aea44c00b915d1e92b77
- data.tar.gz: 26fc0e1942a444e5f9cb4641b3e36f9985593a10de26188f0a4142e72314d82a
+ metadata.gz: 6fbd39b063f8c40636b699f956ab5ebf4905a58013fd249819845f3dc525f77a
+ data.tar.gz: 66c369f01877aa42e73dba6bcaf5a499f52084024171c7f1b5e561c1da08f7e0
  SHA512:
- metadata.gz: a678cb7965fbbc9febf6b5f2f557f8be34f28c051fc0437a87506d3a067a34778a73b75dbeb56da14fd538062a8454355efd06bb686056db5b4df7cab9c04e86
- data.tar.gz: 30ce98cae4e84ee7e9e73eae3ad76bcaca1e636462301d1afe1aa50e1f50633ed1b16756b90aaeba1a3e0870179d7e2dbee41696b175f0a454efee93e5f89591
+ metadata.gz: 6d98b07a55ead442c4edd2e2e3c648d58d26a1343938eee60ff1cf8ee3bfd9b0539c5c650e349eef5d116ef0f5cd095f077a9cd2586cddcd01023e8e7cdb225e
+ data.tar.gz: 22012ddfb97cde8ff78324c599351bd5aa16bac5747208c573def3b0f2d2f47f0e1d55dc83393685a0e54a006be08bab2c15631ed26c940ecd20cd07efb4a86d
data/CHANGELOG.md CHANGED
@@ -1,4 +1,17 @@
- # 0.2.5 (2019-03-04)
+ # 0.3.0.pre1 (2019-04-09)
+
+ Enhancements:
+
+ * Support cuDNN
+   * conv (cudnnConvolution)
+   * conv_transpose (cudnnConvolutionBackwardData)
+   * conv_grad_w (cudnnConvolutionBackwardFilter)
+   * batch_norm (cudnnBatchNormalization)
+   * batch_norm_backward (cudnnBatchNormalizationBackward)
+   * avg_pool and max_pool (cudnnPoolingForward)
+   * avg_pool_backward and max_pool_backward (cudnnPoolingBackward)
+
+ # 0.2.5 (2019-03-04)

  Enhancements:

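The cuDNN entries above surface as new GPU-backed operations. A minimal sketch of how they might be exercised from Ruby follows; only `Cumo::CUDA::CUDNN.available?` is defined verbatim in this diff, so the `conv` call and its keyword arguments are illustrative assumptions, not the confirmed API.

```
require "cumo"

if Cumo::CUDA::CUDNN.available?
  x = Cumo::SFloat.new(1, 3, 32, 32).seq   # NCHW input batch
  w = Cumo::SFloat.new(8, 3, 3, 3).rand    # out_channels x in_channels x kH x kW filter
  y = x.conv(w, stride: 1, pad: 1)         # hypothetical cuDNN-backed convolution
  p y.shape                                # [1, 8, 32, 32] under these assumptions
end
```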
data/README.md CHANGED
@@ -22,6 +22,17 @@ export PATH="$CUDA_PATH/bin:$PATH"
  export LIBRARY_PATH="$CUDA_PATH/lib64:$CUDA_PATH/lib:$LIBRARY_PATH"
  ```

+ To use cuDNN features, install cuDNN and set your environment variables as follows:
+
+ ```
+ export CUDNN_ROOT_DIR=/path/to/cudnn
+ export CPATH=$CUDNN_ROOT_DIR/include:$CPATH
+ export LD_LIBRARY_PATH=$CUDNN_ROOT_DIR/lib64:$LD_LIBRARY_PATH
+ export LIBRARY_PATH=$CUDNN_ROOT_DIR/lib64:$LIBRARY_PATH
+ ```
+
+ FYI: I use [cudnnenv](https://github.com/unnonouno/cudnnenv) to install cuDNN under my home directory, e.g. `export CUDNN_ROOT_DIR=/home/sonots/.cudnn/active/cuda`.
+
  ## Installation

  Add the following line to your Gemfile:
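After setting those variables and reinstalling the gem, a quick way to confirm the extension picked up cuDNN is the `available?` query added in this release (a sketch; it simply reports whether the extension was compiled with cuDNN found):

```
require "cumo"

puts Cumo::CUDA::CUDNN.available?  # => true if the extension was built against cuDNN
```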
@@ -216,7 +227,7 @@ bundle exec gdb -x run.gdb --args ruby test/narray_test.rb

  You may put a breakpoint by calling `cumo_debug_breakpoint()` at C source codes.

- ### Run tests only a specific line
+ ### Run tests only a specific line
  `--location` option is available as:

  ```
data/cumo.gemspec CHANGED
@@ -2,7 +2,7 @@
  lib = File.expand_path("../lib", __FILE__)
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)

- cumo_version = File.read(File.join(__dir__, "ext/cumo/include/cumo.h")).match(/CUMO_VERSION "([\d.]+)"/)[1]
+ cumo_version = File.read(File.join(__dir__, "ext/cumo/include/cumo.h")).match(/CUMO_VERSION "([^"]+)"/)[1]
  numo_narray_version = File.read(File.join(__dir__, "numo-narray-version")).strip

  Gem::Specification.new do |spec|
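The relaxed capture group is what lets a pre-release version such as `0.3.0.pre1` be read from `cumo.h`: `[\d.]+` only admits digits and dots, so it cannot match a quoted pre-release string at all. A small illustration (the `#define` line is assumed here; the actual contents of `cumo.h` are not part of this diff):

```
header = '#define CUMO_VERSION "0.3.0.pre1"'

header.match(/CUMO_VERSION "([\d.]+)"/)    # => nil -- the "pre1" suffix breaks the match
header.match(/CUMO_VERSION "([^"]+)"/)[1]  # => "0.3.0.pre1"
```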
@@ -0,0 +1,80 @@
+ #include "cumo/cuda/cudnn.h"
+
+ #include <assert.h>
+ #include <ruby.h>
+ #include "cumo/narray.h"
+ #include "cumo/template.h"
+ #include "cumo/cuda/runtime.h"
+
+ VALUE cumo_cuda_eCUDNNError;
+ VALUE cumo_cuda_mCUDNN;
+ #define eCUDNNError cumo_cuda_eCUDNNError
+ #define mCUDNN cumo_cuda_mCUDNN
+
+ #ifdef CUDNN_FOUND
+
+ void
+ cumo_cuda_cudnn_check_status(cudnnStatus_t status)
+ {
+     if (status != CUDNN_STATUS_SUCCESS) {
+         rb_raise(cumo_cuda_eCUDNNError, "%s (error=%d)", cudnnGetErrorString(status), status);
+     }
+ }
+
+ // Lazily initialize cudnn handle, and cache it
+ cudnnHandle_t
+ cumo_cuda_cudnn_handle()
+ {
+     static cudnnHandle_t *handles = 0; // handle is never destroyed
+     int device;
+     if (handles == 0) {
+         int i;
+         int device_count = cumo_cuda_runtime_get_device_count();
+         handles = malloc(sizeof(cudnnHandle_t) * device_count);
+         for (i = 0; i < device_count; ++i) {
+             handles[i] = 0;
+         }
+     }
+     device = cumo_cuda_runtime_get_device();
+     if (handles[device] == 0) {
+         cudnnCreate(&handles[device]);
+     }
+     return handles[device];
+ }
+
+ #endif // CUDNN_FOUND
+
+ /*
+   Returns availability of cuDNN.
+
+   @return [Boolean] Returns true if cuDNN is available
+ */
+ static VALUE
+ rb_cudnn_available_p()
+ {
+ #if CUDNN_FOUND
+     return Qtrue;
+ #else
+     return Qfalse;
+ #endif
+ }
+
+ void
+ Init_cumo_cuda_cudnn(void)
+ {
+     VALUE mCumo = rb_define_module("Cumo");
+     VALUE mCUDA = rb_define_module_under(mCumo, "CUDA");
+
+     /*
+       Document-module: Cumo::CUDNN
+     */
+     mCUDNN = rb_define_module_under(mCUDA, "CUDNN");
+     rb_define_const(mCUDA, "Cudnn", mCUDNN); // alias
+     eCUDNNError = rb_define_class_under(mCUDA, "CUDNNError", rb_eStandardError);
+
+     rb_define_singleton_method(mCUDNN, "available?", RUBY_METHOD_FUNC(rb_cudnn_available_p), 0);
+     rb_define_const(mCUDNN, "CUDNN_POOLING_MAX", INT2NUM(CUDNN_POOLING_MAX));
+     rb_define_const(mCUDNN, "CUDNN_POOLING_MAX_DETERMINISTIC", INT2NUM(CUDNN_POOLING_MAX_DETERMINISTIC));
+     rb_define_const(mCUDNN, "CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING", INT2NUM(CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING));
+     rb_define_const(mCUDNN, "CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING", INT2NUM(CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING));
+ }
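For orientation, the Ruby-visible surface registered by `Init_cumo_cuda_cudnn` above is small: the `Cumo::CUDA::CUDNN` module (aliased as `Cumo::CUDA::Cudnn`), the `Cumo::CUDA::CUDNNError` exception raised by `cumo_cuda_cudnn_check_status`, the `available?` query, and four pooling-mode constants. A sketch, assuming the extension was built with cuDNN:

```
require "cumo"

p Cumo::CUDA::CUDNN.available?
p Cumo::CUDA::CUDNN::CUDNN_POOLING_MAX  # integer value of the cuDNN enum
# Failing cuDNN calls surface as Cumo::CUDA::CUDNNError (a StandardError subclass).
```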
@@ -0,0 +1,572 @@
+ #ifdef CUDNN_FOUND
+
+ #include "cumo/cuda/cudnn.h"
+
+ #include <assert.h>
+ #include <ruby.h>
+ #include <cudnn.h>
+ #include "cumo/narray.h"
+ #include "cumo/template.h"
+ #include "cumo/cuda/runtime.h"
+ #include "cumo/cuda/memory_pool.h"
+
+ #include <unordered_map>
+
+ #if defined(__cplusplus)
+ extern "C" {
+ #if 0
+ } /* satisfy cc-mode */
+ #endif
+ #endif
+
+ // cover_all=true is not supported
+ size_t
+ cumo_cuda_cudnn_GetConvOutDim(
+         size_t in_dim,
+         size_t kernel_size,
+         size_t stride,
+         size_t pad) {
+     int64_t numerator;
+     assert(stride > 0);
+     // if (cover_all) {
+     //     numerator = in_dim + pad * 2 - kernel_size + stride - 1;
+     // } else {
+     numerator = in_dim + pad * 2 - kernel_size;
+     // }
+     if (numerator < 0) {
+         rb_raise(rb_eRuntimeError, "Output size should be positive.");
+     }
+     return (size_t)(numerator / stride + 1);
+ }
+
+ // cover_all=true is not supported
+ size_t
+ cumo_cuda_cudnn_GetConvTransposeOutDim(
+         size_t in_dim,
+         size_t kernel_size,
+         size_t stride,
+         size_t pad) {
+     // if (cover_all) {
+     //     return stride * (in_dim - 1) + kernel_size - stride + 1 - 2 * pad;
+     // }
+     int64_t out_size = stride * (in_dim - 1) + kernel_size - 2 * pad;
+     if (out_size < 0) {
+         rb_raise(rb_eRuntimeError, "Output size should be positive.");
+     }
+     return (size_t)out_size;
+ }
+
+ cudnnStatus_t
+ cumo_cuda_cudnn_CreateTensorDescriptor(
+         cudnnTensorDescriptor_t *desc,
+         VALUE a, cudnnDataType_t cudnn_dtype) {
+     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+     cumo_narray_t *na;
+     CumoGetNArray(a, na);
+     int ndim = (int)(na->ndim);
+     size_t *shape = na->shape;
+
+     assert(cumo_na_check_contiguous(a) == Qtrue);
+     status = cudnnCreateTensorDescriptor(desc);
+     if (status != CUDNN_STATUS_SUCCESS) return status;
+
+     if (ndim == 4) {
+         status = cudnnSetTensor4dDescriptor(
+                 *desc, CUDNN_TENSOR_NCHW, cudnn_dtype, shape[0], shape[1], shape[2], shape[3]);
+     }
+     else {
+         int int_shape[CUMO_NA_MAX_DIMENSION];
+         for (int idim = 0; idim < ndim; ++idim) {
+             int_shape[idim] = (int)(shape[idim]);
+         }
+         int int_strides[CUMO_NA_MAX_DIMENSION]; // strides divided by item size
+         int stride = 1;
+         for (int idim = ndim - 1; idim >= 0; --idim) {
+             int_strides[idim] = stride;
+             stride *= int_shape[idim];
+         }
+         status = cudnnSetTensorNdDescriptor(*desc, cudnn_dtype, ndim, int_shape, int_strides);
+     }
+     return status;
+ }
+
+ cudnnStatus_t
+ cumo_cuda_cudnn_CreateFilterDescriptor(
+         cudnnFilterDescriptor_t *desc,
+         VALUE a,
+         cudnnDataType_t cudnn_dtype) {
+     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+     cumo_narray_t *na;
+     int ndim;
+     size_t *shape;
+
+     CumoGetNArray(a, na);
+     ndim = (int)(na->ndim);
+     shape = na->shape;
+
+     assert(cumo_na_check_contiguous(a) == Qtrue);
+     status = cudnnCreateFilterDescriptor(desc);
+     if (status != CUDNN_STATUS_SUCCESS) return status;
+
+     if (ndim == 4) {
+         status = cudnnSetFilter4dDescriptor(
+                 *desc, cudnn_dtype, CUDNN_TENSOR_NCHW, shape[0], shape[1], shape[2], shape[3]);
+     } else {
+         int int_shape[CUMO_NA_MAX_DIMENSION];
+         for (int idim = 0; idim < ndim; ++idim) {
+             int_shape[idim] = (int)(shape[idim]);
+         }
+         status = cudnnSetFilterNdDescriptor(*desc, cudnn_dtype, CUDNN_TENSOR_NCHW, ndim, int_shape);
+     }
+
+     return status;
+ }
+
+ cudnnStatus_t
+ cumo_cuda_cudnn_CreateConvolutionDescriptor(
+         cudnnConvolutionDescriptor_t *desc,
+         size_t ndim,
+         int* int_stride,
+         int* int_pad,
+         cudnnDataType_t cudnn_dtype) {
+     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+     int int_dilation[CUMO_NA_MAX_DIMENSION];
+     for (size_t idim = 0; idim < ndim; ++idim) {
+         int_dilation[idim] = 1;
+     }
+
+     status = cudnnCreateConvolutionDescriptor(desc);
+     if (status != CUDNN_STATUS_SUCCESS) return status;
+
+     if (ndim == 2) {
+         status = cudnnSetConvolution2dDescriptor(
+                 *desc,
+                 int_pad[0],
+                 int_pad[1],
+                 int_stride[0],
+                 int_stride[1],
+                 int_dilation[0],
+                 int_dilation[1],
+                 CUDNN_CROSS_CORRELATION,
+                 cudnn_dtype);
+     } else {
+         status = cudnnSetConvolutionNdDescriptor(
+                 *desc,
+                 ndim,
+                 int_pad,
+                 int_stride,
+                 int_dilation,
+                 CUDNN_CROSS_CORRELATION,
+                 cudnn_dtype);
+     }
+
+     return status;
+ }
+
+ cudnnStatus_t
+ cumo_cuda_cudnn_CreatePoolingDescriptor(
+         cudnnPoolingDescriptor_t *desc,
+         cudnnPoolingMode_t mode,
+         size_t ndim,
+         int* int_kernel_size,
+         int* int_stride,
+         int* int_pad) {
+     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+
+     status = cudnnCreatePoolingDescriptor(desc);
+     if (status != CUDNN_STATUS_SUCCESS) return status;
+
+     if (ndim == 2) {
+         status = cudnnSetPooling2dDescriptor(
+                 *desc,
+                 mode,
+                 CUDNN_NOT_PROPAGATE_NAN,
+                 int_kernel_size[0],
+                 int_kernel_size[1],
+                 int_pad[0],
+                 int_pad[1],
+                 int_stride[0],
+                 int_stride[1]);
+     } else {
+         status = cudnnSetPoolingNdDescriptor(
+                 *desc,
+                 mode,
+                 CUDNN_NOT_PROPAGATE_NAN,
+                 ndim,
+                 int_kernel_size,
+                 int_pad,
+                 int_stride);
+     }
+
+     return status;
+ }
+
+ // Borrowed from boost::hash_combine
+ //
+ // TODO(sonots): hash combine in 64bit
+ static void HashCombine(std::size_t& seed, std::size_t hash_value) {
+     seed ^= hash_value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+ }
+
+ // Partially Borrowed from ChainerX
+ struct AlgoCacheKey {
+     size_t ndim;  // # of spatial dimensions
+     size_t x_shape[CUMO_NA_MAX_DIMENSION];
+     size_t w_shape[CUMO_NA_MAX_DIMENSION];
+     size_t y_shape[CUMO_NA_MAX_DIMENSION];
+     size_t pad[CUMO_NA_MAX_DIMENSION];
+     size_t stride[CUMO_NA_MAX_DIMENSION];
+     cudnnDataType_t dtype;
+     size_t max_workspace_size;
+
+     bool operator==(const AlgoCacheKey& other) const {
+         if (ndim != other.ndim) return false;
+         if (dtype != other.dtype) return false;
+         if (max_workspace_size != other.max_workspace_size) return false;
+         for (size_t idim = 0; idim < ndim + 2; ++idim) {
+             if (x_shape[idim] != other.x_shape[idim]) return false;
+         }
+         for (size_t idim = 0; idim < ndim + 2; ++idim) {
+             if (w_shape[idim] != other.w_shape[idim]) return false;
+         }
+         for (size_t idim = 0; idim < ndim + 2; ++idim) {
+             if (y_shape[idim] != other.y_shape[idim]) return false;
+         }
+         for (size_t idim = 0; idim < ndim; ++idim) {
+             if (pad[idim] != other.pad[idim]) return false;
+         }
+         for (size_t idim = 0; idim < ndim; ++idim) {
+             if (stride[idim] != other.stride[idim]) return false;
+         }
+         return true;
+     }
+
+     bool operator!=(const AlgoCacheKey& other) const { return !operator==(other); }
+ };
+
+ struct AlgoCacheKeyHash {
+     using result_type = std::size_t;
+     std::size_t operator()(const AlgoCacheKey& key) const {
+         std::size_t seed = 0;
+         size_t ndim = key.ndim;
+         HashCombine(seed, std::hash<size_t>()(key.ndim));
+         for (size_t idim = 0; idim < ndim + 2; ++idim) {
+             HashCombine(seed, std::hash<size_t>()(key.x_shape[idim]));
+         }
+         for (size_t idim = 0; idim < ndim + 2; ++idim) {
+             HashCombine(seed, std::hash<size_t>()(key.w_shape[idim]));
+         }
+         for (size_t idim = 0; idim < ndim + 2; ++idim) {
+             HashCombine(seed, std::hash<size_t>()(key.y_shape[idim]));
+         }
+         for (size_t idim = 0; idim < ndim; ++idim) {
+             HashCombine(seed, std::hash<size_t>()(key.pad[idim]));
+         }
+         for (size_t idim = 0; idim < ndim; ++idim) {
+             HashCombine(seed, std::hash<size_t>()(key.stride[idim]));
+         }
+         HashCombine(seed, std::hash<int>()((int)(key.dtype)));
+         HashCombine(seed, std::hash<size_t>()(key.max_workspace_size));
+         return seed;
+     }
+ };
+
+ using FwdAlgoCacheMap = std::unordered_map<AlgoCacheKey, std::pair<cudnnConvolutionFwdAlgo_t, size_t>, AlgoCacheKeyHash>;
+ using BwdDataAlgoCacheMap = std::unordered_map<AlgoCacheKey, std::pair<cudnnConvolutionBwdDataAlgo_t, size_t>, AlgoCacheKeyHash>;
+ using BwdFilterAlgoCacheMap = std::unordered_map<AlgoCacheKey, std::pair<cudnnConvolutionBwdFilterAlgo_t, size_t>, AlgoCacheKeyHash>;
+
+ // TODO: Another cache for another device
+ static FwdAlgoCacheMap fwd_algo_cache_map_{};
+ static BwdDataAlgoCacheMap bwd_data_algo_cache_map_{};
+ static BwdFilterAlgoCacheMap bwd_filter_algo_cache_map_{};
+
+ cudnnStatus_t
+ cumo_cuda_cudnn_FindConvolutionForwardAlgorithm(
+         cudnnConvolutionFwdAlgoPerf_t *perf_result,
+         cudnnHandle_t handle,
+         cudnnTensorDescriptor_t x_desc,
+         VALUE x,
+         cudnnFilterDescriptor_t w_desc,
+         VALUE w,
+         cudnnConvolutionDescriptor_t conv_desc,
+         cudnnTensorDescriptor_t y_desc,
+         VALUE y,
+         size_t max_workspace_size,
+         int* int_stride,
+         int* int_pad,
+         size_t ndim,
+         cudnnDataType_t cudnn_dtype)
+ {
+     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+     cumo_narray_t *nx, *nw, *ny;
+     CumoGetNArray(x, nx);
+     CumoGetNArray(w, nw);
+     CumoGetNArray(y, ny);
+
+     auto key = AlgoCacheKey{};
+     key.ndim = ndim;
+     for (size_t idim = 0; idim < ndim + 2; ++idim) {
+         key.x_shape[idim] = nx->shape[idim];
+         key.w_shape[idim] = nw->shape[idim];
+         key.y_shape[idim] = ny->shape[idim];
+     }
+     for (size_t idim = 0; idim < ndim; ++idim) {
+         key.pad[idim] = int_pad[idim];
+         key.stride[idim] = int_stride[idim];
+     }
+     key.dtype = cudnn_dtype;
+     key.max_workspace_size = max_workspace_size;
+
+     auto& algo_cache_map = fwd_algo_cache_map_;
+     // TODO: thread-safe
+     auto it = algo_cache_map.find(key);
+     if (it != algo_cache_map.end()) {
+         auto pair = it->second;
+         perf_result->algo = pair.first;
+         perf_result->memory = pair.second;
+         return CUDNN_STATUS_SUCCESS;
+     }
+
+     char* x_ptr = cumo_na_get_offset_pointer_for_read(x);
+     char* w_ptr = cumo_na_get_offset_pointer_for_read(w);
+     char* y_ptr = cumo_na_get_offset_pointer_for_read(y);
+
+     char* workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+     int returned_algo_count{};
+     status = cudnnFindConvolutionForwardAlgorithmEx(
+             handle,
+             x_desc,
+             (void*)x_ptr,
+             w_desc,
+             (void*)w_ptr,
+             conv_desc,
+             y_desc,
+             (void*)y_ptr,
+             1, // requested algo count,
+             &returned_algo_count,
+             perf_result,
+             (void*)workspace,
+             max_workspace_size);
+     cumo_cuda_runtime_free(workspace);
+     if (status != CUDNN_STATUS_SUCCESS) return status;
+     assert(returned_algo_count == 1);
+
+     // TODO: thread-safe
+     algo_cache_map[key] = {perf_result->algo, perf_result->memory};
+     return status;
+ }
+
+ cudnnStatus_t
+ cumo_cuda_cudnn_FindConvolutionBackwardDataAlgorithm(
+         cudnnConvolutionBwdDataAlgoPerf_t *perf_result,
+         cudnnHandle_t handle,
+         cudnnFilterDescriptor_t w_desc,
+         VALUE w,
+         cudnnTensorDescriptor_t x_desc,
+         VALUE x,
+         cudnnConvolutionDescriptor_t conv_desc,
+         cudnnTensorDescriptor_t y_desc,
+         VALUE y,
+         size_t max_workspace_size,
+         int* int_stride,
+         int* int_pad,
+         size_t ndim,
+         cudnnDataType_t cudnn_dtype)
+ {
+     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+     cumo_narray_t *nx, *nw, *ny;
+     CumoGetNArray(x, nx);
+     CumoGetNArray(w, nw);
+     CumoGetNArray(y, ny);
+
+     auto key = AlgoCacheKey{};
+     key.ndim = ndim;
+     for (size_t idim = 0; idim < ndim + 2; ++idim) {
+         key.x_shape[idim] = nx->shape[idim];
+         key.w_shape[idim] = nw->shape[idim];
+         key.y_shape[idim] = ny->shape[idim];
+     }
+     for (size_t idim = 0; idim < ndim; ++idim) {
+         key.pad[idim] = int_pad[idim];
+         key.stride[idim] = int_stride[idim];
+     }
+     key.dtype = cudnn_dtype;
+     key.max_workspace_size = max_workspace_size;
+
+     auto& algo_cache_map = bwd_data_algo_cache_map_;
+     // TODO: thread-safe
+     auto it = algo_cache_map.find(key);
+     if (it != algo_cache_map.end()) {
+         auto pair = it->second;
+         perf_result->algo = pair.first;
+         perf_result->memory = pair.second;
+         return CUDNN_STATUS_SUCCESS;
+     }
+
+     char* x_ptr = cumo_na_get_offset_pointer_for_read(x);
+     char* w_ptr = cumo_na_get_offset_pointer_for_read(w);
+     char* y_ptr = cumo_na_get_offset_pointer_for_read(y);
+
+     char* workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+     int returned_algo_count{};
+     status = cudnnFindConvolutionBackwardDataAlgorithmEx(
+             handle,
+             w_desc,
+             (void*)w_ptr,
+             x_desc,
+             (void*)x_ptr,
+             conv_desc,
+             y_desc,
+             (void*)y_ptr,
+             1, // requested algo count,
+             &returned_algo_count,
+             perf_result,
+             (void*)workspace,
+             max_workspace_size);
+     cumo_cuda_runtime_free(workspace);
+     if (status != CUDNN_STATUS_SUCCESS) return status;
+     assert(returned_algo_count == 1);
+
+     // TODO: thread-safe
+     algo_cache_map[key] = {perf_result->algo, perf_result->memory};
+     return status;
+ }
+
+ cudnnStatus_t
+ cumo_cuda_cudnn_FindConvolutionBackwardFilterAlgorithm(
+         cudnnConvolutionBwdFilterAlgoPerf_t *perf_result,
+         cudnnHandle_t handle,
+         cudnnTensorDescriptor_t x_desc,
+         VALUE x,
+         cudnnTensorDescriptor_t gy_desc,
+         VALUE gy,
+         cudnnConvolutionDescriptor_t conv_desc,
+         cudnnFilterDescriptor_t gw_desc,
+         VALUE gw,
+         size_t max_workspace_size,
+         int* int_stride,
+         int* int_pad,
+         size_t ndim,
+         cudnnDataType_t cudnn_dtype)
+ {
+     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+     cumo_narray_t *nx, *ngy, *ngw;
+     CumoGetNArray(x, nx);
+     CumoGetNArray(gy, ngy);
+     CumoGetNArray(gw, ngw);
+
+     auto key = AlgoCacheKey{};
+     key.ndim = ndim;
+     for (size_t idim = 0; idim < ndim + 2; ++idim) {
+         key.x_shape[idim] = nx->shape[idim];
+         key.w_shape[idim] = ngw->shape[idim];
+         key.y_shape[idim] = ngy->shape[idim];
+     }
+     for (size_t idim = 0; idim < ndim; ++idim) {
+         key.pad[idim] = int_pad[idim];
+         key.stride[idim] = int_stride[idim];
+     }
+     key.dtype = cudnn_dtype;
+     key.max_workspace_size = max_workspace_size;
+
+     auto& algo_cache_map = bwd_filter_algo_cache_map_;
+     // TODO: thread-safe
+     auto it = algo_cache_map.find(key);
+     if (it != algo_cache_map.end()) {
+         auto pair = it->second;
+         perf_result->algo = pair.first;
+         perf_result->memory = pair.second;
+         return CUDNN_STATUS_SUCCESS;
+     }
+
+     char* x_ptr = cumo_na_get_offset_pointer_for_read(x);
+     char* gy_ptr = cumo_na_get_offset_pointer_for_read(gy);
+     char* gw_ptr = cumo_na_get_offset_pointer_for_read(gw);
+
+     char* workspace = cumo_cuda_runtime_malloc(max_workspace_size);
+     int returned_algo_count{};
+     status = cudnnFindConvolutionBackwardFilterAlgorithmEx(
+             handle,
+             x_desc,
+             (void*)x_ptr,
+             gy_desc,
+             (void*)gy_ptr,
+             conv_desc,
+             gw_desc,
+             (void*)gw_ptr,
+             1, // requested algo count,
+             &returned_algo_count,
+             perf_result,
+             (void*)workspace,
+             max_workspace_size);
+     cumo_cuda_runtime_free(workspace);
+     if (status != CUDNN_STATUS_SUCCESS) return status;
+     assert(returned_algo_count == 1);
+
+     // TODO: thread-safe
+     algo_cache_map[key] = {perf_result->algo, perf_result->memory};
+     return status;
+ }
+
+ // TODO(sonots): Support other than 4, 5 dimensional arrays by reshaping into 4-dimensional arrays as Chainer does.
+ cudnnBatchNormMode_t
+ cumo_cuda_cudnn_GetBatchNormMode(size_t ndim, int* axis) {
+     if (ndim == 1 && axis[0] == 0) { // (1, channels, (depth, )height, width)
+         return CUDNN_BATCHNORM_PER_ACTIVATION;
+     }
+     if ((ndim == 3 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3) ||
+         (ndim == 4 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3 && axis[3] == 4)) { // (1, channels, (1, )1, 1)
+         // TODO: Consider CUDNN_BATCHNORM_SPATIAL_PERSISTENT if we can afford to check for overflow, with or without blocking.
+         return CUDNN_BATCHNORM_SPATIAL;
+     }
+     rb_raise(rb_eRuntimeError, "Invalid axis for BatchNorm using cuDNN. Expected 1, 3 or 4 dimensions.");
+ }
+
+ cudnnStatus_t
+ cumo_cuda_cudnn_CreateBNTensorDescriptor(
+         cudnnTensorDescriptor_t *desc,
+         cudnnTensorDescriptor_t x_desc,
+         cudnnBatchNormMode_t mode)
+ {
+     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
+     status = cudnnCreateTensorDescriptor(desc);
+     if (status != CUDNN_STATUS_SUCCESS) return status;
+
+     status = cudnnDeriveBNTensorDescriptor(*desc, x_desc, mode);
+     return status;
+ }
+
+ size_t
+ cumo_cuda_cudnn_ReduceShape(
+         size_t *reduced_shape,
+         size_t shape_ndim,
+         size_t *shape,
+         size_t axes_ndim,
+         int *axes,
+         char keepdims) {
+     assert(shape_ndim >= axes_ndim);
+     size_t i_axis = 0;
+     size_t i_shape = 0;
+     for (size_t i = 0; i < shape_ndim; ++i) {
+         if (i_axis < axes_ndim && i == (size_t)axes[i_axis]) {
+             ++i_axis;
+             if (keepdims) {
+                 reduced_shape[i_shape++] = 1;
+             }
+         } else {
+             reduced_shape[i_shape++] = shape[i];
+         }
+     }
+     assert(i_axis == axes_ndim);
+     assert(i_shape == shape_ndim - static_cast<int8_t>(!keepdims) * axes_ndim);
+     return i_shape;
+ }
+
+ #if defined(__cplusplus)
+ #if 0
+ { /* satisfy cc-mode */
+ #endif
+ } /* extern "C" { */
+ #endif
+
+ #endif // CUDNN_FOUND
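As a sanity check of `cumo_cuda_cudnn_GetConvOutDim` above: with cover_all unsupported, the output size reduces to the usual `(in + 2*pad - kernel) / stride + 1`. A small Ruby illustration of the same arithmetic:

```
def conv_out_dim(in_dim, kernel, stride, pad)
  (in_dim + 2 * pad - kernel) / stride + 1
end

conv_out_dim(32, 3, 1, 1)  # => 32 ("same" padding keeps the spatial size)
conv_out_dim(32, 2, 2, 0)  # => 16 (stride-2 downsampling)
```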