nvidia-cudnn-cu13 9.12.0.46__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,669 @@
+ /*
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee. Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users. These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+ /* cudnn_adv : cuDNN's advanced and experimental features. */
+
+ #if !defined(CUDNN_ADV_H_)
+ #define CUDNN_ADV_H_
+
+ #include <stdint.h>
+
+ #include "cudnn_version.h"
+ #include "cudnn_ops.h"
+
+ /* These version numbers are autogenerated, do not edit manually. */
+ #define CUDNN_ADV_MAJOR 9
+ #define CUDNN_ADV_MINOR 12
+ #define CUDNN_ADV_PATCH 0
+
+ #if (CUDNN_ADV_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_MINOR != CUDNN_MINOR) || (CUDNN_ADV_PATCH != CUDNN_PATCHLEVEL)
+ #error Version mismatch in cuDNN ADV INFER!!!
+ #endif
+
+ #if defined(__cplusplus)
+ extern "C" {
+ #endif
+
+ /* BASIC RNN API */
+
+ typedef enum {
+     CUDNN_RNN_ALGO_STANDARD = 0,
+     CUDNN_RNN_ALGO_PERSIST_STATIC = 1,
+     CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2,
+     CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3,
+     CUDNN_RNN_ALGO_COUNT = 4,
+ } cudnnRNNAlgo_t;
+
+ typedef enum {
+     CUDNN_FWD_MODE_INFERENCE = 0,
+     CUDNN_FWD_MODE_TRAINING = 1,
+ } cudnnForwardMode_t;
+
+ typedef enum {
+     CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLU activation */
+     CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */
+     CUDNN_LSTM = 2,     /* LSTM with optional recurrent projection and clipping */
+     CUDNN_GRU = 3,      /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */
+ } cudnnRNNMode_t;
+
+ typedef enum {
+     CUDNN_RNN_NO_BIAS = 0,         /* rnn cell formulas do not use biases */
+     CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */
+     CUDNN_RNN_DOUBLE_BIAS = 2,     /* default, rnn cell formulas use two bias vectors */
+     CUDNN_RNN_SINGLE_REC_BIAS = 3  /* rnn cell formulas use one recurrent bias in recurrent GEMM */
+ } cudnnRNNBiasMode_t;
+
+ typedef enum {
+     CUDNN_UNIDIRECTIONAL = 0, /* single-direction network */
+     CUDNN_BIDIRECTIONAL = 1,  /* outputs are concatenated at each layer */
+ } cudnnDirectionMode_t;
+
+ typedef enum {
+     CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */
+     CUDNN_SKIP_INPUT = 1,   /* fixed identity matrix in the first layer input GEMM */
+ } cudnnRNNInputMode_t;
+
+ typedef enum {
+     CUDNN_RNN_CLIP_NONE = 0,   /* disables LSTM cell clipping */
+     CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell clipping */
+ } cudnnRNNClipMode_t;
+
+ typedef enum {
+     CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0,   /* padded, outer stride from one time-step to the next */
+     CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1,     /* sequence length sorted and packed as in basic RNN api */
+     CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */
+ } cudnnRNNDataLayout_t;
+
+ /* For auxFlags in cudnnSetRNNDescriptor_v8() */
+ #define CUDNN_RNN_PADDED_IO_DISABLED 0
+ #define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0)
+
+ struct cudnnRNNStruct;
+ typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t;
+
+ struct cudnnRNNDataStruct;
+ typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t;
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc);
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc);
+
+ /*
+ * mathPrec in cudnnSetRNNDescriptor_v8() specifies the compute precision.
+ * Compute precision is further modified by mathType, which sets the
+ * preferred option for using NVIDIA Tensor Cores. dataType specifies the
+ * input/output data type and the weight/bias type.
+ */
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
+                          cudnnRNNAlgo_t algo,
+                          cudnnRNNMode_t cellMode,
+                          cudnnRNNBiasMode_t biasMode,
+                          cudnnDirectionMode_t dirMode,
+                          cudnnRNNInputMode_t inputMode,
+                          cudnnDataType_t dataType,
+                          cudnnDataType_t mathPrec,
+                          cudnnMathType_t mathType,
+                          int32_t inputSize,
+                          int32_t hiddenSize,
+                          int32_t projSize,
+                          int32_t numLayers,
+                          cudnnDropoutDescriptor_t dropoutDesc,
+                          uint32_t auxFlags);
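
A minimal usage sketch (editorial, not part of the packaged header) of the long _v8 signature above: configuring a two-layer, unidirectional FP32 LSTM without recurrent projection. The CHECK macro and all sizes are invented, and the zero-rate dropout setup relies on cudnnCreateDropoutDescriptor/cudnnSetDropoutDescriptor from cudnn_ops; passing NULL dropout states with a zero rate is assumed to record only the rate.

    #include <cudnn.h>

    #define CHECK(call)                                \
        do {                                           \
            cudnnStatus_t s_ = (call);                 \
            if (s_ != CUDNN_STATUS_SUCCESS) return s_; \
        } while (0)

    static cudnnStatus_t
    setup_lstm(cudnnHandle_t handle, cudnnRNNDescriptor_t *rnnDesc)
    {
        /* A dropout descriptor is required; a zero rate disables dropout. */
        cudnnDropoutDescriptor_t dropout;
        CHECK(cudnnCreateDropoutDescriptor(&dropout));
        CHECK(cudnnSetDropoutDescriptor(dropout, handle, 0.0f, NULL, 0, 0));

        CHECK(cudnnCreateRNNDescriptor(rnnDesc));
        CHECK(cudnnSetRNNDescriptor_v8(*rnnDesc,
                                       CUDNN_RNN_ALGO_STANDARD,
                                       CUDNN_LSTM,
                                       CUDNN_RNN_DOUBLE_BIAS,
                                       CUDNN_UNIDIRECTIONAL,
                                       CUDNN_LINEAR_INPUT,
                                       CUDNN_DATA_FLOAT,   /* dataType */
                                       CUDNN_DATA_FLOAT,   /* mathPrec */
                                       CUDNN_DEFAULT_MATH, /* mathType */
                                       512,                /* inputSize */
                                       768,                /* hiddenSize */
                                       768,                /* projSize == hiddenSize: no projection */
                                       2,                  /* numLayers */
                                       dropout,
                                       CUDNN_RNN_PADDED_IO_ENABLED));
        return CUDNN_STATUS_SUCCESS;
    }
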
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
+                          cudnnRNNAlgo_t *algo,
+                          cudnnRNNMode_t *cellMode,
+                          cudnnRNNBiasMode_t *biasMode,
+                          cudnnDirectionMode_t *dirMode,
+                          cudnnRNNInputMode_t *inputMode,
+                          cudnnDataType_t *dataType,
+                          cudnnDataType_t *mathPrec,
+                          cudnnMathType_t *mathType,
+                          int32_t *inputSize,
+                          int32_t *hiddenSize,
+                          int32_t *projSize,
+                          int32_t *numLayers,
+                          cudnnDropoutDescriptor_t *dropoutDesc,
+                          uint32_t *auxFlags);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc,
+                    cudnnRNNClipMode_t clipMode,
+                    cudnnNanPropagation_t clipNanOpt,
+                    double lclip,
+                    double rclip);
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnRNNSetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t clipMode, double lclip, double rclip);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc,
+                    cudnnRNNClipMode_t *clipMode,
+                    cudnnNanPropagation_t *clipNanOpt,
+                    double *lclip,
+                    double *rclip);
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnRNNGetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t *clipMode, double *lclip, double *rclip);
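
The _v9 clip calls above drop the clipNanOpt argument of the deprecated _v8 pair; everything else is unchanged. A one-line sketch, assuming rnnDesc describes an LSTM:

    /* Clamp LSTM cell outputs to [-10, 10]; CUDNN_RNN_CLIP_NONE undoes it. */
    cudnnRNNSetClip_v9(rnnDesc, CUDNN_RNN_CLIP_MINMAX, -10.0, 10.0);
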
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch);
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle,
+                           cudnnRNNDescriptor_t rnnDesc,
+                           cudnnForwardMode_t fwdMode,
+                           cudnnRNNDataDescriptor_t xDesc,
+                           size_t *workSpaceSize,
+                           size_t *reserveSpaceSize);
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize);
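
A sketch of the usual sizing sequence before any forward call. handle, rnnDesc, and the input descriptor xDesc are assumed to exist already; cudaMalloc is the CUDA runtime allocator. The reserve space is only needed for CUDNN_FWD_MODE_TRAINING.

    size_t weightSize = 0, workSize = 0, reserveSize = 0;
    void *weights = NULL, *work = NULL, *reserve = NULL;

    cudnnGetRNNWeightSpaceSize(handle, rnnDesc, &weightSize);
    cudnnGetRNNTempSpaceSizes(handle, rnnDesc, CUDNN_FWD_MODE_TRAINING,
                              xDesc, &workSize, &reserveSize);

    cudaMalloc(&weights, weightSize);
    if (workSize > 0)
        cudaMalloc(&work, workSize);
    if (reserveSize > 0) /* zero when sizing for inference */
        cudaMalloc(&reserve, reserveSize);
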
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnGetRNNWeightParams(cudnnHandle_t handle,
+                         cudnnRNNDescriptor_t rnnDesc,
+                         int32_t pseudoLayer,
+                         size_t weightSpaceSize,
+                         const void *weightSpace,
+                         int32_t linLayerID,
+                         cudnnTensorDescriptor_t mDesc,
+                         void **mAddr,
+                         cudnnTensorDescriptor_t bDesc,
+                         void **bAddr);
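
The flat weight buffer can then be inspected matrix by matrix. A sketch, continuing the LSTM above; the loop bound of eight (four input-GEMM and four recurrent-GEMM slots per pseudo-layer for CUDNN_LSTM) is an assumption taken from the legacy RNN documentation, not something this header declares.

    cudnnTensorDescriptor_t mDesc, bDesc;
    cudnnCreateTensorDescriptor(&mDesc);
    cudnnCreateTensorDescriptor(&bDesc);

    for (int32_t linLayerID = 0; linLayerID < 8; ++linLayerID) {
        void *mAddr = NULL, *bAddr = NULL;
        cudnnGetRNNWeightParams(handle, rnnDesc, /*pseudoLayer=*/0,
                                weightSize, weights, linLayerID,
                                mDesc, &mAddr, bDesc, &bAddr);
        /* mAddr and bAddr are device pointers into weightSpace; either may
           come back NULL when no matrix or bias exists for this ID. */
    }
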
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc);
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc);
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
+                           cudnnDataType_t dataType,
+                           cudnnRNNDataLayout_t layout,
+                           int maxSeqLength,
+                           int batchSize,
+                           int vectorSize,
+                           const int seqLengthArray[], /* length of each sequence in the batch */
+                           void *paddingFill);         /* symbol for filling padding position in output */
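
A sketch of describing a padded, batch-major mini-batch of four variable-length sequences. The sizes are invented; paddingFill must point at a value of the descriptor's data type.

    int seqLengthArray[4] = {20, 17, 9, 5}; /* per-sequence lengths */
    float paddingFill = 0.0f;               /* written into padded positions */
    cudnnRNNDataDescriptor_t xDesc;

    cudnnCreateRNNDataDescriptor(&xDesc);
    cudnnSetRNNDataDescriptor(xDesc,
                              CUDNN_DATA_FLOAT,
                              CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED,
                              20,  /* maxSeqLength */
                              4,   /* batchSize */
                              512, /* vectorSize, matching the RNN inputSize */
                              seqLengthArray,
                              &paddingFill);
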
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
+                           cudnnDataType_t *dataType,
+                           cudnnRNNDataLayout_t *layout,
+                           int *maxSeqLength,
+                           int *batchSize,
+                           int *vectorSize,
+                           int arrayLengthRequested,
+                           int seqLengthArray[],
+                           void *paddingFill);
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnRNNForward(cudnnHandle_t handle,
+                 cudnnRNNDescriptor_t rnnDesc,
+                 cudnnForwardMode_t fwdMode,
+                 const int32_t devSeqLengths[],
+                 cudnnRNNDataDescriptor_t xDesc,
+                 const void *x,
+                 cudnnRNNDataDescriptor_t yDesc,
+                 void *y,
+                 cudnnTensorDescriptor_t hDesc,
+                 const void *hx,
+                 void *hy,
+                 cudnnTensorDescriptor_t cDesc,
+                 const void *cx,
+                 void *cy,
+                 size_t weightSpaceSize,
+                 const void *weightSpace,
+                 size_t workSpaceSize,
+                 void *workSpace,
+                 size_t reserveSpaceSize,
+                 void *reserveSpace);
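
Putting the pieces together, an inference-mode sketch. The device buffers (x, y, hx, hy, cx, cy) and the 3-D tensor descriptors hDesc/cDesc (shape [numLayers, batchSize, hiddenSize] for the LSTM above) are assumed to be set up elsewhere; devSeqLengths is the sequence-length array copied to device memory.

    cudnnRNNForward(handle, rnnDesc, CUDNN_FWD_MODE_INFERENCE,
                    devSeqLengths,
                    xDesc, x,      /* input sequence data */
                    yDesc, y,      /* output sequence data */
                    hDesc, hx, hy, /* initial and final hidden state */
                    cDesc, cx, cy, /* LSTM cell state; unused for other cells */
                    weightSize, weights,
                    workSize, work,
                    0, NULL);      /* reserve space is unused at inference */
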
+
+ /* Sequence data descriptor */
+
+ typedef enum {
+     CUDNN_SEQDATA_TIME_DIM = 0,  /* index in time */
+     CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */
+     CUDNN_SEQDATA_BEAM_DIM = 2,  /* index in beam */
+     CUDNN_SEQDATA_VECT_DIM = 3   /* index in vector */
+ } cudnnSeqDataAxis_t;
+
+ struct cudnnSeqDataStruct;
+ typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t CUDNN_DEPRECATED;
+
+ #define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc,
+                           cudnnDataType_t dataType,
+                           int nbDims,
+                           const int dimA[],
+                           const cudnnSeqDataAxis_t axes[],
+                           size_t seqLengthArraySize,
+                           const int seqLengthArray[],
+                           void *paddingFill);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc,
+                           cudnnDataType_t *dataType,
+                           int *nbDims,
+                           int nbDimsRequested,
+                           int dimA[],
+                           cudnnSeqDataAxis_t axes[],
+                           size_t *seqLengthArraySize,
+                           size_t seqLengthSizeRequested,
+                           int seqLengthArray[],
+                           void *paddingFill);
+
+ /* Multi-head Attention */
+
+ /*
+ * Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor().
+ * Use the bitwise OR operator to combine several settings listed below. Additional
+ * minor options can be added here without changing or introducing new API functions.
+ */
+ #define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0         /* multiple Q-s map to a single (K,V) set when beam size > 1 */
+ #define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */
+ #define CUDNN_ATTN_DISABLE_PROJ_BIASES 0         /* no biases in attention input and output projections */
+ #define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1)  /* use biases in attention input and output projections */
+
+ struct cudnnAttnStruct;
+ typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t CUDNN_DEPRECATED;
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
+                        unsigned attnMode,
+                        int nHeads,
+                        double smScaler,
+                        cudnnDataType_t dataType,
+                        cudnnDataType_t computePrec,
+                        cudnnMathType_t mathType,
+                        cudnnDropoutDescriptor_t attnDropoutDesc,
+                        cudnnDropoutDescriptor_t postDropoutDesc,
+                        int qSize,
+                        int kSize,
+                        int vSize,
+                        int qProjSize,
+                        int kProjSize,
+                        int vProjSize,
+                        int oProjSize,
+                        int qoMaxSeqLength,
+                        int kvMaxSeqLength,
+                        int maxBatchSize,
+                        int maxBeamSize);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
+                        unsigned *attnMode,
+                        int *nHeads,
+                        double *smScaler,
+                        cudnnDataType_t *dataType,
+                        cudnnDataType_t *computePrec,
+                        cudnnMathType_t *mathType,
+                        cudnnDropoutDescriptor_t *attnDropoutDesc,
+                        cudnnDropoutDescriptor_t *postDropoutDesc,
+                        int *qSize,
+                        int *kSize,
+                        int *vSize,
+                        int *qProjSize,
+                        int *kProjSize,
+                        int *vProjSize,
+                        int *oProjSize,
+                        int *qoMaxSeqLength,
+                        int *kvMaxSeqLength,
+                        int *maxBatchSize,
+                        int *maxBeamSize);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle,
+                              const cudnnAttnDescriptor_t attnDesc,
+                              size_t *weightSizeInBytes,
+                              size_t *workSpaceSizeInBytes,
+                              size_t *reserveSpaceSizeInBytes);
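
Although this path is deprecated in 9.x, a sketch may still help decode the long argument list: an 8-head FP32 self-attention setup followed by the buffer-size query. All sizes are invented, and passing NULL for both dropout descriptors to mean "no dropout" is an assumption about the legacy API, not something this header states.

    cudnnAttnDescriptor_t attnDesc;
    size_t weightBytes = 0, workBytes = 0, reserveBytes = 0;

    cudnnCreateAttnDescriptor(&attnDesc);
    cudnnSetAttnDescriptor(attnDesc,
                           CUDNN_ATTN_QUERYMAP_ALL_TO_ONE | CUDNN_ATTN_DISABLE_PROJ_BIASES,
                           8,     /* nHeads */
                           0.125, /* smScaler, commonly 1/sqrt(per-head size) */
                           CUDNN_DATA_FLOAT,   /* dataType */
                           CUDNN_DATA_FLOAT,   /* computePrec */
                           CUDNN_DEFAULT_MATH,
                           NULL, NULL,      /* attnDropoutDesc, postDropoutDesc */
                           512, 512, 512,   /* qSize, kSize, vSize */
                           64, 64, 64, 512, /* q/k/v/o projection sizes */
                           128, 128,        /* qoMaxSeqLength, kvMaxSeqLength */
                           32, 1);          /* maxBatchSize, maxBeamSize */
    cudnnGetMultiHeadAttnBuffers(handle, attnDesc,
                                 &weightBytes, &workBytes, &reserveBytes);
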
+
+ typedef enum {
+     CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* input projection weights for 'queries' */
+     CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */
+     CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */
+     CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */
+     CUDNN_MH_ATTN_Q_BIASES = 4,  /* input projection bias tensor for 'queries' */
+     CUDNN_MH_ATTN_K_BIASES = 5,  /* input projection bias for 'keys' */
+     CUDNN_MH_ATTN_V_BIASES = 6,  /* input projection bias for 'values' */
+     CUDNN_MH_ATTN_O_BIASES = 7,  /* output projection biases */
+ } cudnnMultiHeadAttnWeightKind_t;
+
+ #define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors */
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle,
+                              const cudnnAttnDescriptor_t attnDesc,
+                              cudnnMultiHeadAttnWeightKind_t wKind,
+                              size_t weightSizeInBytes,
+                              const void *weights,
+                              cudnnTensorDescriptor_t wDesc,
+                              void **wAddr);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnMultiHeadAttnForward(cudnnHandle_t handle,
+                           const cudnnAttnDescriptor_t attnDesc,
+                           int currIdx,
+                           const int loWinIdx[],
+                           const int hiWinIdx[],
+                           const int devSeqLengthsQO[],
+                           const int devSeqLengthsKV[],
+                           const cudnnSeqDataDescriptor_t qDesc,
+                           const void *queries,
+                           const void *residuals,
+                           const cudnnSeqDataDescriptor_t kDesc,
+                           const void *keys,
+                           const cudnnSeqDataDescriptor_t vDesc,
+                           const void *values,
+                           const cudnnSeqDataDescriptor_t oDesc,
+                           void *out,
+                           size_t weightSizeInBytes,
+                           const void *weights,
+                           size_t workSpaceSizeInBytes,
+                           void *workSpace,
+                           size_t reserveSpaceSizeInBytes,
+                           void *reserveSpace);
+
+ /*
+ * \brief Cross-library version checker.
+ * This function is implemented differently in each sub-library. Each sub-library
+ * checks whether its own version matches that of its dependencies.
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
+ * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
+ */
+ cudnnStatus_t CUDNNWINAPI
+ cudnnAdvVersionCheck(void);
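
A sketch of the intended call site, assuming <stdio.h> and <stdlib.h>: run the check once at startup, before any descriptors are created.

    if (cudnnAdvVersionCheck() != CUDNN_STATUS_SUCCESS) {
        fprintf(stderr, "cudnn_adv does not match its cuDNN dependencies\n");
        exit(EXIT_FAILURE);
    }
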
+
+ typedef enum {
+     CUDNN_WGRAD_MODE_ADD = 0, /* add partial gradients to wgrad output buffers */
+     CUDNN_WGRAD_MODE_SET = 1, /* write partial gradients to wgrad output buffers */
+ } cudnnWgradMode_t;
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnRNNBackwardData_v8(cudnnHandle_t handle,
+                         cudnnRNNDescriptor_t rnnDesc,
+                         const int32_t devSeqLengths[],
+                         cudnnRNNDataDescriptor_t yDesc,
+                         const void *y,
+                         const void *dy,
+                         cudnnRNNDataDescriptor_t xDesc,
+                         void *dx,
+                         cudnnTensorDescriptor_t hDesc,
+                         const void *hx,
+                         const void *dhy,
+                         void *dhx,
+                         cudnnTensorDescriptor_t cDesc,
+                         const void *cx,
+                         const void *dcy,
+                         void *dcx,
+                         size_t weightSpaceSize,
+                         const void *weightSpace,
+                         size_t workSpaceSize,
+                         void *workSpace,
+                         size_t reserveSpaceSize,
+                         void *reserveSpace);
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnRNNBackwardWeights_v8(cudnnHandle_t handle,
+                            cudnnRNNDescriptor_t rnnDesc,
+                            cudnnWgradMode_t addGrad,
+                            const int32_t devSeqLengths[],
+                            cudnnRNNDataDescriptor_t xDesc,
+                            const void *x,
+                            cudnnTensorDescriptor_t hDesc,
+                            const void *hx,
+                            cudnnRNNDataDescriptor_t yDesc,
+                            const void *y,
+                            size_t weightSpaceSize,
+                            void *dweightSpace,
+                            size_t workSpaceSize,
+                            void *workSpace,
+                            size_t reserveSpaceSize,
+                            void *reserveSpace);
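
The three calls compose into one training step, sketched below with the buffers sized earlier (dy, dhy, dcy are incoming gradients; dx, dhx, dcx, dweights are outputs). The ordering matters: the TRAINING-mode forward fills the reserve space that both backward calls consume, and with CUDNN_WGRAD_MODE_ADD the weight-gradient buffer must be zeroed first, done here with cudaMemset.

    /* 1. Forward in training mode populates reserveSpace. */
    cudnnRNNForward(handle, rnnDesc, CUDNN_FWD_MODE_TRAINING, devSeqLengths,
                    xDesc, x, yDesc, y, hDesc, hx, hy, cDesc, cx, cy,
                    weightSize, weights, workSize, work, reserveSize, reserve);

    /* 2. Data gradients: dx, dhx, dcx from dy, dhy, dcy. */
    cudnnRNNBackwardData_v8(handle, rnnDesc, devSeqLengths,
                            yDesc, y, dy, xDesc, dx,
                            hDesc, hx, dhy, dhx,
                            cDesc, cx, dcy, dcx,
                            weightSize, weights, workSize, work,
                            reserveSize, reserve);

    /* 3. Weight gradients; CUDNN_WGRAD_MODE_ADD accumulates into dweights. */
    cudaMemset(dweights, 0, weightSize);
    cudnnRNNBackwardWeights_v8(handle, rnnDesc, CUDNN_WGRAD_MODE_ADD,
                               devSeqLengths, xDesc, x, hDesc, hx, yDesc, y,
                               weightSize, dweights, workSize, work,
                               reserveSize, reserve);
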
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle,
+                                const cudnnAttnDescriptor_t attnDesc,
+                                const int loWinIdx[],
+                                const int hiWinIdx[],
+                                const int devSeqLengthsDQDO[],
+                                const int devSeqLengthsDKDV[],
+                                const cudnnSeqDataDescriptor_t doDesc,
+                                const void *dout,
+                                const cudnnSeqDataDescriptor_t dqDesc,
+                                void *dqueries,
+                                const void *queries,
+                                const cudnnSeqDataDescriptor_t dkDesc,
+                                void *dkeys,
+                                const void *keys,
+                                const cudnnSeqDataDescriptor_t dvDesc,
+                                void *dvalues,
+                                const void *values,
+                                size_t weightSizeInBytes,
+                                const void *weights,
+                                size_t workSpaceSizeInBytes,
+                                void *workSpace,
+                                size_t reserveSpaceSizeInBytes,
+                                void *reserveSpace);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle,
+                                   const cudnnAttnDescriptor_t attnDesc,
+                                   cudnnWgradMode_t addGrad,
+                                   const cudnnSeqDataDescriptor_t qDesc,
+                                   const void *queries,
+                                   const cudnnSeqDataDescriptor_t kDesc,
+                                   const void *keys,
+                                   const cudnnSeqDataDescriptor_t vDesc,
+                                   const void *values,
+                                   const cudnnSeqDataDescriptor_t doDesc,
+                                   const void *dout,
+                                   size_t weightSizeInBytes,
+                                   const void *weights,
+                                   void *dweights,
+                                   size_t workSpaceSizeInBytes,
+                                   void *workSpace,
+                                   size_t reserveSpaceSizeInBytes,
+                                   void *reserveSpace);
+
+ /*
+ * CTC (Connectionist Temporal Classification) loss descriptor create/destroy/set/get functions
+ */
+ /* Input normalization mode for the loss function */
+ typedef enum {
+     CUDNN_LOSS_NORMALIZATION_NONE = 0,
+     CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1,
+ } cudnnLossNormalizationMode_t;
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
+                             cudnnDataType_t compType,
+                             cudnnLossNormalizationMode_t normMode,
+                             cudnnNanPropagation_t gradMode);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
+                              cudnnDataType_t compType,
+                              cudnnLossNormalizationMode_t normMode,
+                              cudnnNanPropagation_t gradMode,
+                              int maxLabelLength);
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnSetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc,
+                              cudnnDataType_t compType,
+                              cudnnLossNormalizationMode_t normMode,
+                              cudnnCTCGradMode_t ctcGradMode,
+                              int maxLabelLength);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
+                             cudnnDataType_t *compType,
+                             cudnnLossNormalizationMode_t *normMode,
+                             cudnnNanPropagation_t *gradMode);
+
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+ cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
+                              cudnnDataType_t *compType,
+                              cudnnLossNormalizationMode_t *normMode,
+                              cudnnNanPropagation_t *gradMode,
+                              int *maxLabelLength);
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnGetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc,
+                              cudnnDataType_t *compType,
+                              cudnnLossNormalizationMode_t *normMode,
+                              cudnnCTCGradMode_t *ctcGradMode,
+                              int *maxLabelLength);
+
+ cudnnStatus_t CUDNNWINAPI
+ cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc);
+
+ /* Returns the CTC costs and gradients, given the probabilities and labels. */
+ cudnnStatus_t CUDNNWINAPI
+ cudnnCTCLoss(
+     cudnnHandle_t handle,
+     const cudnnTensorDescriptor_t probsDesc,     /* tensor descriptor for probabilities; the dimensions are T,N,A
+                                                     (T = number of timing steps, N = mini-batch size, A = alphabet
+                                                     size) */
+     const void *probs,                           /* probabilities after softmax, in GPU memory */
+     const int hostLabels[],                      /* labels, in CPU memory */
+     const int hostLabelLengths[],                /* length of each label, in CPU memory */
+     const int hostInputLengths[],                /* number of timing steps in each batch element, in CPU memory */
+     void *costs,                                 /* the returned CTC costs, in GPU memory */
+     const cudnnTensorDescriptor_t gradientsDesc, /* tensor descriptor for gradients; the dimensions are T,N,A */
+     void *gradients,                             /* the returned CTC gradients, in GPU memory; set to NULL to
+                                                     compute costs only */
+     cudnnCTCLossAlgo_t algo,                     /* selected algorithm; algorithms 0 and 1 are currently supported */
+     cudnnCTCLossDescriptor_t ctcLossDesc,
+     void *workspace,                             /* pointer to the workspace, in GPU memory */
+     size_t workSpaceSizeInBytes);                /* size of the workspace */
+
+ /* Returns the CTC costs and gradients, given the probabilities and labels. */
+ cudnnStatus_t CUDNNWINAPI
+ cudnnCTCLoss_v8(
+     cudnnHandle_t handle,
+     cudnnCTCLossAlgo_t algo,                     /* selected algorithm; algorithms 0 and 1 are currently supported */
+     cudnnCTCLossDescriptor_t ctcLossDesc,
+     const cudnnTensorDescriptor_t probsDesc,     /* tensor descriptor for probabilities; the dimensions are T,N,A
+                                                     (T = number of timing steps, N = mini-batch size, A = alphabet
+                                                     size) */
+     const void *probs,                           /* probabilities after softmax, in GPU memory */
+     const int labels[],                          /* labels, in GPU memory */
+     const int labelLengths[],                    /* length of each label, in GPU memory */
+     const int inputLengths[],                    /* number of timing steps in each batch element, in GPU memory */
+     void *costs,                                 /* the returned CTC costs, in GPU memory */
+     const cudnnTensorDescriptor_t gradientsDesc, /* tensor descriptor for gradients; the dimensions are T,N,A */
+     void *gradients,                             /* the returned CTC gradients, in GPU memory; set to NULL to
+                                                     compute costs only */
+     size_t workSpaceSizeInBytes,                 /* size of the workspace */
+     void *workspace);                            /* pointer to the workspace, in GPU memory */
+
+ /* Returns the workspace size needed for CTC. */
+ cudnnStatus_t CUDNNWINAPI
+ cudnnGetCTCLossWorkspaceSize(
+     cudnnHandle_t handle,
+     const cudnnTensorDescriptor_t probsDesc,     /* tensor descriptor for probabilities; the dimensions are T,N,A
+                                                     (T = number of timing steps, N = mini-batch size, A = alphabet
+                                                     size) */
+     const cudnnTensorDescriptor_t gradientsDesc, /* tensor descriptor for gradients; the dimensions are T,N,A.
+                                                     To compute costs only, set it to NULL */
+     const int *labels,                           /* labels, in CPU memory */
+     const int *labelLengths,                     /* length of each label, in CPU memory */
+     const int *inputLengths,                     /* number of timing steps in each batch element, in CPU memory */
+     cudnnCTCLossAlgo_t algo,                     /* selected algorithm; algorithms 0 and 1 are currently supported */
+     cudnnCTCLossDescriptor_t ctcLossDesc,
+     size_t *sizeInBytes);                        /* pointer to the returned workspace size */
+
+ /* Returns the workspace size needed for CTC. */
+ cudnnStatus_t CUDNNWINAPI
+ cudnnGetCTCLossWorkspaceSize_v8(
+     cudnnHandle_t handle,
+     cudnnCTCLossAlgo_t algo,                     /* selected algorithm; algorithms 0 and 1 are currently supported */
+     cudnnCTCLossDescriptor_t ctcLossDesc,
+     const cudnnTensorDescriptor_t probsDesc,     /* tensor descriptor for probabilities; the dimensions are T,N,A
+                                                     (T = number of timing steps, N = mini-batch size, A = alphabet
+                                                     size) */
+     const cudnnTensorDescriptor_t gradientsDesc, /* tensor descriptor for gradients; the dimensions are T,N,A.
+                                                     To compute costs only, set it to NULL */
+     size_t *sizeInBytes);                        /* pointer to the returned workspace size */
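
A sketch of the _v8 CTC path, which keeps labels and lengths in GPU memory. probsDesc/gradsDesc and the device buffers are assumed to exist with T,N,A dimensions; CUDNN_CTC_LOSS_ALGO_DETERMINISTIC is algorithm 0, and CUDNN_CTC_ZERO_OOB_GRADIENTS is assumed to be a cudnnCTCGradMode_t enumerator defined outside this header.

    cudnnCTCLossDescriptor_t ctcDesc;
    size_t ctcWorkBytes = 0;
    void *ctcWork = NULL;

    cudnnCreateCTCLossDescriptor(&ctcDesc);
    cudnnSetCTCLossDescriptor_v9(ctcDesc, CUDNN_DATA_FLOAT,
                                 CUDNN_LOSS_NORMALIZATION_SOFTMAX,
                                 CUDNN_CTC_ZERO_OOB_GRADIENTS, /* assumed name */
                                 64);                          /* maxLabelLength */

    cudnnGetCTCLossWorkspaceSize_v8(handle, CUDNN_CTC_LOSS_ALGO_DETERMINISTIC,
                                    ctcDesc, probsDesc, gradsDesc, &ctcWorkBytes);
    cudaMalloc(&ctcWork, ctcWorkBytes);

    cudnnCTCLoss_v8(handle, CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, ctcDesc,
                    probsDesc, probs,
                    devLabels, devLabelLengths, devInputLengths, /* GPU memory */
                    costs, gradsDesc, grads,
                    ctcWorkBytes, ctcWork);
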
+
+ #if defined(__cplusplus)
+ }
+ #endif
+
+ #endif /* CUDNN_ADV_H_ */