triton-windows 3.2.0.post12__cp313-cp313-win_amd64.whl → 3.3.0a0.post12__cp313-cp313-win_amd64.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- triton/_C/libtriton.pyd +0 -0
- triton/__init__.py +3 -3
- triton/_internal_testing.py +59 -4
- triton/_utils.py +35 -0
- triton/backends/amd/compiler.py +121 -74
- triton/backends/amd/driver.py +77 -43
- triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +28 -49
- triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +35 -9
- triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +761 -284
- triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +9 -3
- triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +1391 -0
- triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +3 -3
- triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +44 -0
- triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +288 -0
- triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +110 -14
- triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +504 -103
- triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +2 -1
- triton/backends/amd/include/hip/amd_detail/host_defines.h +4 -0
- triton/backends/amd/include/hip/hip_ext.h +4 -2
- triton/backends/amd/include/hip/hip_fp8.h +33 -0
- triton/backends/amd/include/hip/hip_runtime_api.h +375 -33
- triton/backends/amd/include/hip/hip_version.h +3 -3
- triton/backends/amd/include/hip/hiprtc.h +25 -25
- triton/backends/amd/include/hsa/amd_hsa_elf.h +40 -14
- triton/backends/amd/include/hsa/hsa.h +11 -2
- triton/backends/amd/include/hsa/hsa_api_trace.h +30 -17
- triton/backends/amd/include/hsa/hsa_api_trace_version.h +68 -0
- triton/backends/amd/include/hsa/hsa_ext_amd.h +83 -27
- triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +46 -46
- triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +416 -0
- triton/backends/amd/include/roctracer/hip_ostream_ops.h +84 -4
- triton/backends/amd/include/roctracer/hsa_ostream_ops.h +260 -0
- triton/backends/amd/include/roctracer/hsa_prof_str.h +51 -19
- triton/backends/amd/lib/asanrtl.bc +0 -0
- triton/backends/compiler.py +25 -225
- triton/backends/driver.py +7 -2
- triton/backends/nvidia/bin/ptxas.exe +0 -0
- triton/backends/nvidia/compiler.py +135 -90
- triton/backends/nvidia/driver.c +0 -1
- triton/backends/nvidia/driver.py +135 -49
- triton/backends/nvidia/include/cuda.h +2162 -241
- triton/backends/nvidia/lib/x64/cuda.lib +0 -0
- triton/compiler/__init__.py +2 -2
- triton/compiler/code_generator.py +334 -231
- triton/compiler/compiler.py +77 -66
- triton/language/__init__.py +22 -5
- triton/language/core.py +448 -74
- triton/language/extra/cuda/_experimental_tma.py +3 -5
- triton/language/math.py +1 -1
- triton/language/random.py +2 -1
- triton/language/semantic.py +206 -52
- triton/language/standard.py +35 -18
- triton/runtime/_allocation.py +32 -0
- triton/runtime/autotuner.py +27 -32
- triton/runtime/build.py +1 -48
- triton/runtime/cache.py +6 -6
- triton/runtime/errors.py +10 -0
- triton/runtime/interpreter.py +179 -45
- triton/runtime/jit.py +149 -190
- triton/testing.py +39 -11
- triton/tools/compile.py +27 -20
- triton/tools/{compile.c → extra/cuda/compile.c} +1 -0
- triton/tools/mxfp.py +301 -0
- {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/METADATA +5 -2
- {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/RECORD +68 -59
- {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/top_level.txt +2 -0
- /triton/tools/{compile.h → extra/cuda/compile.h} +0 -0
- {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/WHEEL +0 -0
--- triton/backends/nvidia/include/cuda.h (3.2.0.post12)
+++ triton/backends/nvidia/include/cuda.h (3.3.0a0.post12)
@@ -89,6 +89,7 @@ typedef uint64_t cuuint64_t;
 #define cuDeviceTotalMem cuDeviceTotalMem_v2
 #define cuCtxCreate cuCtxCreate_v2
 #define cuCtxCreate_v3 cuCtxCreate_v3
+#define cuCtxCreate_v4 cuCtxCreate_v4
 #define cuModuleGetGlobal cuModuleGetGlobal_v2
 #define cuMemGetInfo cuMemGetInfo_v2
 #define cuMemAlloc cuMemAlloc_v2
@@ -115,6 +116,8 @@ typedef uint64_t cuuint64_t;
 #define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2)
 #define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2)
 #define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2)
+#define cuMemcpyBatchAsync __CUDA_API_PTSZ(cuMemcpyBatchAsync)
+#define cuMemcpy3DBatchAsync __CUDA_API_PTSZ(cuMemcpy3DBatchAsync)
 #define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2)
 #define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2)
 #define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2)
@@ -183,7 +186,9 @@ typedef uint64_t cuuint64_t;
 #define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority)
 #define cuStreamGetId __CUDA_API_PTSZ(cuStreamGetId)
 #define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags)
+#define cuStreamGetDevice __CUDA_API_PTSZ(cuStreamGetDevice)
 #define cuStreamGetCtx __CUDA_API_PTSZ(cuStreamGetCtx)
+#define cuStreamGetCtx_v2 __CUDA_API_PTSZ(cuStreamGetCtx_v2)
 #define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent)
 #define cuStreamEndCapture __CUDA_API_PTSZ(cuStreamEndCapture)
 #define cuStreamIsCapturing __CUDA_API_PTSZ(cuStreamIsCapturing)
@@ -202,6 +207,7 @@ typedef uint64_t cuuint64_t;
 #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources)
 #define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources)
 
+
 #define cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel)
 
 #define cuSignalExternalSemaphoresAsync __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync)
@@ -223,6 +229,8 @@ typedef uint64_t cuuint64_t;
 
 #endif
 
+#define cuMemBatchDecompressAsync __CUDA_API_PTSZ(cuMemBatchDecompressAsync)
+
 /**
  * \file cuda.h
  * \brief Header file for the CUDA Toolkit application programming interface.
@@ -244,7 +252,7 @@ typedef uint64_t cuuint64_t;
 /**
  * CUDA API version number
  */
-#define CUDA_VERSION
+#define CUDA_VERSION 12080
 
 #ifdef __cplusplus
 extern "C" {
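
The bundled driver header now reports CUDA_VERSION 12080, i.e. CUDA 12.8. A minimal sketch, assuming only the standard version-guard idiom, of gating code on the bumped header (nothing here is specific to triton-windows):

    #include <cuda.h>

    #if CUDA_VERSION >= 12080
    /* The additions in this diff (cuCtxCreate_v4, the batched memcpy entry
       points, green contexts, ...) are declared by this header. */
    #endif
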
@@ -263,7 +271,7 @@ typedef CUdeviceptr_v2 CUdeviceptr; /**< CUDA device po
 
 typedef int CUdevice_v1; /**< CUDA device */
 typedef CUdevice_v1 CUdevice; /**< CUDA device */
-typedef struct CUctx_st *CUcontext; /**<
+typedef struct CUctx_st *CUcontext; /**< A regular context handle */
 typedef struct CUmod_st *CUmodule; /**< CUDA module */
 typedef struct CUfunc_st *CUfunction; /**< CUDA function */
 typedef struct CUlib_st *CUlibrary; /**< CUDA library */
@@ -289,6 +297,12 @@ typedef struct CUuserObject_st *CUuserObject; /**< CUDA user obje
 typedef cuuint64_t CUgraphConditionalHandle; /**< CUDA graph conditional handle */
 typedef struct CUgraphDeviceUpdatableNode_st *CUgraphDeviceNode; /**< CUDA graph device node handle */
 typedef struct CUasyncCallbackEntry_st *CUasyncCallbackHandle; /**< CUDA async notification callback handle */
+/*!
+ * \typedef typedef struct CUgreenCtx_st* CUgreenCtx
+ * A green context handle. This handle can be used safely from only one CPU thread at a time.
+ * Created via ::cuGreenCtxCreate
+ */
+typedef struct CUgreenCtx_st *CUgreenCtx;
 
 #ifndef CU_UUID_HAS_BEEN_DEFINED
 #define CU_UUID_HAS_BEEN_DEFINED
@@ -617,41 +631,58 @@ typedef void (*CUasyncCallback)(CUasyncNotificationInfo *info, void *userData, C
  * Array formats
  */
 typedef enum CUarray_format_enum {
-    CU_AD_FORMAT_UNSIGNED_INT8
-    CU_AD_FORMAT_UNSIGNED_INT16
-    CU_AD_FORMAT_UNSIGNED_INT32
-    CU_AD_FORMAT_SIGNED_INT8
-    CU_AD_FORMAT_SIGNED_INT16
-    CU_AD_FORMAT_SIGNED_INT32
-    CU_AD_FORMAT_HALF
-    CU_AD_FORMAT_FLOAT
-    CU_AD_FORMAT_NV12
-    CU_AD_FORMAT_UNORM_INT8X1
-    CU_AD_FORMAT_UNORM_INT8X2
-    CU_AD_FORMAT_UNORM_INT8X4
-    CU_AD_FORMAT_UNORM_INT16X1
-    CU_AD_FORMAT_UNORM_INT16X2
-    CU_AD_FORMAT_UNORM_INT16X4
-    CU_AD_FORMAT_SNORM_INT8X1
-    CU_AD_FORMAT_SNORM_INT8X2
-    CU_AD_FORMAT_SNORM_INT8X4
-    CU_AD_FORMAT_SNORM_INT16X1
-    CU_AD_FORMAT_SNORM_INT16X2
-    CU_AD_FORMAT_SNORM_INT16X4
-    CU_AD_FORMAT_BC1_UNORM
-    CU_AD_FORMAT_BC1_UNORM_SRGB
-    CU_AD_FORMAT_BC2_UNORM
-    CU_AD_FORMAT_BC2_UNORM_SRGB
-    CU_AD_FORMAT_BC3_UNORM
-    CU_AD_FORMAT_BC3_UNORM_SRGB
-    CU_AD_FORMAT_BC4_UNORM
-    CU_AD_FORMAT_BC4_SNORM
-    CU_AD_FORMAT_BC5_UNORM
-    CU_AD_FORMAT_BC5_SNORM
-    CU_AD_FORMAT_BC6H_UF16
-    CU_AD_FORMAT_BC6H_SF16
-    CU_AD_FORMAT_BC7_UNORM
-    CU_AD_FORMAT_BC7_UNORM_SRGB
+    CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
+    CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */
+    CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */
+    CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */
+    CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */
+    CU_AD_FORMAT_FLOAT = 0x20, /**< 32-bit floating point */
+    CU_AD_FORMAT_NV12 = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */
+    CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, /**< 1 channel signed 8-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, /**< 2 channel signed 8-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, /**< 4 channel signed 8-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT16X1 = 0xc9, /**< 1 channel signed 16-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT16X2 = 0xca, /**< 2 channel signed 16-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT16X4 = 0xcb, /**< 4 channel signed 16-bit normalized integer */
+    CU_AD_FORMAT_BC1_UNORM = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */
+    CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/
+    CU_AD_FORMAT_BC2_UNORM = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */
+    CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding*/
+    CU_AD_FORMAT_BC3_UNORM = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */
+    CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding*/
+    CU_AD_FORMAT_BC4_UNORM = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */
+    CU_AD_FORMAT_BC4_SNORM = 0x98, /**< 1 channel signed normalized block-compressed (BC4 compression) format */
+    CU_AD_FORMAT_BC5_UNORM = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */
+    CU_AD_FORMAT_BC5_SNORM = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */
+    CU_AD_FORMAT_BC6H_UF16 = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */
+    CU_AD_FORMAT_BC6H_SF16 = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */
+    CU_AD_FORMAT_BC7_UNORM = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */
+    CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */
+    CU_AD_FORMAT_P010 = 0x9f, /**< 10-bit YUV planar format, with 4:2:0 sampling */
+    CU_AD_FORMAT_P016 = 0xa1, /**< 16-bit YUV planar format, with 4:2:0 sampling */
+    CU_AD_FORMAT_NV16 = 0xa2, /**< 8-bit YUV planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_P210 = 0xa3, /**< 10-bit YUV planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_P216 = 0xa4, /**< 16-bit YUV planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_YUY2 = 0xa5, /**< 2 channel, 8-bit YUV packed planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_Y210 = 0xa6, /**< 2 channel, 10-bit YUV packed planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_Y216 = 0xa7, /**< 2 channel, 16-bit YUV packed planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_AYUV = 0xa8, /**< 4 channel, 8-bit YUV packed planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_Y410 = 0xa9, /**< 10-bit YUV packed planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_Y416 = 0xb1, /**< 4 channel, 12-bit YUV packed planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_Y444_PLANAR8 = 0xb2, /**< 3 channel 8-bit YUV planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_Y444_PLANAR10 = 0xb3, /**< 3 channel 10-bit YUV planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4, /**< 3 channel 8-bit YUV semi-planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5, /**< 3 channel 16-bit YUV semi-planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50, /**< 4 channel unorm R10G10B10A2 RGB format */
+    CU_AD_FORMAT_MAX = 0x7FFFFFFF
 } CUarray_format;
 
 /**
@@ -811,11 +842,17 @@ typedef enum CUdevice_attribute_enum {
     CU_DEVICE_ATTRIBUTE_TENSOR_MAP_ACCESS_SUPPORTED = 127, /**< Device supports accessing memory using Tensor Map. */
     CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED = 128, /**< Device supports exporting memory to a fabric handle with cuMemExportToShareableHandle() or requested with cuMemCreate() */
     CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS = 129, /**< Device supports unified function pointers. */
-    CU_DEVICE_ATTRIBUTE_NUMA_CONFIG = 130,
-    CU_DEVICE_ATTRIBUTE_NUMA_ID = 131,
+    CU_DEVICE_ATTRIBUTE_NUMA_CONFIG = 130, /**< NUMA configuration of a device: value is of type ::CUdeviceNumaConfig enum */
+    CU_DEVICE_ATTRIBUTE_NUMA_ID = 131, /**< NUMA node ID of the GPU memory */
     CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED = 132, /**< Device supports switch multicast and reduction operations. */
     CU_DEVICE_ATTRIBUTE_MPS_ENABLED = 133, /**< Indicates if contexts created on this device will be shared via MPS */
     CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID = 134, /**< NUMA ID of the host node closest to the device. Returns -1 when system does not support NUMA. */
+    CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED = 135, /**< Device supports CIG with D3D12. */
+    CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK = 136, /**< The returned valued shall be interpreted as a bitmask, where the individual bits are described by the ::CUmemDecompressAlgorithm enum. */
+    CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_MAXIMUM_LENGTH = 137, /**< The returned valued is the maximum length in bytes of a single decompress operation that is allowed. */
+    CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID = 139, /**< The combined 16-bit PCI device ID and 16-bit PCI vendor ID. */
+    CU_DEVICE_ATTRIBUTE_GPU_PCI_SUBSYSTEM_ID = 140, /**< The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID. */
+    CU_DEVICE_ATTRIBUTE_HOST_NUMA_MULTINODE_IPC_SUPPORTED = 143, /**< Device supports HOST_NUMA location IPC between nodes in a multi-node system. */
     CU_DEVICE_ATTRIBUTE_MAX
 } CUdevice_attribute;
 
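
A short sketch of querying one of the attributes added above; `dev` is an assumed, already-initialized CUdevice, and the split of the packed 16/16-bit value assumes the device ID occupies the high half (the packing order is not spelled out in this header):

    int packed = 0;
    CUresult rc = cuDeviceGetAttribute(&packed, CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID, dev);
    if (rc == CUDA_SUCCESS) {
        unsigned int pciDeviceId = ((unsigned int)packed >> 16) & 0xFFFFu; /* assumed high half */
        unsigned int pciVendorId = (unsigned int)packed & 0xFFFFu;         /* assumed low half */
    }
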
@@ -860,6 +897,7 @@ typedef enum CUpointer_attribute_enum {
     CU_POINTER_ATTRIBUTE_MAPPING_SIZE = 18, /**< Size of the actual underlying mapping that the pointer belongs to **/
     CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR = 19, /**< The start address of the mapping that the pointer belongs to **/
     CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID = 20 /**< A process-wide unique id corresponding to the physical allocation the pointer belongs to **/
+    , CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE = 21 /**< Returns in \p *data a boolean that indicates whether the pointer points to memory that is capable to be used for hardware accelerated decompression. */
 } CUpointer_attribute;
 
 /**
@@ -1449,27 +1487,36 @@ typedef enum CUjit_option_enum
  */
 typedef enum CUjit_target_enum
 {
-    CU_TARGET_COMPUTE_30 = 30,
-    CU_TARGET_COMPUTE_32 = 32,
-    CU_TARGET_COMPUTE_35 = 35,
-    CU_TARGET_COMPUTE_37 = 37,
-    CU_TARGET_COMPUTE_50 = 50,
-    CU_TARGET_COMPUTE_52 = 52,
-    CU_TARGET_COMPUTE_53 = 53,
-    CU_TARGET_COMPUTE_60 = 60,
-    CU_TARGET_COMPUTE_61 = 61,
-    CU_TARGET_COMPUTE_62 = 62,
-    CU_TARGET_COMPUTE_70 = 70,
-    CU_TARGET_COMPUTE_72 = 72,
-    CU_TARGET_COMPUTE_75 = 75,
-    CU_TARGET_COMPUTE_80 = 80,
-    CU_TARGET_COMPUTE_86 = 86,
-    CU_TARGET_COMPUTE_87 = 87,
-    CU_TARGET_COMPUTE_89 = 89,
-    CU_TARGET_COMPUTE_90 = 90,
+    CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */
+    CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */
+    CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */
+    CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */
+    CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */
+    CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */
+    CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */
+    CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/
+    CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/
+    CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/
+    CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/
+    CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2.*/
+    CU_TARGET_COMPUTE_75 = 75, /**< Compute device class 7.5.*/
+    CU_TARGET_COMPUTE_80 = 80, /**< Compute device class 8.0.*/
+    CU_TARGET_COMPUTE_86 = 86, /**< Compute device class 8.6.*/
+    CU_TARGET_COMPUTE_87 = 87, /**< Compute device class 8.7.*/
+    CU_TARGET_COMPUTE_89 = 89, /**< Compute device class 8.9.*/
+    CU_TARGET_COMPUTE_90 = 90, /**< Compute device class 9.0.*/
+    CU_TARGET_COMPUTE_100 = 100, /**< Compute device class 10.0.*/
+    CU_TARGET_COMPUTE_101 = 101, /**< Compute device class 10.1.*/
+    CU_TARGET_COMPUTE_120 = 120, /**< Compute device class 12.0.*/
 
     /**< Compute device class 9.0. with accelerated features.*/
     CU_TARGET_COMPUTE_90A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_90,
+    /**< Compute device class 10.0. with accelerated features.*/
+    CU_TARGET_COMPUTE_100A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_100,
+    /**< Compute device class 10.1 with accelerated features.*/
+    CU_TARGET_COMPUTE_101A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_101,
+    /**< Compute device class 12.0. with accelerated features.*/
+    CU_TARGET_COMPUTE_120A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_120,
 } CUjit_target;
 
 /**
@@ -1585,6 +1632,9 @@ typedef enum CUlimit_enum {
     CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */
     CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint */
     CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x06, /**< A size in bytes for L2 persisting lines cache size */
+    CU_LIMIT_SHMEM_SIZE = 0x07, /**< A maximum size in bytes of shared memory available to CUDA kernels on a CIG context. Can only be queried, cannot be set */
+    CU_LIMIT_CIG_ENABLED = 0x08, /**< A non-zero value indicates this CUDA context is a CIG-enabled context. Can only be queried, cannot be set */
+    CU_LIMIT_CIG_SHMEM_FALLBACK_ENABLED = 0x09, /**< When set to zero, CUDA will fail to launch a kernel on a CIG context, instead of using the fallback path, if the kernel uses more shared memory than available */
     CU_LIMIT_MAX
 } CUlimit;
 
@@ -1748,8 +1798,9 @@ typedef struct CUDA_HOST_NODE_PARAMS_v2_st {
  * Conditional node types
  */
 typedef enum CUgraphConditionalNodeType_enum {
-    CU_GRAPH_COND_TYPE_IF = 0, /**< Conditional 'if' Node. Body executed
+    CU_GRAPH_COND_TYPE_IF = 0, /**< Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If \p size == 2, an optional ELSE graph is created and this is executed if the condition is zero. */
     CU_GRAPH_COND_TYPE_WHILE = 1, /**< Conditional 'while' Node. Body executed repeatedly while condition value is non-zero. */
+    CU_GRAPH_COND_TYPE_SWITCH = 2, /**< Conditional 'switch' Node. Body[n] is executed once, where 'n' is the value of the condition. If the condition does not match a body index, no body is launched. */
 } CUgraphConditionalNodeType;
 
 /**
@@ -1760,7 +1811,8 @@ typedef struct CUDA_CONDITIONAL_NODE_PARAMS {
                            Handles must be created in advance of creating the node
                            using ::cuGraphConditionalHandleCreate. */
     CUgraphConditionalNodeType type; /**< Type of conditional node. */
-    unsigned int size; /**< Size of graph output array.
+    unsigned int size; /**< Size of graph output array. Allowed values are 1 for CU_GRAPH_COND_TYPE_WHILE, 1 or 2
+                            for CU_GRAPH_COND_TYPE_IF, or any value greater than zero for CU_GRAPH_COND_TYPE_SWITCH. */
     CUgraph *phGraph_out; /**< CUDA-owned array populated with conditional node child graphs during creation of the node.
                                Valid for the lifetime of the conditional node.
                                The contents of the graph(s) are subject to the following constraints:
@@ -1770,7 +1822,17 @@ typedef struct CUDA_CONDITIONAL_NODE_PARAMS {
                                - All kernels, including kernels in nested conditionals or child graphs at any level,
                                  must belong to the same CUDA context.
 
-                               These graphs may be populated using graph node creation APIs or ::cuStreamBeginCaptureToGraph.
+                               These graphs may be populated using graph node creation APIs or ::cuStreamBeginCaptureToGraph.
+
+                               CU_GRAPH_COND_TYPE_IF:
+                               phGraph_out[0] is executed when the condition is non-zero. If \p size == 2, phGraph_out[1] will
+                               be executed when the condition is zero.
+                               CU_GRAPH_COND_TYPE_WHILE:
+                               phGraph_out[0] is executed as long as the condition is non-zero.
+                               CU_GRAPH_COND_TYPE_SWITCH:
+                               phGraph_out[n] is executed when the condition is equal to n. If the condition >= \p size,
+                               no body graph is executed.
+                               */
     CUcontext ctx; /**< Context on which to run the node. Must match context used to create the handle and all body nodes. */
 } CUDA_CONDITIONAL_NODE_PARAMS;
 
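
Putting the new if/else semantics together, a hedged sketch of creating a two-body conditional node. The `conditional` member of CUgraphNodeParams, the `handle` field name, and the `cuGraphConditionalHandleCreate`/`cuGraphAddNode` call shapes are assumed from the driver-API graph interfaces; they are not shown in this diff. `graph` and `ctx` are an assumed CUgraph and CUcontext, and `<string.h>`/`<cuda.h>` are assumed included:

    CUgraphConditionalHandle handle;
    cuGraphConditionalHandleCreate(&handle, graph, ctx, /*defaultLaunchValue=*/0, 0); /* assumed signature */

    CUgraphNodeParams p;
    memset(&p, 0, sizeof(p));
    p.type = CU_GRAPH_NODE_TYPE_CONDITIONAL;
    p.conditional.handle = handle;                /* assumed field name */
    p.conditional.type = CU_GRAPH_COND_TYPE_IF;
    p.conditional.size = 2;                       /* body[0] = IF branch, body[1] = optional ELSE */
    p.conditional.ctx = ctx;

    CUgraphNode node;
    cuGraphAddNode(&node, graph, /*dependencies=*/NULL, /*numDependencies=*/0, &p);
    /* p.conditional.phGraph_out[0] / [1] now hold the body graphs to populate. */
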
@@ -1790,23 +1852,22 @@ typedef enum CUgraphNodeType_enum {
     CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT = 9, /**< External semaphore wait node */
     CU_GRAPH_NODE_TYPE_MEM_ALLOC = 10,/**< Memory Allocation Node */
     CU_GRAPH_NODE_TYPE_MEM_FREE = 11,/**< Memory Free Node */
-    CU_GRAPH_NODE_TYPE_BATCH_MEM_OP = 12
-    ,
+    CU_GRAPH_NODE_TYPE_BATCH_MEM_OP = 12,/**< Batch MemOp Node */
     CU_GRAPH_NODE_TYPE_CONDITIONAL = 13 /**< Conditional Node
-
+
         May be used to implement a conditional execution path or loop
         inside of a graph. The graph(s) contained within the body of the conditional node
         can be selectively executed or iterated upon based on the value of a conditional
         variable.
-
+
         Handles must be created in advance of creating the node
         using ::cuGraphConditionalHandleCreate.
-
+
         The following restrictions apply to graphs which contain conditional nodes:
         The graph cannot be used in a child node.
         Only one instantiation of the graph may exist at any point in time.
         The graph cannot be cloned.
-
+
         To set the control value, supply a default value when creating the handle and/or
         call ::cudaGraphSetConditional from device code.*/
 } CUgraphNodeType;
@@ -1878,7 +1939,8 @@ typedef enum CUgraphInstantiateResult_enum
     CUDA_GRAPH_INSTANTIATE_ERROR = 1, /**< Instantiation failed for an unexpected reason which is described in the return value of the function */
     CUDA_GRAPH_INSTANTIATE_INVALID_STRUCTURE = 2, /**< Instantiation failed due to invalid structure, such as cycles */
     CUDA_GRAPH_INSTANTIATE_NODE_OPERATION_NOT_SUPPORTED = 3, /**< Instantiation for device launch failed because the graph contained an unsupported operation */
-    CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED = 4
+    CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED = 4, /**< Instantiation for device launch failed due to the nodes belonging to different contexts */
+    CUDA_GRAPH_INSTANTIATE_CONDITIONAL_HANDLE_UNUSED = 5, /**< One or more conditional handles are not associated with conditional nodes */
 } CUgraphInstantiateResult;
 
 /**
@@ -2004,6 +2066,42 @@ typedef enum CUlaunchAttributeID_enum {
                                               ::CUlaunchAttributeValue::memSyncDomainMap. */
     , CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN = 10 /**< Valid for streams, graph nodes, launches. See
                                               ::CUlaunchAttributeValue::memSyncDomain. */
+    , CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION = 11 /**< Valid for graph nodes, launches. Set
+                                              ::CUlaunchAttributeValue::preferredClusterDim
+                                              to allow the kernel launch to specify a preferred substitute
+                                              cluster dimension. Blocks may be grouped according to either
+                                              the dimensions specified with this attribute (grouped into a
+                                              "preferred substitute cluster"), or the one specified with
+                                              ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION attribute (grouped
+                                              into a "regular cluster"). The cluster dimensions of a
+                                              "preferred substitute cluster" shall be an integer multiple
+                                              greater than zero of the regular cluster dimensions. The
+                                              device will attempt - on a best-effort basis - to group
+                                              thread blocks into preferred clusters over grouping them
+                                              into regular clusters. When it deems necessary (primarily
+                                              when the device temporarily runs out of physical resources
+                                              to launch the larger preferred clusters), the device may
+                                              switch to launch the regular clusters instead to attempt to
+                                              utilize as much of the physical device resources as possible.
+                                              <br>
+                                              Each type of cluster will have its enumeration / coordinate
+                                              setup as if the grid consists solely of its type of cluster.
+                                              For example, if the preferred substitute cluster dimensions
+                                              double the regular cluster dimensions, there might be
+                                              simultaneously a regular cluster indexed at (1,0,0), and a
+                                              preferred cluster indexed at (1,0,0). In this example, the
+                                              preferred substitute cluster (1,0,0) replaces regular
+                                              clusters (2,0,0) and (3,0,0) and groups their blocks.
+                                              <br>
+                                              This attribute will only take effect when a regular cluster
+                                              dimension has been specified. The preferred substitute
+                                              cluster dimension must be an integer multiple greater than
+                                              zero of the regular cluster dimension and must divide the
+                                              grid. It must also be no more than `maxBlocksPerCluster`, if
+                                              it is set in the kernel's `__launch_bounds__`. Otherwise it
+                                              must be less than the maximum value the driver can support.
+                                              Otherwise, setting this attribute to a value physically
+                                              unable to fit on any particular device is permitted. */
     , CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT = 12 /**< Valid for launches. Set
                                               ::CUlaunchAttributeValue::launchCompletionEvent to record the
                                               event.
@@ -2054,7 +2152,14 @@ typedef enum CUlaunchAttributeID_enum {
                                               from within the graph, the graph must be uploaded with ::cuGraphUpload before it
                                               is launched. For such a graph, if host-side executable graph updates are made to the
                                               device-updatable nodes, the graph must be uploaded before it is launched again. */
-
+    , CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 14 /**< Valid for launches. On devices where the L1 cache and shared memory use the
+                                              same hardware resources, setting ::CUlaunchAttributeValue::sharedMemCarveout to a
+                                              percentage between 0-100 signals the CUDA driver to set the shared memory carveout
+                                              preference, in percent of the total shared memory for that kernel launch.
+                                              This attribute takes precedence over ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT.
+                                              This is only a hint, and the CUDA driver can choose a different configuration if
+                                              required for the launch. */
+#if defined(__CUDA_API_VERSION_INTERNAL) && !defined(__CUDA_API_VERSION_INTERNAL_ODR)
     , CU_LAUNCH_ATTRIBUTE_MAX
 #endif
 } CUlaunchAttributeID;
@@ -2092,27 +2197,64 @@ typedef union CUlaunchAttributeValue_union {
                                       scheduling policy preference for the kernel. */
     int programmaticStreamSerializationAllowed; /**< Value of launch attribute
                                       ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION. */
+    /**
+     * Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
+     * with the following fields:
+     * - \p CUevent event - Event to fire when all blocks trigger it.
+     * - \p Event record flags, see ::cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
+     * - \p triggerAtBlockStart - If this is set to non-0, each block launch will automatically trigger the event.
+     */
     struct {
-        CUevent event;
-        int flags;
-
-
-
+        CUevent event;
+        int flags;
+        int triggerAtBlockStart;
+    } programmaticEvent;
+    /**
+     * Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT
+     * with the following fields:
+     * - \p CUevent event - Event to fire when the last block launches
+     * - \p int flags; - Event record flags, see ::cuEventRecordWithFlags. Does not accept ::CU_EVENT_RECORD_EXTERNAL.
+     */
     struct {
-        CUevent event;
-        int flags;
-    } launchCompletionEvent;
+        CUevent event;
+        int flags;
+    } launchCompletionEvent;
     int priority; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PRIORITY. Execution priority of the kernel. */
     CUlaunchMemSyncDomainMap memSyncDomainMap; /**< Value of launch attribute
                                       ::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP. See
                                       ::CUlaunchMemSyncDomainMap. */
     CUlaunchMemSyncDomain memSyncDomain; /**< Value of launch attribute
                                       ::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN. See::CUlaunchMemSyncDomain */
+    /**
+     * Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION
+     * that represents the desired preferred cluster dimensions for the kernel.
+     * Opaque type with the following fields:
+     * - \p x - The X dimension of the preferred cluster, in blocks. Must
+     *          be a divisor of the grid X dimension, and must be a
+     *          multiple of the \p x field of ::CUlaunchAttributeValue::clusterDim.
+     * - \p y - The Y dimension of the preferred cluster, in blocks. Must
+     *          be a divisor of the grid Y dimension, and must be a
+     *          multiple of the \p y field of ::CUlaunchAttributeValue::clusterDim.
+     * - \p z - The Z dimension of the preferred cluster, in blocks. Must be
+     *          equal to the \p z field of ::CUlaunchAttributeValue::clusterDim.
+     */
+    struct {
+        unsigned int x;
+        unsigned int y;
+        unsigned int z;
+    } preferredClusterDim;
 
+    /**
+     * Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE.
+     * with the following fields:
+     * - \p int deviceUpdatable - Whether or not the resulting kernel node should be device-updatable.
+     * - \p CUgraphDeviceNode devNode - Returns a handle to pass to the various device-side update functions.
+     */
     struct {
-        int deviceUpdatable;
-        CUgraphDeviceNode devNode;
-    } deviceUpdatableKernelNode;
+        int deviceUpdatable;
+        CUgraphDeviceNode devNode;
+    } deviceUpdatableKernelNode;
+    unsigned int sharedMemCarveout; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT. */
 } CUlaunchAttributeValue;
 
 /**
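
A sketch of opting one launch into the new shared-memory carveout hint through the extended launch API. `cuLaunchKernelEx`, `CUlaunchConfig`, and `CUlaunchAttribute` predate this diff; `f` and `stream` are an assumed valid CUfunction and CUstream:

    CUlaunchAttribute attr;
    attr.id = CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT;
    attr.value.sharedMemCarveout = 50;   /* percent of shared memory; a hint only, per the comment above */

    CUlaunchConfig cfg = {0};
    cfg.gridDimX = 128; cfg.gridDimY = 1; cfg.gridDimZ = 1;
    cfg.blockDimX = 256; cfg.blockDimY = 1; cfg.blockDimZ = 1;
    cfg.hStream = stream;
    cfg.attrs = &attr;
    cfg.numAttrs = 1;

    CUresult rc = cuLaunchKernelEx(&cfg, f, /*kernelParams=*/NULL, /*extra=*/NULL);
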
@@ -2148,7 +2290,9 @@ typedef CUlaunchAttributeID CUkernelNodeAttrID;
 #define CU_KERNEL_NODE_ATTRIBUTE_PRIORITY CU_LAUNCH_ATTRIBUTE_PRIORITY
 #define CU_KERNEL_NODE_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP
 #define CU_KERNEL_NODE_ATTRIBUTE_MEM_SYNC_DOMAIN CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN
+#define CU_KERNEL_NODE_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION
 #define CU_KERNEL_NODE_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE
+#define CU_KERNEL_NODE_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
 
 typedef CUlaunchAttributeValue CUkernelNodeAttrValue_v1;
 typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue;
@@ -2231,6 +2375,29 @@ typedef struct CUexecAffinityParam_st {
  */
 typedef CUexecAffinityParam_v1 CUexecAffinityParam;
 
+typedef enum CUcigDataType_enum {
+    CIG_DATA_TYPE_D3D12_COMMAND_QUEUE = 0x1, /** D3D12 Command Queue Handle */
+} CUcigDataType;
+
+/**
+ * CIG Context Create Params
+ */
+typedef struct CUctxCigParam_st {
+    CUcigDataType sharedDataType;
+    void* sharedData;
+} CUctxCigParam;
+
+/**
+ * Params for creating CUDA context
+ * Exactly one of execAffinityParams and cigParams
+ * must be non-NULL.
+ */
+typedef struct CUctxCreateParams_st {
+    CUexecAffinityParam *execAffinityParams;
+    int numExecAffinityParams;
+    CUctxCigParam *cigParams;
+} CUctxCreateParams;
+
 /**
  * Library options to be specified with ::cuLibraryLoadData() or ::cuLibraryLoadFromFile()
  */
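
A hedged sketch of the matching `cuCtxCreate_v4` call; the parameter order shown is an assumption (the prototype is declared later in this header, outside the excerpt). Per the comment above, exactly one of the two pointers may be non-NULL; `d3d12Queue` is an assumed ID3D12CommandQueue* obtained from the graphics client, and `dev` an assumed CUdevice:

    CUctxCigParam cig;
    cig.sharedDataType = CIG_DATA_TYPE_D3D12_COMMAND_QUEUE;
    cig.sharedData = d3d12Queue;

    CUctxCreateParams params;
    params.execAffinityParams = NULL;     /* unused when cigParams is set */
    params.numExecAffinityParams = 0;
    params.cigParams = &cig;

    CUcontext ctx;
    CUresult rc = cuCtxCreate_v4(&ctx, &params, 0, dev);  /* assumed signature */
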
@@ -2502,6 +2669,17 @@ typedef enum cudaError_enum {
      */
     CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC = 225,
 
+    /**
+     * This indicates that an exception occurred on the device that is now
+     * contained by the GPU's error containment capability. Common causes are -
+     * a. Certain types of invalid accesses of peer GPU memory over nvlink
+     * b. Certain classes of hardware errors
+     * This leaves the process in an inconsistent state and any further CUDA
+     * work will return the same error. To continue using CUDA, the process must
+     * be terminated and relaunched.
+     */
+    CUDA_ERROR_CONTAINED = 226,
+
     /**
      * This indicates that the device kernel source is invalid. This includes
      * compilation/linker errors encountered in device code or user error.
@@ -2718,6 +2896,14 @@ typedef enum cudaError_enum {
      */
     CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720,
 
+    /**
+     * An exception occurred on the device while exiting a kernel using tensor memory: the
+     * tensor memory was not completely deallocated. This leaves the process in an inconsistent
+     * state and any further CUDA work will return the same error. To continue using CUDA, the
+     * process must be terminated and relaunched.
+     */
+    CUDA_ERROR_TENSOR_MEMORY_LEAK = 721,
+
     /**
      * This error indicates that the attempted operation is not permitted.
      */
@@ -2894,6 +3080,12 @@ typedef enum cudaError_enum {
      */
     CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION = 915,
 
+    /**
+     * This error indicates that an error happened during the key rotation
+     * sequence.
+     */
+    CUDA_ERROR_KEY_ROTATION = 916,
+
     /**
      * This indicates that an unknown internal error has occurred.
      */
@@ -3307,7 +3499,10 @@ typedef enum CUtensorMapDataType_enum {
     CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
     CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,
     CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,
-    CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ
+    CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ,
+    CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B,
+    CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B,
+    CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B
 } CUtensorMapDataType;
 
 /**
@@ -3327,6 +3522,9 @@ typedef enum CUtensorMapSwizzle_enum {
     CU_TENSOR_MAP_SWIZZLE_32B,
     CU_TENSOR_MAP_SWIZZLE_64B,
     CU_TENSOR_MAP_SWIZZLE_128B,
+    CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B,
+    CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B,
+    CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B
 } CUtensorMapSwizzle;
 
 /**
@@ -3347,6 +3545,14 @@ typedef enum CUtensorMapFloatOOBfill_enum {
     CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
 } CUtensorMapFloatOOBfill;
 
+/**
+ * Tensor map Im2Col wide mode
+ */
+typedef enum CUtensorMapIm2ColWideMode_enum {
+    CU_TENSOR_MAP_IM2COL_WIDE_MODE_W = 0,
+    CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128
+} CUtensorMapIm2ColWideMode;
+
 /**
  * GPU Direct v3 tokens
  */
@@ -3418,7 +3624,7 @@ typedef enum CUexternalMemoryHandleType_enum {
     /**
      * Handle is an NvSciBuf object
      */
-    CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8,
 } CUexternalMemoryHandleType;
 
 /**
@@ -3862,6 +4068,13 @@ typedef enum CUmemRangeHandleType_enum
     CU_MEM_RANGE_HANDLE_TYPE_MAX = 0x7FFFFFFF
 } CUmemRangeHandleType;
 
+/**
+ * Flag for requesting handle type for address range.
+ */
+typedef enum CUmemRangeFlags_enum {
+    CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE = 0x1 /**< Indicates that DMA_BUF handle should be mapped via PCIe BAR1 */
+} CUmemRangeFlags;
+
 /**
  * Sparse subresource types
  */
@@ -3951,6 +4164,11 @@ typedef enum CUmemAllocationCompType_enum {
  * This flag if set indicates that the memory will be used as a tile pool.
  */
 #define CU_MEM_CREATE_USAGE_TILE_POOL 0x1
+/**
+ * This flag, if set, indicates that the memory will be used as a buffer for
+ * hardware accelerated decompression.
+ */
+#define CU_MEM_CREATE_USAGE_HW_DECOMPRESS 0x2
 
 /**
  * Specifies the allocation properties for a allocation.
@@ -4137,6 +4355,12 @@ typedef enum CUmemPool_attribute_enum {
     CU_MEMPOOL_ATTR_USED_MEM_HIGH
 } CUmemPool_attribute;
 
+/**
+ * This flag, if set, indicates that the memory will be used as a buffer for
+ * hardware accelerated decompression.
+ */
+#define CU_MEM_POOL_CREATE_USAGE_HW_DECOMPRESS 0x2
+
 /**
  * Specifies the properties of allocations made from the pool.
  */
@@ -4152,7 +4376,8 @@ typedef struct CUmemPoolProps_st {
      */
     void *win32SecurityAttributes;
     size_t maxSize; /**< Maximum pool size. When set to 0, defaults to a system dependent value. */
-    unsigned
+    unsigned short usage; /**< Bitmask indicating intended usage for the pool. */
+    unsigned char reserved[54]; /**< reserved for future use, must be 0 */
 } CUmemPoolProps_v1;
 typedef CUmemPoolProps_v1 CUmemPoolProps;
 
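
A sketch of carrying the new usage bit through pool creation. `cuMemPoolCreate` and the other CUmemPoolProps fields are pre-existing driver API; the device ordinal 0 and the included `<string.h>`/`<cuda.h>` are assumptions:

    CUmemPoolProps props;
    memset(&props, 0, sizeof(props));
    props.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
    props.handleTypes = CU_MEM_HANDLE_TYPE_NONE;
    props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    props.location.id = 0;                                /* device ordinal (assumed) */
    props.usage = CU_MEM_POOL_CREATE_USAGE_HW_DECOMPRESS; /* the new bitmask field above */

    CUmemoryPool pool;
    CUresult rc = cuMemPoolCreate(&pool, &props);
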
@@ -4350,6 +4575,12 @@ typedef struct CUgraphNodeParams_st {
  */
 #define CUDA_ARRAY3D_DEFERRED_MAPPING 0x80
 
+/**
+ * This flag indicates that the CUDA array will be used for hardware accelerated
+ * video encode/decode operations.
+ */
+#define CUDA_ARRAY3D_VIDEO_ENCODE_DECODE 0x100
+
 /**
  * Override the texref format with a format inferred from the array.
  * Flag for ::cuTexRefSetArray()
@@ -4494,9 +4725,9 @@ typedef enum CUgraphDebugDot_flags_enum {
     CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES = 1<<10, /**< Adds node handles and every kernel function handle to output */
     CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS = 1<<11, /**< Adds memory alloc node parameters to output */
     CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS = 1<<12, /**< Adds memory free node parameters to output */
-    CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS = 1<<13
-
-
+    CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS = 1<<13, /**< Adds batch mem op node parameters to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_EXTRA_TOPO_INFO = 1<<14, /**< Adds edge numbering information */
+    CU_GRAPH_DEBUG_DOT_FLAGS_CONDITIONAL_NODE_PARAMS = 1<<15 /**< Adds conditional node parameters to output */
 } CUgraphDebugDot_flags;
 
 /**
@@ -4528,11 +4759,180 @@ typedef enum CUgraphInstantiate_flags_enum {
                                                    priority of the stream it is launched into. */
 } CUgraphInstantiate_flags;
 
+/**
+ * CUDA device NUMA configuration
+ */
 typedef enum CUdeviceNumaConfig_enum {
     CU_DEVICE_NUMA_CONFIG_NONE = 0, /**< The GPU is not a NUMA node */
     CU_DEVICE_NUMA_CONFIG_NUMA_NODE, /**< The GPU is a NUMA node, CU_DEVICE_ATTRIBUTE_NUMA_ID contains its NUMA ID */
 } CUdeviceNumaConfig;
 
+/**
+ * CUDA Process States
+ */
+typedef enum CUprocessState_enum {
+    CU_PROCESS_STATE_RUNNING = 0, /**< Default process state */
+    CU_PROCESS_STATE_LOCKED, /**< CUDA API locks are taken so further CUDA API calls will block */
+    CU_PROCESS_STATE_CHECKPOINTED, /**< Application memory contents have been checkpointed and underlying allocations and device handles have been released */
+    CU_PROCESS_STATE_FAILED, /**< Application entered an uncorrectable error during the checkpoint/restore process */
+} CUprocessState;
+
+/**
+ * CUDA checkpoint optional lock arguments
+ */
+typedef struct CUcheckpointLockArgs_st {
+    unsigned int timeoutMs; /**< Timeout in milliseconds to attempt to lock the process, 0 indicates no timeout */
+    unsigned int reserved0; /**< Reserved for future use, must be zero */
+    cuuint64_t reserved1[7]; /**< Reserved for future use, must be zeroed */
+} CUcheckpointLockArgs;
+
+/**
+ * CUDA checkpoint optional checkpoint arguments
+ */
+typedef struct CUcheckpointCheckpointArgs_st {
+    cuuint64_t reserved[8]; /**< Reserved for future use, must be zeroed */
+} CUcheckpointCheckpointArgs;
+
+/**
+ * CUDA checkpoint optional restore arguments
+ */
+typedef struct CUcheckpointRestoreArgs_st {
+    cuuint64_t reserved[8]; /**< Reserved for future use, must be zeroed */
+} CUcheckpointRestoreArgs;
+
+/**
+ * CUDA checkpoint optional unlock arguments
+ */
+typedef struct CUcheckpointUnlockArgs_st {
+    cuuint64_t reserved[8]; /**< Reserved for future use, must be zeroed */
+} CUcheckpointUnlockArgs;
+
+/**
+ * Flags to specify for copies within a batch. For more details see ::cuMemcpyBatchAsync.
+ */
+typedef enum CUmemcpyFlags_enum {
+    CU_MEMCPY_FLAG_DEFAULT = 0x0,
+
+    /**
+     * Hint to the driver to try and overlap the copy with compute work on the SMs.
+     */
+    CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE = 0x1
+} CUmemcpyFlags;
+
+/**
+ * These flags allow applications to convey the source access ordering CUDA must maintain.
+ * The destination will always be accessed in stream order.
+ */
+typedef enum CUmemcpySrcAccessOrder_enum {
+    /**
+     * Default invalid.
+     */
+    CU_MEMCPY_SRC_ACCESS_ORDER_INVALID = 0x0,
+
+    /**
+     * Indicates that access to the source pointer must be in stream order.
+     */
+    CU_MEMCPY_SRC_ACCESS_ORDER_STREAM = 0x1,
+
+    /**
+     * Indicates that access to the source pointer can be out of stream order and
+     * all accesses must be complete before the API call returns. This flag is suited for
+     * ephemeral sources (ex., stack variables) when it's known that no prior operations
+     * in the stream can be accessing the memory and also that the lifetime of the memory
+     * is limited to the scope that the source variable was declared in. Specifying
+     * this flag allows the driver to optimize the copy and removes the need for the user
+     * to synchronize the stream after the API call.
+     */
+    CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL = 0x2,
+
+    /**
+     * Indicates that access to the source pointer can be out of stream order and the accesses
+     * can happen even after the API call returns. This flag is suited for host pointers
+     * allocated outside CUDA (ex., via malloc) when it's known that no prior operations
+     * in the stream can be accessing the memory. Specifying this flag allows the driver
+     * to optimize the copy on certain platforms.
+     */
+    CU_MEMCPY_SRC_ACCESS_ORDER_ANY = 0x3,
+
+    CU_MEMCPY_SRC_ACCESS_ORDER_MAX = 0x7FFFFFFF
+} CUmemcpySrcAccessOrder;
+
+/**
+ * Attributes specific to copies within a batch. For more details on usage see ::cuMemcpyBatchAsync.
+ */
+typedef struct CUmemcpyAttributes_st {
+    CUmemcpySrcAccessOrder srcAccessOrder; /**< Source access ordering to be observed for copies with this attribute. */
+    CUmemLocation srcLocHint; /**< Hint location for the source operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
+    CUmemLocation dstLocHint; /**< Hint location for the destination operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
+    unsigned int flags; /**< Additional flags for copies with this attribute. See ::CUmemcpyFlags */
+} CUmemcpyAttributes_v1;
+typedef CUmemcpyAttributes_v1 CUmemcpyAttributes;
+
+/**
+ * These flags allow applications to convey the operand type for individual copies specified in ::cuMemcpy3DBatchAsync.
+ */
+typedef enum CUmemcpy3DOperandType_enum {
+    CU_MEMCPY_OPERAND_TYPE_POINTER = 0x1, /**< Memcpy operand is a valid pointer. */
+    CU_MEMCPY_OPERAND_TYPE_ARRAY = 0x2, /**< Memcpy operand is a CUarray. */
+    CU_MEMCPY_OPERAND_TYPE_MAX = 0x7FFFFFFF
+} CUmemcpy3DOperandType;
+
+/**
+ * Struct representing offset into a CUarray in elements
+ */
+typedef struct CUoffset3D_st {
+    size_t x;
+    size_t y;
+    size_t z;
+} CUoffset3D_v1;
+typedef CUoffset3D_v1 CUoffset3D;
+
+/**
+ * Struct representing width/height/depth of a CUarray in elements
+ */
+typedef struct CUextent3D_st {
+    size_t width;
+    size_t height;
+    size_t depth;
+} CUextent3D_v1;
+typedef CUextent3D_v1 CUextent3D;
+
+/**
+ * Struct representing an operand for copy with ::cuMemcpy3DBatchAsync
+ */
+typedef struct CUmemcpy3DOperand_st {
+    CUmemcpy3DOperandType type;
+    union {
+        /**
+         * Struct representing an operand when ::CUmemcpy3DOperand::type is ::CU_MEMCPY_OPERAND_TYPE_POINTER
+         */
+        struct {
+            CUdeviceptr ptr;
+            size_t rowLength; /**< Length of each row in elements. */
+            size_t layerHeight; /**< Height of each layer in elements. */
+            CUmemLocation locHint; /**< Hint location for the operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
+        } ptr;
+
+        /**
+         * Struct representing an operand when ::CUmemcpy3DOperand::type is ::CU_MEMCPY_OPERAND_TYPE_ARRAY
+         */
+        struct {
+            CUarray array;
+            CUoffset3D offset;
+        } array;
+    } op;
+} CUmemcpy3DOperand_v1;
+typedef CUmemcpy3DOperand_v1 CUmemcpy3DOperand;
+
+typedef struct CUDA_MEMCPY3D_BATCH_OP_st {
+    CUmemcpy3DOperand src; /**< Source memcpy operand. */
+    CUmemcpy3DOperand dst; /**< Destination memcpy operand. */
+    CUextent3D extent; /**< Extents of the memcpy between src and dst. The width, height and depth components must not be 0.*/
+    CUmemcpySrcAccessOrder srcAccessOrder; /**< Source access ordering to be observed for copy from src to dst. */
+    unsigned int flags; /**< Additional flags for copies with this attribute. See ::CUmemcpyFlags */
+} CUDA_MEMCPY3D_BATCH_OP_v1;
+typedef CUDA_MEMCPY3D_BATCH_OP_v1 CUDA_MEMCPY3D_BATCH_OP;
+
 /** @} */ /* END CUDA_TYPES */
 
 #if defined(__GNUC__)
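
A populate-only sketch of the new batched-copy attributes. The diff names ::cuMemcpyBatchAsync as the consumer, but its prototype is not part of this excerpt, so no call is shown; `CUmemLocation` and its enum values are pre-existing driver API:

    CUmemcpyAttributes attrs;
    memset(&attrs, 0, sizeof(attrs));
    attrs.srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;  /* source read in stream order */
    attrs.srcLocHint.type = CU_MEM_LOCATION_TYPE_HOST;         /* hints only; ignored for ordinary device memory */
    attrs.dstLocHint.type = CU_MEM_LOCATION_TYPE_DEVICE;
    attrs.dstLocHint.id = 0;                                   /* device ordinal (assumed) */
    attrs.flags = CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE;
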
@@ -5124,6 +5524,12 @@ CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements,
  * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here.
  * - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC
  * - ::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED: Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays.
+ * - ::CU_DEVICE_ATTRIBUTE_NUMA_CONFIG: NUMA configuration of a device: value is of type ::CUdeviceNumaConfig enum
+ * - ::CU_DEVICE_ATTRIBUTE_NUMA_ID: NUMA node ID of the GPU memory
+ * - ::CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED: Device supports switch multicast and reduction operations.
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID: The combined 16-bit PCI device ID and 16-bit PCI vendor ID.
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_PCI_SUBSYSTEM_ID: The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor
+ *   ID.
  *
  * \param pi - Returned device attribute value
  * \param attrib - Device attribute to query
@@ -5310,6 +5716,15 @@ CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type
  * determined by comparing the numerical values between the two enums, with
  * smaller scopes having smaller values.
  *
+ * On platforms that support GPUDirect RDMA writes via more than one path in
+ * hardware (see ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE), the user should
+ * consider those paths as belonging to separate ordering domains. Note that in
+ * such cases CUDA driver will report both RDMA writes ordering and RDMA write
+ * scope as ALL_DEVICES and a call to cuFlushGPUDirectRDMA will be a no-op,
+ * but when these multiple paths are used simultaneously, it is the user's
+ * responsibility to ensure ordering by using mechanisms outside the scope of
+ * CUDA.
+ *
 * Users may query support for this API via
 * ::CU_DEVICE_ATTRIBUTE_FLUSH_FLUSH_GPU_DIRECT_RDMA_OPTIONS.
 *
@@ -5991,6 +6406,161 @@ CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
  */
 CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev);

+/**
+ * \brief Create a CUDA context
+ *
+ * Creates a new CUDA context and associates it with the calling thread. The
+ * \p flags parameter is described below. The context is created with a usage
+ * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy()
+ * when done using the context. If a context is already current to the thread,
+ * it is supplanted by the newly created context and may be restored by a subsequent
+ * call to ::cuCtxPopCurrent().
+ *
+ * CUDA context can be created with execution affinity. The type and the amount of
+ execution resource the context can use is limited by \p paramsArray and \p numExecAffinityParams
+ in \p execAffinity. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numExecAffinityParams
+ * describes the size of the paramsArray. If two \p CUexecAffinityParam in the array have the same type,
+ * the latter execution affinity parameter overrides the former execution affinity parameter.
+ * The supported execution affinity types are:
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion
+ * of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally
+ * rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution
+ * affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute
+ * is only supported under Volta+ MPS.
+ *
+ * CUDA context can be created in CIG(CUDA in Graphics) mode by setting \p cigParams.
+ * Data from graphics client is shared with CUDA via the \p sharedData in \p cigParams.
+ * Support for D3D12 graphics client can be determined using ::cuDeviceGetAttribute() with
+ * ::CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED. \p sharedData is a ID3D12CommandQueue handle.
+ * Either \p execAffinityParams or \p cigParams can be set to a non-null value. Setting both to a
+ * non-null value will result in an undefined behavior.
+ *
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work. <br>
+ * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
+ *
+ * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
+ * This flag must be set in order to allocate pinned host memory that is
+ * accessible to the GPU.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
+ *
+ * - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally
+ * with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can
+ * be set during context creation to instruct CUDA to create a coredump if
+ * this context raises an exception during execution. These environment variables
+ * are described in the CUDA-GDB user guide under the "GPU core dump support"
+ * section.
+ * The initial attributes will be taken from the global attributes at the time of
+ * context creation. The other attributes that control coredump output can be
+ * modified by calling ::cuCoredumpSetAttribute from the created context after
+ * it becomes current. This flag is not supported when CUDA context is created in
+ * CIG(CUDA in Graphics) mode.
+ *
+ * - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not
+ * been enabled globally with ::cuCoredumpSetAttributeGlobal or environment
+ * variables, this flag can be set during context creation to instruct CUDA to
+ * create a coredump if data is written to a certain pipe that is present in the
+ * OS space. These environment variables are described in the CUDA-GDB user
+ * guide under the "GPU core dump support" section.
+ * It is important to note that the pipe name *must* be set with
+ * ::cuCoredumpSetAttributeGlobal before creating the context if this flag is
+ * used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set.
+ * The initial attributes will be taken from the global attributes at the time of
+ * context creation. The other attributes that control coredump output can be
+ * modified by calling ::cuCoredumpSetAttribute from the created context after
+ * it becomes current.
+ * Setting this flag on any context creation is equivalent to setting the
+ * ::CU_COREDUMP_ENABLE_USER_TRIGGER attribute to \p true globally.
+ * This flag is not supported when CUDA context is created in
+ * CIG(CUDA in Graphics) mode.
+ *
+ * - ::CU_CTX_SYNC_MEMOPS: Ensures that synchronous memory operations initiated
+ * on this context will always synchronize. See further documentation in the
+ * section titled "API Synchronization behavior" to learn more about cases when
+ * synchronous memory operations can exhibit asynchronous behavior.
+ *
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
+ * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
+ * the compute mode for * devices.
+ * Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ * -h option to it.
+ *
+ * Context creation will fail with :: CUDA_ERROR_INVALID_VALUE if invalid parameter was
+ * passed by client to create the CUDA context.
+ *
+ * Context creation in CIG mode will fail with ::CUDA_ERROR_NOT_SUPPORTED if CIG is not supported
+ * by the device or the driver.
+ * \param pctx - Returned context handle of the new context
+ * \param ctxCreateParams - Context creation parameters
+ * \param flags - Context creation flags
+ * \param dev - Device to create context on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCoredumpSetAttributeGlobal,
+ * ::cuCoredumpSetAttribute,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxCreate_v4(CUcontext *pctx, CUctxCreateParams *ctxCreateParams, unsigned int flags, CUdevice dev);
+
 /**
  * \brief Destroy a CUDA context
  *
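For orientation, a minimal usage sketch of the new entry point (not part of the upstream header). It assumes that passing NULL for ctxCreateParams requests neither execution affinity nor CIG mode, which the documentation above implies since both sub-parameters are optional; error checking is elided.

    CUdevice dev;
    CUcontext ctx;
    cuInit(0);
    cuDeviceGet(&dev, 0);
    /* NULL ctxCreateParams: no execution affinity, no CIG parameters (assumed valid) */
    cuCtxCreate_v4(&ctx, NULL, CU_CTX_SCHED_BLOCKING_SYNC, dev);
    /* ... use the context ... */
    cuCtxDestroy(ctx);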
@@ -6002,9 +6572,11 @@ CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArra
  * Destroys and cleans up all resources associated with the context.
  * It is the caller's responsibility to ensure that the context or its resources
  * are not accessed or passed in subsequent API calls and doing so will result in undefined behavior.
- * These resources include CUDA types
+ * These resources include CUDA types ::CUmodule, ::CUfunction, ::CUstream, ::CUevent,
  * ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref,
  * ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore.
+ * These resources also include memory allocations by ::cuMemAlloc(), ::cuMemAllocHost(),
+ * ::cuMemAllocManaged() and ::cuMemAllocPitch().
  *
  * If \p ctx is current to the calling thread then \p ctx will also be
  * popped from the current thread's context stack (as though ::cuCtxPopCurrent()
@@ -6012,6 +6584,10 @@ CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArra
  * remain current to those threads, and attempting to access \p ctx from
  * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED.
  *
+ * \note ::cuCtxDestroy() will not destroy memory allocations by ::cuMemCreate(), ::cuMemAllocAsync() and
+ * ::cuMemAllocFromPoolAsync(). These memory allocations are not associated with any CUDA context and need to
+ * be destroyed explicitly.
+ *
  * \param ctx - Context to destroy
  *
  * \return
@@ -6158,11 +6734,11 @@ CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx);
 CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx);

 /**
- * \brief Returns the device
+ * \brief Returns the device handle for the current context
  *
- * Returns in \p *device the
+ * Returns in \p *device the handle of the current context's device.
  *
- * \param device - Returned device
+ * \param device - Returned device handle for the current context
  *
  * \return
  * ::CUDA_SUCCESS,
@@ -6278,9 +6854,11 @@ CUresult CUDAAPI cuCtxSetFlags(unsigned int flags);
 CUresult CUDAAPI cuCtxGetId(CUcontext ctx, unsigned long long *ctxId);

 /**
- * \brief Block for
+ * \brief Block for the current context's tasks to complete
  *
- * Blocks until the
+ * Blocks until the current context has completed all preceding requested tasks.
+ * If the current context is the primary context, green contexts that have been
+ * created will also be synchronized.
  * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed.
  * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the
  * CPU thread will block until the GPU context has finished its work.
@@ -6662,14 +7240,87 @@ CUresult CUDAAPI cuCtxResetPersistingL2Cache(void);
  */
 CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);

-
-/** @} */ /* END CUDA_CTX */
-
 /**
- * \
+ * \brief Records an event.
  *
- *
- *
+ * Captures in \p hEvent all the activities of the context \p hCtx
+ * at the time of this call. \p hEvent and \p hCtx must be from the same
+ * CUDA context, otherwise ::CUDA_ERROR_INVALID_HANDLE will be returned.
+ * Calls such as ::cuEventQuery() or ::cuCtxWaitEvent() will then examine
+ * or wait for completion of the work that was captured.
+ * Uses of \p hCtx after this call do not modify \p hEvent.
+ * If the context passed to \p hCtx is the primary context, \p hEvent will
+ * capture all the activities of the primary context and its green contexts.
+ * If the context passed to \p hCtx is a context converted from green context
+ * via ::cuCtxFromGreenCtx(), \p hEvent will capture only the activities of the green context.
+ *
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED if the
+ * specified context \p hCtx has a stream in the capture mode. In such a case,
+ * the call will invalidate all the conflicting captures.
+ *
+ * \param hCtx - Context to record event for
+ * \param hEvent - Event to record
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
+ *
+ * \sa
+ * ::cuCtxWaitEvent,
+ * ::cuGreenCtxRecordEvent,
+ * ::cuGreenCtxWaitEvent,
+ * ::cuEventRecord
+ */
+CUresult CUDAAPI cuCtxRecordEvent(CUcontext hCtx, CUevent hEvent);
+
+/**
+ * \brief Make a context wait on an event
+ *
+ * Makes all future work submitted to context \p hCtx wait for all work
+ * captured in \p hEvent. The synchronization will be performed on the device
+ * and will not block the calling CPU thread. See ::cuCtxRecordEvent()
+ * for details on what is captured by an event.
+ * If the context passed to \p hCtx is the primary context, the primary context
+ * and its green contexts will wait for \p hEvent.
+ * If the context passed to \p hCtx is a context converted from green context
+ * via ::cuCtxFromGreenCtx(), the green context will wait for \p hEvent.
+ *
+ * \note \p hEvent may be from a different context or device than \p hCtx.
+ *
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED and
+ * invalidate the capture if the specified event \p hEvent is part of an ongoing
+ * capture sequence or if the specified context \p hCtx has a stream in the capture mode.
+ *
+ * \param hCtx - Context to wait
+ * \param hEvent - Event to wait on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
+ *
+ * \sa
+ * ::cuCtxRecordEvent,
+ * ::cuGreenCtxRecordEvent,
+ * ::cuGreenCtxWaitEvent,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuCtxWaitEvent(CUcontext hCtx, CUevent hEvent);
+
+/** @} */ /* END CUDA_CTX */
+
+/**
+ * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated context management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
  *
  * This section describes the deprecated context management functions of the low-level
  * CUDA driver application programming interface.
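A minimal sketch of the cross-context ordering pattern these two new calls enable (not part of the upstream header): work in one context is ordered after everything already submitted to another, without blocking the CPU. ctxA and ctxB are hypothetical pre-existing contexts.

    CUevent ev;
    cuCtxPushCurrent(ctxA);
    cuEventCreate(&ev, CU_EVENT_DEFAULT);
    cuCtxRecordEvent(ctxA, ev);   /* capture everything submitted to ctxA so far */
    cuCtxPopCurrent(NULL);
    cuCtxWaitEvent(ctxB, ev);     /* future work in ctxB waits on the device; CPU is not blocked */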
@@ -7203,6 +7854,11 @@ CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hm
  * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
  * will accumulate data until the CUlinkState is destroyed.
  *
+ * The data passed in via ::cuLinkAddData and ::cuLinkAddFile will be treated
+ * as relocatable (-rdc=true to nvcc) when linking the final cubin during
+ * ::cuLinkComplete and will have similar consequences as offline relocatable
+ * device code linking.
+ *
  * \p optionValues must remain valid for the life of the CUlinkState if output
  * options are used. No other references to inputs are maintained after this
  * call returns.
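A minimal sketch of the JIT-linking flow this note refers to (not part of the upstream header): data added via cuLinkAddData is linked as relocatable device code when cuLinkComplete produces the final cubin. ptxSrc is a hypothetical NULL-terminated PTX string compiled with -rdc=true; error checking is elided.

    CUlinkState ls;
    void *cubin;
    size_t cubinSize;
    cuLinkCreate(0, NULL, NULL, &ls);
    cuLinkAddData(ls, CU_JIT_INPUT_PTX, (void *)ptxSrc, strlen(ptxSrc) + 1,
                  "kernels.ptx", 0, NULL, NULL);
    cuLinkComplete(ls, &cubin, &cubinSize);  /* cubin buffer is owned by ls */
    /* load the cubin (e.g. with cuModuleLoadData) before destroying the state */
    cuLinkDestroy(ls);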
@@ -7471,6 +8127,7 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmod
  *
  * The \p code may be a \e cubin or \e fatbin as output by \b nvcc,
  * or a NULL-terminated \e PTX, either as output by \b nvcc or hand-written.
+ * A fatbin should also contain relocatable code when doing separate compilation.
  *
  * Options are passed as an array via \p jitOptions and any corresponding parameters are passed in
  * \p jitOptionsValues. The number of total JIT options is supplied via \p numJitOptions.
@@ -7479,6 +8136,9 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmod
  * Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in
  * \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions.
  *
+ * \note If the library contains managed variables and no device in the system
+ * supports managed variables this call is expected to return ::CUDA_ERROR_NOT_SUPPORTED
+ *
  * \param library - Returned library
  * \param code - Code to load
  * \param jitOptions - Options for JIT
@@ -7499,7 +8159,8 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmod
  * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
  * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
  * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
- * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND,
+ * ::CUDA_ERROR_NOT_SUPPORTED
  *
  * \sa ::cuLibraryLoadFromFile,
  * ::cuLibraryUnload,
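A minimal sketch of loading a library image with no JIT or library-load options (not part of the upstream header). fatbinImage is a hypothetical pointer to an nvcc-produced fatbin; the argument order follows the cuLibraryLoadData declaration referenced in these hunks.

    CUlibrary lib;
    cuLibraryLoadData(&lib, fatbinImage,
                      NULL, NULL, 0,    /* jitOptions, jitOptionsValues, numJitOptions */
                      NULL, NULL, 0);   /* libraryOptions, libraryOptionValues, numLibraryOptions */
    /* ... look up kernels with cuLibraryGetKernel / cuKernelGetFunction ... */
    cuLibraryUnload(lib);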
@@ -7528,6 +8189,7 @@ CUresult CUDAAPI cuLibraryLoadData(CUlibrary *library, const void *code,
  *
  * The file should be a \e cubin file as output by \b nvcc, or a \e PTX file either
  * as output by \b nvcc or handwritten, or a \e fatbin file as output by \b nvcc.
+ * A fatbin should also contain relocatable code when doing separate compilation.
  *
  * Options are passed as an array via \p jitOptions and any corresponding parameters are
  * passed in \p jitOptionsValues. The number of total options is supplied via \p numJitOptions.
@@ -7536,6 +8198,9 @@ CUresult CUDAAPI cuLibraryLoadData(CUlibrary *library, const void *code,
  * Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in
  * \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions.
  *
+ * \note If the library contains managed variables and no device in the system
+ * supports managed variables this call is expected to return ::CUDA_ERROR_NOT_SUPPORTED
+ *
  * \param library - Returned library
  * \param fileName - File to load from
  * \param jitOptions - Options for JIT
@@ -7556,7 +8221,8 @@ CUresult CUDAAPI cuLibraryLoadData(CUlibrary *library, const void *code,
  * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
  * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
  * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
- * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND,
+ * ::CUDA_ERROR_NOT_SUPPORTED
  *
  * \sa ::cuLibraryLoadData,
  * ::cuLibraryUnload,
@@ -7702,6 +8368,29 @@ CUresult CUDAAPI cuLibraryGetModule(CUmodule *pMod, CUlibrary library);
  */
 CUresult CUDAAPI cuKernelGetFunction(CUfunction *pFunc, CUkernel kernel);

+/**
+ * \brief Returns a library handle
+ *
+ * Returns in \p pLib the handle of the library for the requested kernel \p kernel
+ *
+ * \param pLib - Returned library handle
+ * \param kernel - Kernel to retrieve library handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_FOUND
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuLibraryGetKernel
+ */
+CUresult CUDAAPI cuKernelGetLibrary(CUlibrary *pLib, CUkernel kernel);
+
 /**
  * \brief Returns a global device pointer
  *
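A minimal sketch of mapping a kernel handle back to its owning library with the new call (not part of the upstream header). lib is a hypothetical loaded CUlibrary and "my_kernel" a hypothetical kernel name within it.

    CUkernel kern;
    CUlibrary owner;
    cuLibraryGetKernel(&kern, lib, "my_kernel");
    cuKernelGetLibrary(&owner, kern);   /* owner == lib on success */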
@@ -7744,9 +8433,6 @@ CUresult CUDAAPI cuLibraryGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUlibrary
  * Note that managed memory for library \p library is shared across devices and is registered
  * when the library is loaded into atleast one context.
  *
- * \note The API requires a CUDA context to be present and initialized on at least one device.
- * If no context is present, the call returns ::CUDA_ERROR_NOT_FOUND.
- *
  * \param dptr - Returned pointer to the managed memory
  * \param bytes - Returned memory size in bytes
  * \param library - Library to retrieve managed memory from
@@ -7923,6 +8609,9 @@ CUresult CUDAAPI cuKernelGetAttribute(int *pi, CUfunction_attribute attrib, CUke
  * positive. The validity of the cluster dimensions is checked at launch time.
  * If the value is set during compile time, it cannot be set at runtime.
  * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
+ * the function can be launched with non-portable cluster size. 1 is allowed,
+ * 0 is disallowed.
  * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
  * scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
  *
@@ -8222,9 +8911,10 @@ CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t Width
  * ::cuMemAllocPitch(), ::cuMemAllocManaged(), ::cuMemAllocAsync(), ::cuMemAllocFromPoolAsync()
  *
  * Note - This API will not perform any implict synchronization when the pointer was allocated with
- * ::cuMemAllocAsync or ::cuMemAllocFromPoolAsync. Callers must ensure that all accesses to
+ * ::cuMemAllocAsync or ::cuMemAllocFromPoolAsync. Callers must ensure that all accesses to these
  * pointer have completed before invoking ::cuMemFree. For best performance and memory reuse, users
  * should use ::cuMemFreeAsync to free memory allocated via the stream ordered memory allocator.
+ * For all other pointers, this API may perform implicit synchronization.
  *
  * \param dptr - Pointer to memory to free
  *
@@ -8776,7 +9466,8 @@ CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev);
  *
  * IPC functionality is restricted to devices with support for unified
  * addressing on Linux and Windows operating systems.
- * IPC functionality on Windows is
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
  * Users can test their device for IPC functionality by calling
  * ::cuapiDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
  *
@@ -8819,7 +9510,8 @@ CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event);
  *
  * IPC functionality is restricted to devices with support for unified
  * addressing on Linux and Windows operating systems.
- * IPC functionality on Windows is
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
  * Users can test their device for IPC functionality by calling
  * ::cuapiDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
  *
@@ -8864,7 +9556,8 @@ CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle)
  *
  * IPC functionality is restricted to devices with support for unified
  * addressing on Linux and Windows operating systems.
- * IPC functionality on Windows is
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
  * Users can test their device for IPC functionality by calling
  * ::cuapiDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
  *
@@ -8919,7 +9612,8 @@ CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
  *
  * IPC functionality is restricted to devices with support for unified
  * addressing on Linux and Windows operating systems.
- * IPC functionality on Windows is
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
  * Users can test their device for IPC functionality by calling
  * ::cuapiDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
  *
@@ -8964,7 +9658,8 @@ CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, u
  *
  * IPC functionality is restricted to devices with support for unified
  * addressing on Linux and Windows operating systems.
- * IPC functionality on Windows is
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
  * Users can test their device for IPC functionality by calling
  * ::cuapiDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
  *
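A minimal sketch of the IPC memory-handle flow the amended note applies to (not part of the upstream header): one process exports an allocation, another opens it. Transport of the opaque handle between processes (pipe, socket, shared file) is elided; numBytes is hypothetical.

    /* Exporting process */
    CUdeviceptr dptr;
    CUipcMemHandle handle;
    cuMemAlloc(&dptr, numBytes);
    cuIpcGetMemHandle(&handle, dptr);
    /* ... send the handle bytes to the other process ... */

    /* Importing process */
    CUdeviceptr mapped;
    cuIpcOpenMemHandle(&mapped, handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
    /* ... use mapped, then cuIpcCloseMemHandle(mapped) when done ... */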
@@ -10643,6 +11338,153 @@ CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
  */
 CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);

+/**
+ * \brief Performs a batch of memory copies asynchronously.
+ *
+ * Performs a batch of memory copies. The batch as a whole executes in stream order but copies within a
+ * batch are not guaranteed to execute in any specific order. This API only supports pointer-to-pointer copies.
+ * For copies involving CUDA arrays, please see ::cuMemcpy3DBatchAsync.
+ *
+ * Performs memory copies from source buffers specified in \p srcs to destination buffers specified in \p dsts.
+ * The size of each copy is specified in \p sizes. All three arrays must be of the same length as specified
+ * by \p count. Since there are no ordering guarantees for copies within a batch, specifying any dependent copies
+ * within a batch will result in undefined behavior.
+ *
+ * Every copy in the batch has to be associated with a set of attributes specified in the \p attrs array.
+ * Each entry in this array can apply to more than one copy. This can be done by specifying in the \p attrsIdxs array,
+ * the index of the first copy that the corresponding entry in the \p attrs array applies to. Both \p attrs and
+ * \p attrsIdxs must be of the same length as specified by \p numAttrs. For example, if a batch has 10 copies listed
+ * in dst/src/sizes, the first 6 of which have one set of attributes and the remaining 4 another, then \p numAttrs
+ * will be 2, \p attrsIdxs will be {0, 6} and \p attrs will contains the two sets of attributes. Note that the first entry
+ * in \p attrsIdxs must always be 0. Also, each entry must be greater than the previous entry and the last entry should be
+ * less than \p count. Furthermore, \p numAttrs must be lesser than or equal to \p count.
+ *
+ * The ::CUmemcpyAttributes::srcAccessOrder indicates the source access ordering to be observed for copies associated
+ * with the attribute. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM, then the source will
+ * be accessed in stream order. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL then
+ * it indicates that access to the source pointer can be out of stream order and all accesses must be complete before
+ * the API call returns. This flag is suited for ephemeral sources (ex., stack variables) when it's known that no prior
+ * operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope
+ * that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the
+ * need for the user to synchronize the stream after the API call. If the source access order is set to
+ * ::CU_MEMCPY_SRC_ACCESS_ORDER_ANY then it indicates that access to the source pointer can be out of stream order and the
+ * accesses can happen even after the API call returns. This flag is suited for host pointers allocated
+ * outside CUDA (ex., via malloc) when it's known that no prior operations in the stream can be accessing the memory.
+ * Specifying this flag allows the driver to optimize the copy on certain platforms. Each memcpy operation in the batch must
+ * have a valid ::CUmemcpyAttributes corresponding to it including the appropriate srcAccessOrder setting, otherwise the API
+ * will return ::CUDA_ERROR_INVALID_VALUE.
+ *
+ * The ::CUmemcpyAttributes::srcLocHint and ::CUmemcpyAttributes::dstLocHint allows applications to specify hint locations
+ * for operands of a copy when the operand doesn't have a fixed location. That is, these hints are
+ * only applicable for managed memory pointers on devices where ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS is true or
+ * system-allocated pageable memory on devices where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS is true.
+ * For other cases, these hints are ignored.
+ *
+ * The ::CUmemcpyAttributes::flags field can be used to specify certain flags for copies. Setting the
+ * ::CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE flag indicates that the associated copies should preferably overlap with
+ * any compute work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy.
+ *
+ * If any error is encountered while parsing the batch, the index within the batch where the error was encountered
+ * will be returned in \p failIdx.
+ *
+ * \param dsts - Array of destination pointers.
+ * \param srcs - Array of memcpy source pointers.
+ * \param sizes - Array of sizes for memcpy operations.
+ * \param count - Size of \p dsts, \p srcs and \p sizes arrays
+ * \param attrs - Array of memcpy attributes.
+ * \param attrsIdxs - Array of indices to specify which copies each entry in the \p attrs array applies to.
+                      The attributes specified in attrs[k] will be applied to copies starting from attrsIdxs[k]
+                      through attrsIdxs[k+1] - 1. Also attrs[numAttrs-1] will apply to copies starting from
+                      attrsIdxs[numAttrs-1] through count - 1.
+ * \param numAttrs - Size of \p attrs and \p attrsIdxs arrays.
+ * \param failIdx - Pointer to a location to return the index of the copy where a failure was encountered.
+                    The value will be SIZE_MAX if the error doesn't pertain to any specific copy.
+ * \param hStream - The stream to enqueue the operations in. Must not be legacy NULL stream.
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_DEINITIALIZED
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_async
+ * \note_memcpy
+ */
+CUresult CUDAAPI cuMemcpyBatchAsync(CUdeviceptr *dsts, CUdeviceptr *srcs, size_t *sizes, size_t count,
+                                    CUmemcpyAttributes *attrs, size_t *attrsIdxs, size_t numAttrs,
+                                    size_t *failIdx, CUstream hStream);
+
+/**
+ * \brief Performs a batch of 3D memory copies asynchronously.
+ *
+ * Performs a batch of memory copies. The batch as a whole executes in stream order but copies within a
+ * batch are not guaranteed to execute in any specific order. Note that this means specifying any dependent
+ * copies within a batch will result in undefined behavior.
+ *
+ * Performs memory copies as specified in the \p opList array. The length of this array is specified in \p numOps.
+ * Each entry in this array describes a copy operation. This includes among other things, the source and destination
+ * operands for the copy as specified in ::CUDA_MEMCPY3D_BATCH_OP::src and ::CUDA_MEMCPY3D_BATCH_OP::dst respectively.
+ * The source and destination operands of a copy can either be a pointer or a CUDA array. The width, height and depth
+ * of a copy is specified in ::CUDA_MEMCPY3D_BATCH_OP::extent. The width, height and depth of a copy are specified in
+ * elements and must not be zero. For pointer-to-pointer copies, the element size is considered to be 1. For pointer
+ * to CUDA array or vice versa copies, the element size is determined by the CUDA array. For CUDA array to CUDA array copies,
+ * the element size of the two CUDA arrays must match.
+ *
+ * For a given operand, if ::CUmemcpy3DOperand::type is specified as ::CU_MEMCPY_OPERAND_TYPE_POINTER, then
+ * ::CUmemcpy3DOperand::op::ptr will be used. The ::CUmemcpy3DOperand::op::ptr::ptr field must contain the pointer where
+ * the copy should begin. The ::CUmemcpy3DOperand::op::ptr::rowLength field specifies the length of each row in elements and
+ * must either be zero or be greater than or equal to the width of the copy specified in ::CUDA_MEMCPY3D_BATCH_OP::extent::width.
+ * The ::CUmemcpy3DOperand::op::ptr::layerHeight field specifies the height of each layer and must either be zero or be greater than
+ * or equal to the height of the copy specified in ::CUDA_MEMCPY3D_BATCH_OP::extent::height. When either of these values is zero,
+ * that aspect of the operand is considered to be tightly packed according to the copy extent. For managed memory pointers on devices where
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS is true or system-allocated pageable memory on devices where
+ * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS is true, the ::CUmemcpy3DOperand::op::ptr::locHint field can be used to hint
+ * the location of the operand.
+ *
+ * If an operand's type is specified as ::CU_MEMCPY_OPERAND_TYPE_ARRAY, then ::CUmemcpy3DOperand::op::array will be used.
+ * The ::CUmemcpy3DOperand::op::array::array field specifies the CUDA array and ::CUmemcpy3DOperand::op::array::offset specifies
+ * the 3D offset into that array where the copy begins.
+ *
+ * The ::CUmemcpyAttributes::srcAccessOrder indicates the source access ordering to be observed for copies associated
+ * with the attribute. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM, then the source will
+ * be accessed in stream order. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL then
+ * it indicates that access to the source pointer can be out of stream order and all accesses must be complete before
+ * the API call returns. This flag is suited for ephemeral sources (ex., stack variables) when it's known that no prior
+ * operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope
+ * that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the
+ * need for the user to synchronize the stream after the API call. If the source access order is set to
+ * ::CU_MEMCPY_SRC_ACCESS_ORDER_ANY then it indicates that access to the source pointer can be out of stream order and the
+ * accesses can happen even after the API call returns. This flag is suited for host pointers allocated
+ * outside CUDA (ex., via malloc) when it's known that no prior operations in the stream can be accessing the memory.
+ * Specifying this flag allows the driver to optimize the copy on certain platforms. Each memcopy operation in \p opList must
+ * have a valid srcAccessOrder setting, otherwise this API will return ::CUDA_ERROR_INVALID_VALUE.
+ *
+ * The ::CUmemcpyAttributes::flags field can be used to specify certain flags for copies. Setting the
+ * ::CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE flag indicates that the associated copies should preferably overlap with
+ * any compute work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy.
+ *
+ * If any error is encountered while parsing the batch, the index within the batch where the error was encountered
+ * will be returned in \p failIdx.
+ *
+ * \param numOps - Total number of memcpy operations.
+ * \param opList - Array of size \p numOps containing the actual memcpy operations.
+ * \param failIdx - Pointer to a location to return the index of the copy where a failure was encountered.
+ *                  The value will be SIZE_MAX if the error doesn't pertain to any specific copy.
+ * \param flags - Flags for future use, must be zero now.
+ * \param hStream - The stream to enqueue the operations in. Must not be default NULL stream.
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_DEINITIALIZED
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_async
+ * \note_memcpy
+ */
+CUresult CUDAAPI cuMemcpy3DBatchAsync(size_t numOps, CUDA_MEMCPY3D_BATCH_OP *opList,
+                                      size_t *failIdx, unsigned long long flags, CUstream hStream);
+
 /**
  * \brief Initializes device memory
  *
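A minimal sketch of a pointer-to-pointer batch using the attribute-index scheme described above (not part of the upstream header): one attribute set covers both copies. dA, dB, dX, dY and stream are hypothetical pre-existing handles; fields of CUmemcpyAttributes other than srcAccessOrder are left zeroed.

    CUdeviceptr dsts[2] = { dX, dY };
    CUdeviceptr srcs[2] = { dA, dB };
    size_t sizes[2] = { 1 << 20, 1 << 20 };
    CUmemcpyAttributes attr = {0};
    attr.srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;
    size_t attrsIdxs[1] = { 0 };   /* attr applies to copies 0..count-1 */
    size_t failIdx = 0;
    cuMemcpyBatchAsync(dsts, srcs, sizes, 2, &attr, attrsIdxs, 1, &failIdx, stream);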
@@ -11139,8 +11981,51 @@ CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsi
         CU_AD_FORMAT_SIGNED_INT16 = 0x09,
         CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
         CU_AD_FORMAT_HALF = 0x10,
-        CU_AD_FORMAT_FLOAT = 0x20
-    } CUarray_format;
+        CU_AD_FORMAT_FLOAT = 0x20,
+        CU_AD_FORMAT_NV12 = 0xb0,
+        CU_AD_FORMAT_UNORM_INT8X1 = 0xc0,
+        CU_AD_FORMAT_UNORM_INT8X2 = 0xc1,
+        CU_AD_FORMAT_UNORM_INT8X4 = 0xc2,
+        CU_AD_FORMAT_UNORM_INT16X1 = 0xc3,
+        CU_AD_FORMAT_UNORM_INT16X2 = 0xc4,
+        CU_AD_FORMAT_UNORM_INT16X4 = 0xc5,
+        CU_AD_FORMAT_SNORM_INT8X1 = 0xc6,
+        CU_AD_FORMAT_SNORM_INT8X2 = 0xc7,
+        CU_AD_FORMAT_SNORM_INT8X4 = 0xc8,
+        CU_AD_FORMAT_SNORM_INT16X1 = 0xc9,
+        CU_AD_FORMAT_SNORM_INT16X2 = 0xca,
+        CU_AD_FORMAT_SNORM_INT16X4 = 0xcb,
+        CU_AD_FORMAT_BC1_UNORM = 0x91,
+        CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92,
+        CU_AD_FORMAT_BC2_UNORM = 0x93,
+        CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94,
+        CU_AD_FORMAT_BC3_UNORM = 0x95,
+        CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96,
+        CU_AD_FORMAT_BC4_UNORM = 0x97,
+        CU_AD_FORMAT_BC4_SNORM = 0x98,
+        CU_AD_FORMAT_BC5_UNORM = 0x99,
+        CU_AD_FORMAT_BC5_SNORM = 0x9a,
+        CU_AD_FORMAT_BC6H_UF16 = 0x9b,
+        CU_AD_FORMAT_BC6H_SF16 = 0x9c,
+        CU_AD_FORMAT_BC7_UNORM = 0x9d,
+        CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e,
+        CU_AD_FORMAT_P010 = 0x9f,
+        CU_AD_FORMAT_P016 = 0xa1,
+        CU_AD_FORMAT_NV16 = 0xa2,
+        CU_AD_FORMAT_P210 = 0xa3,
+        CU_AD_FORMAT_P216 = 0xa4,
+        CU_AD_FORMAT_YUY2 = 0xa5,
+        CU_AD_FORMAT_Y210 = 0xa6,
+        CU_AD_FORMAT_Y216 = 0xa7,
+        CU_AD_FORMAT_AYUV = 0xa8,
+        CU_AD_FORMAT_Y410 = 0xa9,
+        CU_AD_FORMAT_Y416 = 0xb1,
+        CU_AD_FORMAT_Y444_PLANAR8 = 0xb2,
+        CU_AD_FORMAT_Y444_PLANAR10 = 0xb3,
+        CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4,
+        CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5,
+        CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50,
+    } CUarray_format;
  * \endcode
  * - \p NumChannels specifies the number of packed components per CUDA array
  * element; it may be 1, 2, or 4;
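A minimal sketch of allocating an array with one of the formats added above (not part of the upstream header). The descriptor fields follow the surrounding cuArrayCreate documentation; setting NumChannels to 1 for the packed ..X4 formats is an assumption, and support for any given format is device/driver dependent.

    CUarray arr;
    CUDA_ARRAY_DESCRIPTOR desc;
    desc.Width = 1024;
    desc.Height = 768;
    desc.Format = CU_AD_FORMAT_UNORM_INT8X4;  /* 4 packed 8-bit unsigned-normalized components */
    desc.NumChannels = 1;                     /* assumed: channel count implied by the ..X4 format */
    cuArrayCreate(&arr, &desc);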
@@ -11459,7 +12344,50 @@ CUresult CUDAAPI cuArrayDestroy(CUarray hArray);
         CU_AD_FORMAT_SIGNED_INT16 = 0x09,
         CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
         CU_AD_FORMAT_HALF = 0x10,
-        CU_AD_FORMAT_FLOAT = 0x20
+        CU_AD_FORMAT_FLOAT = 0x20,
+        CU_AD_FORMAT_NV12 = 0xb0,
+        CU_AD_FORMAT_UNORM_INT8X1 = 0xc0,
+        CU_AD_FORMAT_UNORM_INT8X2 = 0xc1,
+        CU_AD_FORMAT_UNORM_INT8X4 = 0xc2,
+        CU_AD_FORMAT_UNORM_INT16X1 = 0xc3,
+        CU_AD_FORMAT_UNORM_INT16X2 = 0xc4,
+        CU_AD_FORMAT_UNORM_INT16X4 = 0xc5,
+        CU_AD_FORMAT_SNORM_INT8X1 = 0xc6,
+        CU_AD_FORMAT_SNORM_INT8X2 = 0xc7,
+        CU_AD_FORMAT_SNORM_INT8X4 = 0xc8,
+        CU_AD_FORMAT_SNORM_INT16X1 = 0xc9,
+        CU_AD_FORMAT_SNORM_INT16X2 = 0xca,
+        CU_AD_FORMAT_SNORM_INT16X4 = 0xcb,
+        CU_AD_FORMAT_BC1_UNORM = 0x91,
+        CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92,
+        CU_AD_FORMAT_BC2_UNORM = 0x93,
+        CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94,
+        CU_AD_FORMAT_BC3_UNORM = 0x95,
+        CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96,
+        CU_AD_FORMAT_BC4_UNORM = 0x97,
+        CU_AD_FORMAT_BC4_SNORM = 0x98,
+        CU_AD_FORMAT_BC5_UNORM = 0x99,
+        CU_AD_FORMAT_BC5_SNORM = 0x9a,
+        CU_AD_FORMAT_BC6H_UF16 = 0x9b,
+        CU_AD_FORMAT_BC6H_SF16 = 0x9c,
+        CU_AD_FORMAT_BC7_UNORM = 0x9d,
+        CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e,
+        CU_AD_FORMAT_P010 = 0x9f,
+        CU_AD_FORMAT_P016 = 0xa1,
+        CU_AD_FORMAT_NV16 = 0xa2,
+        CU_AD_FORMAT_P210 = 0xa3,
+        CU_AD_FORMAT_P216 = 0xa4,
+        CU_AD_FORMAT_YUY2 = 0xa5,
+        CU_AD_FORMAT_Y210 = 0xa6,
+        CU_AD_FORMAT_Y216 = 0xa7,
+        CU_AD_FORMAT_AYUV = 0xa8,
+        CU_AD_FORMAT_Y410 = 0xa9,
+        CU_AD_FORMAT_Y416 = 0xb1,
+        CU_AD_FORMAT_Y444_PLANAR8 = 0xb2,
+        CU_AD_FORMAT_Y444_PLANAR10 = 0xb3,
+        CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4,
+        CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5,
+        CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50,
     } CUarray_format;
  * \endcode
  *
@@ -11680,7 +12608,50 @@ CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescripto
         CU_AD_FORMAT_SIGNED_INT16 = 0x09,
         CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
         CU_AD_FORMAT_HALF = 0x10,
-        CU_AD_FORMAT_FLOAT = 0x20
+        CU_AD_FORMAT_FLOAT = 0x20,
+        CU_AD_FORMAT_NV12 = 0xb0,
+        CU_AD_FORMAT_UNORM_INT8X1 = 0xc0,
+        CU_AD_FORMAT_UNORM_INT8X2 = 0xc1,
+        CU_AD_FORMAT_UNORM_INT8X4 = 0xc2,
+        CU_AD_FORMAT_UNORM_INT16X1 = 0xc3,
+        CU_AD_FORMAT_UNORM_INT16X2 = 0xc4,
+        CU_AD_FORMAT_UNORM_INT16X4 = 0xc5,
+        CU_AD_FORMAT_SNORM_INT8X1 = 0xc6,
+        CU_AD_FORMAT_SNORM_INT8X2 = 0xc7,
+        CU_AD_FORMAT_SNORM_INT8X4 = 0xc8,
+        CU_AD_FORMAT_SNORM_INT16X1 = 0xc9,
+        CU_AD_FORMAT_SNORM_INT16X2 = 0xca,
+        CU_AD_FORMAT_SNORM_INT16X4 = 0xcb,
+        CU_AD_FORMAT_BC1_UNORM = 0x91,
+        CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92,
+        CU_AD_FORMAT_BC2_UNORM = 0x93,
+        CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94,
+        CU_AD_FORMAT_BC3_UNORM = 0x95,
+        CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96,
+        CU_AD_FORMAT_BC4_UNORM = 0x97,
+        CU_AD_FORMAT_BC4_SNORM = 0x98,
+        CU_AD_FORMAT_BC5_UNORM = 0x99,
+        CU_AD_FORMAT_BC5_SNORM = 0x9a,
+        CU_AD_FORMAT_BC6H_UF16 = 0x9b,
+        CU_AD_FORMAT_BC6H_SF16 = 0x9c,
+        CU_AD_FORMAT_BC7_UNORM = 0x9d,
+        CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e,
+        CU_AD_FORMAT_P010 = 0x9f,
+        CU_AD_FORMAT_P016 = 0xa1,
+        CU_AD_FORMAT_NV16 = 0xa2,
+        CU_AD_FORMAT_P210 = 0xa3,
+        CU_AD_FORMAT_P216 = 0xa4,
+        CU_AD_FORMAT_YUY2 = 0xa5,
+        CU_AD_FORMAT_Y210 = 0xa6,
+        CU_AD_FORMAT_Y216 = 0xa7,
+        CU_AD_FORMAT_AYUV = 0xa8,
+        CU_AD_FORMAT_Y410 = 0xa9,
+        CU_AD_FORMAT_Y416 = 0xb1,
+        CU_AD_FORMAT_Y444_PLANAR8 = 0xb2,
+        CU_AD_FORMAT_Y444_PLANAR10 = 0xb3,
+        CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4,
+        CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5,
+        CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50,
     } CUarray_format;
  * \endcode
  *
@@ -11842,12 +12813,18 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
  * have identical allocation properties. Users are also expected to retrieve a
  * new handle every time the underlying physical allocation(s) corresponding
  * to a previously queried VA range are changed.
+ *
+ * For CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, users may set
+ * flags to ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE. Which when set on a
+ * supported platform, will give a DMA_BUF handle mapped via PCIE BAR1 or will
+ * return an error otherwise.
  *
  * \param[out] handle - Pointer to the location where the returned handle will be stored.
  * \param[in] dptr - Pointer to a valid CUDA device allocation. Must be aligned to host page size.
  * \param[in] size - Length of the address range. Must be aligned to host page size.
  * \param[in] handleType - Type of handle requested (defines type and size of the \p handle output parameter)
- * \param[in] flags -
+ * \param[in] flags - When requesting CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD the value could be
+ *                    ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE, otherwise 0.
  *
  * \return
  * CUDA_SUCCESS
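A minimal sketch of requesting a PCIe-mapped dma_buf with the new flag (not part of the upstream header; Linux-oriented). dptr and size are hypothetical and must be host-page aligned per the parameter docs.

    int dmabufFd = -1;
    CUresult rc = cuMemGetHandleForAddressRange(&dmabufFd, dptr, size,
                      CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
                      CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
    /* rc is an error if the platform cannot provide a PCIe (BAR1) mapped dma_buf */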
@@ -11856,6 +12833,112 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
  */
 CUresult CUDAAPI cuMemGetHandleForAddressRange(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags);
 
+/**
+ * \brief Bitmasks for CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK.
+ */
+typedef enum CUmemDecompressAlgorithm_enum {
+    CU_MEM_DECOMPRESS_UNSUPPORTED = 0,          /**< Decompression is unsupported. */
+    CU_MEM_DECOMPRESS_ALGORITHM_DEFLATE = 1<<0, /**< Deflate is supported. */
+    CU_MEM_DECOMPRESS_ALGORITHM_SNAPPY = 1<<1   /**< Snappy is supported. */
+} CUmemDecompressAlgorithm;
+
+/**
+ * \brief Structure describing the parameters that compose a single
+ * decompression operation.
+ */
+typedef struct CUmemDecompressParams_st {
+    /** The number of bytes to be read and decompressed from
+     * ::CUmemDecompressParams_st.src. */
+    size_t srcNumBytes;
+    /** The number of bytes that the decompression operation will be expected to
+     * write to ::CUmemDecompressParams_st.dst. This value is optional; if
+     * present, it may be used by the CUDA driver as a heuristic for scheduling
+     * the individual decompression operations. */
+    size_t dstNumBytes;
+    /** After the decompression operation has completed, the actual number of
+     * bytes written to ::CUmemDecompressParams.dst will be recorded as a 32-bit
+     * unsigned integer in the memory at this address. */
+    cuuint32_t *dstActBytes;
+    /** Pointer to a buffer of at least ::CUmemDecompressParams_st.srcNumBytes
+     * compressed bytes. */
+    const void *src;
+    /** Pointer to a buffer where the decompressed data will be written. The
+     * number of bytes written to this location will be recorded in the memory
+     * pointed to by ::CUmemDecompressParams_st.dstActBytes */
+    void *dst;
+    /** The decompression algorithm to use. */
+    CUmemDecompressAlgorithm algo;
+    /* These bytes are unused and must be zeroed. This ensures compatibility if
+     * additional fields are added in the future. */
+    unsigned char padding[20];
+} CUmemDecompressParams;
+
+/**
+ * \brief Submit a batch of \p count independent decompression operations.
+ *
+ * \details Each of the \p count decompression operations is described by a
+ * single entry in the \p paramsArray array. Once the batch has been
+ * submitted, the function will return, and decompression will happen
+ * asynchronously w.r.t. the CPU. To the work completion tracking
+ * mechanisms in the CUDA driver, the batch will be considered a single
+ * unit of work and processed according to stream semantics, i.e., it
+ * is not possible to query the completion of individual decompression
+ * operations within a batch.
+ *
+ * The memory pointed to by each of ::CUmemDecompressParams.src,
+ * ::CUmemDecompressParams.dst, and ::CUmemDecompressParams.dstActBytes,
+ * must be capable of usage with the hardware decompress feature. That
+ * is, for each of said pointers, the pointer attribute
+ * ::CU_POINTER_ATTRIBUTE_IS_MEM_DECOMPRESS_CAPABLE should give a
+ * non-zero value. To ensure this, the memory backing the pointers
+ * should have been allocated using one of the following CUDA memory
+ * allocators:
+ * * ::cuMemAlloc()
+ * * ::cuMemCreate() with the usage flag ::CU_MEM_CREATE_USAGE_HW_DECOMPRESS
+ * * ::cuMemAllocFromPoolAsync() from a pool that was created with
+ *   the usage flag ::CU_MEM_POOL_CREATE_USAGE_HW_DECOMPRESS
+ * Additionally, ::CUmemDecompressParams.src, ::CUmemDecompressParams.dst,
+ * and ::CUmemDecompressParams.dstActBytes, must all be accessible from
+ * the device associated with the context where \p stream was created.
+ * For information on how to ensure this, see the documentation for the
+ * allocator of interest.
+ *
+ * \param[in] paramsArray The array of structures describing the independent
+ *                        decompression operations.
+ * \param[in] count The number of entries in \p paramsArray array.
+ * \param[in] flags Must be 0.
+ * \param[out] errorIndex The index into \p paramsArray of the decompression
+ *                        operation for which the error returned by this
+ *                        function pertains to. If \p index is SIZE_MAX and
+ *                        the value returned is not ::CUDA_SUCCESS, then the
+ *                        error returned by this function should be considered
+ *                        a general error that does not pertain to a
+ *                        particular decompression operation. May be \p NULL,
+ *                        in which case, no index will be recorded in the
+ *                        event of error.
+ * \param[in] stream The stream where the work will be enqueued.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemAlloc, ::cuMemPoolCreate, ::cuMemAllocFromPoolAsync
+ */
+CUresult CUDAAPI cuMemBatchDecompressAsync(
+    CUmemDecompressParams *paramsArray,
+    size_t count,
+    unsigned int flags,
+    size_t *errorIndex,
+    CUstream stream
+);
+
 /** @} */ /* END CUDA_MEM */
 
 /**
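A minimal sketch of driving the batch decompress API added above, assuming a CUDA 12.8-class driver on a device that reports Deflate support via CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK. The function name, the input blob, and its sizes are hypothetical; error handling is reduced to one macro.

    #include <cuda.h>
    #include <stdio.h>
    #include <string.h>

    #define CHECK(x) do { CUresult r_ = (x); if (r_ != CUDA_SUCCESS) { \
        fprintf(stderr, "%s -> %d\n", #x, (int)r_); return r_; } } while (0)

    /* Submit one Deflate-compressed blob for hardware decompression. */
    static CUresult decompress_one(const void *deflateBlob, size_t blobBytes,
                                   size_t expectedOutBytes, CUstream stream)
    {
        CUdeviceptr src, dst, act;
        CHECK(cuMemAlloc(&src, blobBytes));        /* cuMemAlloc memory is decompress-capable */
        CHECK(cuMemAlloc(&dst, expectedOutBytes));
        CHECK(cuMemAlloc(&act, sizeof(cuuint32_t)));
        CHECK(cuMemcpyHtoD(src, deflateBlob, blobBytes));

        CUmemDecompressParams p;
        memset(&p, 0, sizeof(p));                  /* also zeroes the mandatory padding */
        p.srcNumBytes = blobBytes;
        p.dstNumBytes = expectedOutBytes;          /* optional scheduling hint */
        p.dstActBytes = (cuuint32_t *)act;
        p.src  = (const void *)src;
        p.dst  = (void *)dst;
        p.algo = CU_MEM_DECOMPRESS_ALGORITHM_DEFLATE;

        size_t errorIndex = (size_t)-1;
        CHECK(cuMemBatchDecompressAsync(&p, 1, 0, &errorIndex, stream));
        CHECK(cuStreamSynchronize(stream));        /* the batch completes in stream order */
        return CUDA_SUCCESS;
    }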
@@ -11937,17 +13020,23 @@ CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size);
  * set ::CUmemAllocationProp::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
  * ::CUmemAllocationProp::CUmemLocation::id must specify the NUMA ID of the CPU.
  * On systems where NUMA is not available ::CUmemAllocationProp::CUmemLocation::id must be set to 0.
+ * Specifying ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT or ::CU_MEM_LOCATION_TYPE_HOST as the
+ * ::CUmemLocation::type will result in ::CUDA_ERROR_INVALID_VALUE.
+ *
+ * Applications that intend to use ::CU_MEM_HANDLE_TYPE_FABRIC based memory sharing must ensure:
+ * (1) `nvidia-caps-imex-channels` character device is created by the driver and is listed under /proc/devices
+ * (2) have at least one IMEX channel file accessible by the user launching the application.
+ *
+ * When exporter and importer CUDA processes have been granted access to the same IMEX channel, they can securely
+ * share memory.
 *
- *
- *
- *
- *
- *
- *
- *
- * If the allocating process does not have access setup for an IMEX channel, attempting to create
- * a ::CUmemGenericAllocationHandle with ::CU_MEM_HANDLE_TYPE_FABRIC will result in ::CUDA_ERROR_NOT_PERMITTED.
- * The nvidia-modprobe CLI provides more information regarding setting up of IMEX channels.
+ * The IMEX channel security model works on a per-user basis, which means all processes under a user can share
+ * memory if the user has access to a valid IMEX channel. When multi-user isolation is desired, a separate IMEX
+ * channel is required for each user.
+ *
+ * These channel files exist in /dev/nvidia-caps-imex-channels/channel* and can be created using standard OS
+ * native calls like mknod on Linux. For example: To create channel0 with the major number from /proc/devices
+ * users can execute the following command: `mknod /dev/nvidia-caps-imex-channels/channel0 c <major number> 0`
  *
  * If ::CUmemAllocationProp::allocFlags::usage contains ::CU_MEM_CREATE_USAGE_TILE_POOL flag then
  * the memory allocation is intended only to be used as backing tile pool for sparse CUDA arrays
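A hedged sketch of the allocation path the new paragraphs describe: a fabric-shareable, host-NUMA-pinned physical allocation. The NUMA id is a placeholder, and it assumes an IMEX channel was already created per the mknod recipe above.

    #include <cuda.h>
    #include <string.h>

    /* Allocate one minimum-granularity chunk on host NUMA node `numaId`,
     * exportable via a fabric handle. */
    static CUresult alloc_fabric_host_numa(int numaId, CUmemGenericAllocationHandle *out)
    {
        CUmemAllocationProp prop;
        memset(&prop, 0, sizeof(prop));
        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
        prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA; /* HOST/_NUMA_CURRENT are rejected */
        prop.location.id = numaId;
        prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;

        size_t gran = 0;
        CUresult rc = cuMemGetAllocationGranularity(&gran, &prop,
                                                    CU_MEM_ALLOC_GRANULARITY_MINIMUM);
        if (rc != CUDA_SUCCESS)
            return rc;
        /* CUDA_ERROR_NOT_PERMITTED here usually means no accessible IMEX channel. */
        return cuMemCreate(out, gran, &prop, 0);
    }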
@@ -12637,25 +13726,31 @@ CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPo
  * Creates a CUDA memory pool and returns the handle in \p pool. The \p poolProps determines
  * the properties of the pool such as the backing device and IPC capabilities.
  *
- * To create a memory pool targeting a specific host NUMA node, applications must
- * set ::CUmemPoolProps::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
- * ::CUmemPoolProps::CUmemLocation::id must specify the NUMA ID of the host memory node.
+ * To create a memory pool targeting a specific host NUMA node, applications must
+ * set ::CUmemPoolProps::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ * ::CUmemPoolProps::CUmemLocation::id must specify the NUMA ID of the host memory node.
+ * Specifying ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT or ::CU_MEM_LOCATION_TYPE_HOST as the
+ * ::CUmemPoolProps::CUmemLocation::type will result in ::CUDA_ERROR_INVALID_VALUE.
  * By default, the pool's memory will be accessible from the device it is allocated on.
  * In the case of pools created with ::CU_MEM_LOCATION_TYPE_HOST_NUMA, their default accessibility
  * will be from the host CPU.
  * Applications can control the maximum size of the pool by specifying a non-zero value for ::CUmemPoolProps::maxSize.
  * If set to 0, the maximum size of the pool will default to a system dependent value.
  *
- * Applications
- *
- *
- *
- *
- *
- *
- *
- * a
- *
+ * Applications that intend to use ::CU_MEM_HANDLE_TYPE_FABRIC based memory sharing must ensure:
+ * (1) `nvidia-caps-imex-channels` character device is created by the driver and is listed under /proc/devices
+ * (2) have at least one IMEX channel file accessible by the user launching the application.
+ *
+ * When exporter and importer CUDA processes have been granted access to the same IMEX channel, they can securely
+ * share memory.
+ *
+ * The IMEX channel security model works on a per-user basis, which means all processes under a user can share
+ * memory if the user has access to a valid IMEX channel. When multi-user isolation is desired, a separate IMEX
+ * channel is required for each user.
+ *
+ * These channel files exist in /dev/nvidia-caps-imex-channels/channel* and can be created using standard OS
+ * native calls like mknod on Linux. For example: To create channel0 with the major number from /proc/devices
+ * users can execute the following command: `mknod /dev/nvidia-caps-imex-channels/channel0 c <major number> 0`
  *
  * \note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC.
  *
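A minimal sketch of creating a pool targeting a host NUMA node, as the rewritten paragraph describes. The node id, size cap, and `stream` are illustrative placeholders.

    #include <cuda.h>
    #include <string.h>

    /* Create a pool backed by host NUMA node 0 and take one 1 MiB allocation. */
    static CUresult make_numa_pool(CUstream stream, CUmemoryPool *outPool, CUdeviceptr *outPtr)
    {
        CUmemPoolProps props;
        memset(&props, 0, sizeof(props));
        props.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
        props.handleTypes = CU_MEM_HANDLE_TYPE_NONE;          /* no IPC support */
        props.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA; /* HOST/_NUMA_CURRENT are rejected */
        props.location.id = 0;
        props.maxSize = (size_t)256 << 20;                    /* cap the pool at 256 MiB */

        CUresult rc = cuMemPoolCreate(outPool, &props);
        if (rc != CUDA_SUCCESS)
            return rc;
        return cuMemAllocFromPoolAsync(outPtr, 1 << 20, *outPool, stream);
    }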
@@ -12962,8 +14057,8 @@ CUresult CUDAAPI cuMulticastAddDevice(CUmemGenericAllocationHandle mcHandle, CUd
  * returned by ::cuMulticastGetGranularity with the flag
  * ::CU_MULTICAST_GRANULARITY_RECOMMENDED.
  *
- * The \p size + \p memOffset
- * memory. Similarly the \p size + \p mcOffset
+ * The \p size + \p memOffset cannot be larger than the size of the allocated
+ * memory. Similarly the \p size + \p mcOffset cannot be larger than the size
  * of the multicast object.
  * The memory allocation must have been created on one of the devices
  * that was added to the multicast team via ::cuMulticastAddDevice.
@@ -13010,8 +14105,8 @@ CUresult CUDAAPI cuMulticastBindMem(CUmemGenericAllocationHandle mcHandle, size_
  * aligned to the value returned by ::cuMulticastGetGranularity with the flag
  * ::CU_MULTICAST_GRANULARITY_RECOMMENDED.
  *
- * The \p size
- * Similarly the \p size + \p mcOffset
+ * The \p size cannot be larger than the size of the allocated memory.
+ * Similarly the \p size + \p mcOffset cannot be larger than the total size
  * of the multicast object.
  * The memory allocation must have been created on one of the devices
  * that was added to the multicast team via ::cuMulticastAddDevice.
@@ -13052,7 +14147,7 @@ CUresult CUDAAPI cuMulticastBindAddr(CUmemGenericAllocationHandle mcHandle, size
  * The intended \p size of the unbind and the offset in the multicast range
  * ( \p mcOffset ) must be a multiple of the value returned by
  * ::cuMulticastGetGranularity flag ::CU_MULTICAST_GRANULARITY_MINIMUM.
- * The \p size + \p mcOffset
+ * The \p size + \p mcOffset cannot be larger than the total size of the
  * multicast object.
 *
  * \note
@@ -13343,6 +14438,12 @@ CUresult CUDAAPI cuMulticastGetGranularity(size_t *granularity, const CUmulticas
  *
  * Returns in \p *data the handle to the mempool that the allocation was obtained from.
  *
+ * - ::CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE:
+ *
+ * Returns in \p *data a boolean that indicates whether the pointer points
+ * to memory that is capable to be used for hardware accelerated
+ * decompression.
+ *
  * \par
  *
  * Note that for most allocations in the unified virtual address space
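A small sketch of querying the new pointer attribute; `dptr` is assumed to come from one of the allocators listed under cuMemBatchDecompressAsync above.

    #include <cuda.h>

    /* Returns non-zero when `dptr` may be used as an operand of
     * cuMemBatchDecompressAsync. */
    static int is_decompress_capable(CUdeviceptr dptr)
    {
        int capable = 0;
        if (cuPointerGetAttribute(&capable,
                                  CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE,
                                  dptr) != CUDA_SUCCESS)
            return 0;
        return capable;
    }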
@@ -13397,7 +14498,9 @@ CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute
  * base device pointer of the memory to be prefetched and \p dstDevice is the
  * destination device. \p count specifies the number of bytes to copy. \p hStream
  * is the stream in which the operation is enqueued. The memory range must refer
- * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables
+ * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables
+ * or it may also refer to system-allocated memory on systems with non-zero
+ * CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
  *
  * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If
  * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
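A brief sketch of the prefetch flow this doc block covers, assuming an existing managed allocation, device, and stream.

    #include <cuda.h>

    /* Warm a managed range on `dev`, then return the pages to the host. */
    static void prefetch_round_trip(CUdeviceptr managed, size_t nbytes,
                                    CUdevice dev, CUstream stream)
    {
        cuMemPrefetchAsync(managed, nbytes, dev, stream);           /* migrate to the GPU */
        /* ... launch kernels that touch `managed` on `stream` ... */
        cuMemPrefetchAsync(managed, nbytes, CU_DEVICE_CPU, stream); /* back to host memory */
    }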
@@ -13957,6 +15060,7 @@ CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute at
  * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE
  * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES
  * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE
+ * - ::CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE
  *
  * \param numAttributes - Number of attributes to query
  * \param attributes - An array of attributes to query
@@ -14027,8 +15131,10 @@ CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_at
  *
  * \sa ::cuStreamDestroy,
  * ::cuStreamCreateWithPriority,
+ * ::cuGreenCtxStreamCreate,
  * ::cuStreamGetPriority,
  * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice
  * ::cuStreamWaitEvent,
  * ::cuStreamQuery,
  * ::cuStreamSynchronize,
@@ -14078,9 +15184,11 @@ CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags);
  *
  * \sa ::cuStreamDestroy,
  * ::cuStreamCreate,
+ * ::cuGreenCtxStreamCreate,
  * ::cuStreamGetPriority,
  * ::cuCtxGetStreamPriorityRange,
  * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice
  * ::cuStreamWaitEvent,
  * ::cuStreamQuery,
  * ::cuStreamSynchronize,
@@ -14093,7 +15201,7 @@ CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int fla
 /**
  * \brief Query the priority of a given stream
  *
- * Query the priority of a stream created using ::cuStreamCreate or ::
+ * Query the priority of a stream created using ::cuStreamCreate, ::cuStreamCreateWithPriority or ::cuGreenCtxStreamCreate
  * and return the priority in \p priority. Note that if the stream was created with a
  * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
  * this function returns the clamped priority.
@@ -14114,16 +15222,44 @@ CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int fla
  * \sa ::cuStreamDestroy,
  * ::cuStreamCreate,
  * ::cuStreamCreateWithPriority,
+ * ::cuGreenCtxStreamCreate,
  * ::cuCtxGetStreamPriorityRange,
  * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice
  * ::cudaStreamGetPriority
  */
 CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
 
+/**
+ * \brief Returns the device handle of the stream
+ *
+ * Returns in \p *device the device handle of the stream
+ *
+ * \param hStream - Handle to the stream to be queried
+ * \param device - Returns the device to which a stream belongs
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuGreenCtxStreamCreate,
+ * ::cuStreamGetFlags
+ */
+CUresult CUDAAPI cuStreamGetDevice(CUstream hStream, CUdevice *device);
+
 /**
  * \brief Query the flags of a given stream
  *
- * Query the flags of a stream created using ::cuStreamCreate or ::
+ * Query the flags of a stream created using ::cuStreamCreate, ::cuStreamCreateWithPriority or ::cuGreenCtxStreamCreate
  * and return the flags in \p flags.
  *
  * \param hStream - Handle to the stream to be queried
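A small sketch of the new cuStreamGetDevice entry point; `stream` is an assumed existing handle.

    #include <cuda.h>
    #include <stdio.h>

    static void print_stream_device(CUstream stream)
    {
        CUdevice dev;
        char name[256];
        if (cuStreamGetDevice(stream, &dev) == CUDA_SUCCESS &&
            cuDeviceGetName(name, sizeof(name), dev) == CUDA_SUCCESS)
            printf("stream runs on %s\n", name);
    }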
@@ -14143,8 +15279,10 @@ CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
  *
  * \sa ::cuStreamDestroy,
  * ::cuStreamCreate,
+ * ::cuGreenCtxStreamCreate,
  * ::cuStreamGetPriority,
  * ::cudaStreamGetFlags
+ * ::cuStreamGetDevice
  */
 CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags);
 
@@ -14186,6 +15324,10 @@ CUresult CUDAAPI cuStreamGetId(CUstream hStream, unsigned long long *streamId);
  *
  * Returns the CUDA context that the stream is associated with.
  *
+ * Note there is a later version of this API, ::cuStreamGetCtx_v2. It will
+ * supplant this version in CUDA 13.0. It is recommended to use ::cuStreamGetCtx_v2
+ * till then as this version will return ::CUDA_ERROR_NOT_SUPPORTED for streams created via the API ::cuGreenCtxStreamCreate.
+ *
  * The stream handle \p hStream can refer to any of the following:
  * <ul>
  * <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
@@ -14210,21 +15352,82 @@ CUresult CUDAAPI cuStreamGetId(CUstream hStream, unsigned long long *streamId);
  * ::CUDA_ERROR_NOT_INITIALIZED,
  * ::CUDA_ERROR_INVALID_CONTEXT,
  * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
  * \notefnerr
  *
  * \sa ::cuStreamDestroy,
  * ::cuStreamCreateWithPriority,
  * ::cuStreamGetPriority,
  * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice
  * ::cuStreamWaitEvent,
  * ::cuStreamQuery,
  * ::cuStreamSynchronize,
  * ::cuStreamAddCallback,
  * ::cudaStreamCreate,
+ * ::cuStreamGetCtx_v2,
  * ::cudaStreamCreateWithFlags
  */
 CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
 
+/**
+ * \brief Query the contexts associated with a stream
+ *
+ * Returns the contexts that the stream is associated with.
+ *
+ * If the stream is associated with a green context, the API returns the green context in \p pGreenCtx
+ * and the primary context of the associated device in \p pCtx.
+ *
+ * If the stream is associated with a regular context, the API returns the regular context in \p pCtx
+ * and NULL in \p pGreenCtx.
+ *
+ * The stream handle \p hStream can refer to any of the following:
+ * <ul>
+ * <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate,
+ * ::cuStreamCreateWithPriority and ::cuGreenCtxStreamCreate, or their runtime API equivalents such as
+ * ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
+ * Passing an invalid handle will result in undefined behavior.</li>
+ * <li>any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
+ * ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
+ * which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.
+ * If any of the special handles are specified, the API will operate on the context current to the
+ * calling thread. If a green context (that was converted via ::cuCtxFromGreenCtx() before setting it current)
+ * is current to the calling thread, the API will return the green context in \p pGreenCtx
+ * and the primary context of the associated device in \p pCtx. If a regular context is current,
+ * the API returns the regular context in \p pCtx and NULL in \p pGreenCtx.
+ * Note that specifying ::CU_STREAM_PER_THREAD or ::cudaStreamPerThread will return ::CUDA_ERROR_INVALID_HANDLE
+ * if a green context is current to the calling thread.
+ * If no context is current to the calling thread, ::CUDA_ERROR_INVALID_CONTEXT is returned.</li>
+ * </ul>
+ *
+ * \param hStream - Handle to the stream to be queried
+ * \param pCtx - Returned regular context associated with the stream
+ * \param pGreenCtx - Returned green context if the stream is associated with a green context or NULL if not
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate
+ * ::cuStreamCreateWithPriority,
+ * ::cuGreenCtxStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags,
+ */
+CUresult CUDAAPI cuStreamGetCtx_v2(CUstream hStream, CUcontext *pCtx, CUgreenCtx *pGreenCtx);
+
 /**
  * \brief Make a compute stream wait on an event
  *
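A sketch of the green-context/regular-context distinction that cuStreamGetCtx_v2 exposes; `stream` is an assumed existing handle.

    #include <cuda.h>
    #include <stddef.h>

    static void classify_stream(CUstream stream)
    {
        CUcontext ctx = NULL;
        CUgreenCtx greenCtx = NULL;
        if (cuStreamGetCtx_v2(stream, &ctx, &greenCtx) != CUDA_SUCCESS)
            return;
        if (greenCtx != NULL) {
            /* green-context stream: ctx is the device's primary context */
        } else {
            /* regular stream: ctx is the context the stream belongs to */
        }
    }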
@@ -14545,6 +15748,7 @@ CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph);
  */
 CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus);
 
+
 /**
  * \brief Query a stream's capture state
  *
@@ -15031,7 +16235,8 @@ CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags);
  * \brief Records an event
  *
  * Captures in \p hEvent the contents of \p hStream at the time of this call.
- * \p hEvent and \p hStream must be from the same context
+ * \p hEvent and \p hStream must be from the same context otherwise
+ * ::CUDA_ERROR_INVALID_HANDLE is returned.
  * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
  * examine or wait for completion of the work that was captured. Uses of
  * \p hStream after this call do not modify \p hEvent. See note on default
@@ -15073,7 +16278,8 @@ CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream);
  * \brief Records an event
  *
  * Captures in \p hEvent the contents of \p hStream at the time of this call.
- * \p hEvent and \p hStream must be from the same context
+ * \p hEvent and \p hStream must be from the same context otherwise
+ * ::CUDA_ERROR_INVALID_HANDLE is returned.
  * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
  * examine or wait for completion of the work that was captured. Uses of
  * \p hStream after this call do not modify \p hEvent. See note on default
@@ -15231,6 +16437,9 @@ CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
  * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with
  * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return
  * ::CUDA_ERROR_INVALID_HANDLE.
+ *
+ * Note there is a later version of this API, ::cuEventElapsedTime_v2. It will
+ * supplant this version in CUDA 13.0, which is retained for minor version compatibility.
  *
  * \param pMilliseconds - Time between \p hStart and \p hEnd in ms
  * \param hStart - Starting event
@@ -15255,6 +16464,54 @@ CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
  */
 CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
 
+/**
+ * \brief Computes the elapsed time between two events
+ *
+ * Computes the elapsed time between two events (in milliseconds with a
+ * resolution of around 0.5 microseconds). Note this API is not guaranteed
+ * to return the latest errors for pending work. As such this API is intended to
+ * serve as an elapsed time calculation only and any polling for completion on the
+ * events to be compared should be done with ::cuEventQuery instead.
+ *
+ * If either event was last recorded in a non-NULL stream, the resulting time
+ * may be greater than expected (even if both used the same stream handle). This
+ * happens because the ::cuEventRecord() operation takes place asynchronously
+ * and there is no guarantee that the measured latency is actually just between
+ * the two events. Any number of other different stream operations could execute
+ * in between the two measured events, thus altering the timing in a significant
+ * way.
+ *
+ * If ::cuEventRecord() has not been called on either event then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called
+ * on both events but one or both of them has not yet been completed (that is,
+ * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the
+ * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with
+ * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return
+ * ::CUDA_ERROR_INVALID_HANDLE.
+ *
+ * \param pMilliseconds - Time between \p hStart and \p hEnd in ms
+ * \param hStart - Starting event
+ * \param hEnd - Ending event
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_READY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventRecord,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuEventDestroy,
+ * ::cudaEventElapsedTime
+ */
+CUresult CUDAAPI cuEventElapsedTime_v2(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+
 /** @} */ /* END CUDA_EVENT */
 
 /**
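A minimal timing sketch using the _v2 entry point added above; `stream` and the timed work are assumed, and error checks are omitted for brevity.

    #include <cuda.h>

    /* Returns the wall time, in ms, between the two recorded points on `stream`. */
    static float time_region(CUstream stream)
    {
        CUevent start, stop;
        float ms = 0.0f;
        cuEventCreate(&start, CU_EVENT_DEFAULT);
        cuEventCreate(&stop, CU_EVENT_DEFAULT);
        cuEventRecord(start, stream);
        /* ... enqueue the work to be timed on `stream` ... */
        cuEventRecord(stop, stream);
        cuEventSynchronize(stop);    /* poll completion here, not via the timer call */
        cuEventElapsedTime_v2(&ms, start, stop);
        cuEventDestroy(start);
        cuEventDestroy(stop);
        return ms;
    }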
@@ -15308,7 +16565,7 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven
         CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5,
         CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6,
         CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7,
-        CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8
+        CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8,
     } CUexternalMemoryHandleType;
  * \endcode
  *
@@ -16280,6 +17538,9 @@ CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunc
|
|
|
16280
17538
|
* positive. The validity of the cluster dimensions is checked at launch time.
|
|
16281
17539
|
* If the value is set during compile time, it cannot be set at runtime.
|
|
16282
17540
|
* Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
|
|
17541
|
+
* - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
|
|
17542
|
+
* the function can be launched with non-portable cluster size. 1 is allowed,
|
|
17543
|
+
* 0 is disallowed.
|
|
16283
17544
|
* - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
|
|
16284
17545
|
* scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
|
|
16285
17546
|
*
|
|
@@ -16679,6 +17940,7 @@ CUresult CUDAAPI cuLaunchKernel(CUfunction f,
  *     CU_LAUNCH_ATTRIBUTE_PRIORITY = 8,
  *     CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP = 9,
  *     CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN = 10,
+ *     CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION = 11,
  *     CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT = 12,
  *     CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE = 13,
  * } CUlaunchAttributeID;
@@ -16706,6 +17968,11 @@ CUresult CUDAAPI cuLaunchKernel(CUfunction f,
  *     CUlaunchMemSyncDomainMap memSyncDomainMap;
  *     CUlaunchMemSyncDomain memSyncDomain;
  *     struct {
+ *         unsigned int x;
+ *         unsigned int y;
+ *         unsigned int z;
+ *     } preferredClusterDim;
+ *     struct {
  *         CUevent event;
  *         int flags;
  *     } launchCompletionEvent;
@@ -16776,6 +18043,36 @@ CUresult CUDAAPI cuLaunchKernel(CUfunction f,
  * opt out, and any attempt to set the attribute to 0 will result in an error. Graphs
  * containing one or more device-updatable node also do not allow multiple instantiation.
  *
+ * ::CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION allows the kernel launch to
+ * specify a preferred substitute cluster dimension. Blocks may be grouped
+ * according to either the dimensions specified with this attribute (grouped
+ * into a "preferred substitute cluster"), or the one specified with
+ * ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION attribute (grouped into a "regular
+ * cluster"). The cluster dimensions of a "preferred substitute cluster" shall
+ * be an integer multiple greater than zero of the regular cluster dimensions.
+ * The device will attempt - on a best-effort basis - to group thread blocks
+ * into preferred clusters over grouping them into regular clusters. When it
+ * deems necessary (primarily when the device temporarily runs out of physical
+ * resources to launch the larger preferred clusters), the device may switch to
+ * launch the regular clusters instead to attempt to utilize as much of the
+ * physical device resources as possible.
+ *
+ * Each type of cluster will have its enumeration / coordinate setup as if the
+ * grid consists solely of its type of cluster. For example, if the preferred
+ * substitute cluster dimensions double the regular cluster dimensions, there
+ * might be simultaneously a regular cluster indexed at (1,0,0), and a preferred
+ * cluster indexed at (1,0,0). In this example, the preferred substitute cluster
+ * (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their
+ * blocks.
+ *
+ * This attribute will only take effect when a regular cluster dimension has
+ * been specified. The preferred substitute cluster
+ * dimension must be an integer multiple greater than zero of the regular
+ * cluster dimension and must divide the grid. It must also be no more than
+ * `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`.
+ * Otherwise it must be less than the maximum value the driver can support.
+ * Otherwise, setting this attribute to a value physically unable to fit on any
+ * particular device is permitted.
  *
  * The effect of other attributes is consistent with their effect when set via
  * persistent APIs.
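A hedged sketch of setting both cluster attributes on a launch. `kernel` and `stream` are assumed handles, and the `preferredClusterDim` member follows the CUlaunchAttributeValue union sketch quoted earlier in this diff.

    #include <cuda.h>
    #include <string.h>

    /* Launch with a regular 2-block cluster and a preferred 4-block substitute. */
    static CUresult launch_with_preferred_cluster(CUfunction kernel, CUstream stream)
    {
        CUlaunchAttribute attrs[2];
        memset(attrs, 0, sizeof(attrs));
        attrs[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
        attrs[0].value.clusterDim.x = 2;
        attrs[0].value.clusterDim.y = 1;
        attrs[0].value.clusterDim.z = 1;
        attrs[1].id = CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION;
        attrs[1].value.preferredClusterDim.x = 4; /* integer multiple of the regular dim */
        attrs[1].value.preferredClusterDim.y = 1;
        attrs[1].value.preferredClusterDim.z = 1;

        CUlaunchConfig cfg;
        memset(&cfg, 0, sizeof(cfg));
        cfg.gridDimX = 16;                        /* divisible by both cluster sizes */
        cfg.gridDimY = cfg.gridDimZ = 1;
        cfg.blockDimX = 128;
        cfg.blockDimY = cfg.blockDimZ = 1;
        cfg.hStream = stream;
        cfg.attrs = attrs;
        cfg.numAttrs = 2;
        return cuLaunchKernelEx(&cfg, kernel, NULL, NULL);
    }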
@@ -16844,12 +18141,6 @@ CUresult CUDAAPI cuLaunchKernelEx(const CUlaunchConfig *config,
  * grid of blocks. Each block contains \p blockDimX x \p blockDimY x
  * \p blockDimZ threads.
  *
- * Note that the API can also be used to launch context-less kernel ::CUkernel
- * by querying the handle using ::cuLibraryGetKernel() and then passing it
- * to the API by casting to ::CUfunction. Here, the context to launch
- * the kernel on will either be taken from the specified stream \p hStream
- * or the current context in case of NULL stream.
- *
  * \p sharedMemBytes sets the amount of dynamic shared memory that will be
  * available to each thread block.
  *
@@ -19826,18 +21117,22 @@ CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphN
  * contained \p memsetParams at instantiation. hNode must remain in the graph which was
  * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored.
  *
- *
- *
- *
- *
+ * Zero sized operations are not supported.
+ *
+ * The new destination pointer in memsetParams must be to the same kind of allocation
+ * as the original destination pointer and have the same context association and device mapping
+ * as the original destination pointer.
+ *
+ * Both the value and pointer address may be updated.
+ * Changing other aspects of the memset (width, height, element size or pitch) may cause the update to be rejected.
+ * Specifically, for 2d memsets, all dimension changes are rejected.
+ * For 1d memsets, changes in height are explicitly rejected and other changes are opportunistically allowed
+ * if the resulting work maps onto the work resources already allocated for the node.
  *
  * The modifications only affect future launches of \p hGraphExec. Already enqueued
  * or running launches of \p hGraphExec are not affected by this call. hNode is also
  * not modified by this call.
  *
- * Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or
- * either the original or new memory operand are multidimensional.
- *
  * \param hGraphExec - The executable graph in which to set the specified node
  * \param hNode - Memset node from the graph which was used to instantiate graphExec
  * \param memsetParams - The updated parameters to set
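A sketch of the update rules just described, applied through the existing setter. `graphExec`, `memsetNode`, `newDst`, and `ctx` are assumed to come from a previously instantiated graph.

    #include <cuda.h>
    #include <string.h>

    /* Retarget a 1d memset node to a new destination and fill value. */
    static CUresult update_memset_node(CUgraphExec graphExec, CUgraphNode memsetNode,
                                       CUdeviceptr newDst, size_t widthInBytes, CUcontext ctx)
    {
        CUDA_MEMSET_NODE_PARAMS params;
        memset(&params, 0, sizeof(params));
        params.dst = newDst;         /* must match the original allocation kind/context */
        params.value = 0xFF;         /* value updates are always accepted */
        params.elementSize = 1;
        params.width = widthInBytes; /* 1d width change is only opportunistically accepted */
        params.height = 1;           /* height changes on 1d memsets are rejected */
        params.pitch = 0;
        return cuGraphExecMemsetNodeSetParams(graphExec, memsetNode, &params, ctx);
    }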
@@ -20319,7 +21614,9 @@ CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph);
  * - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
  * - The source/destination memory must be allocated from the same contexts as the original
  *   source/destination memory.
- * -
+ * - For 2d memsets, only address and assigned value may be updated.
+ * - For 1d memsets, updating dimensions is also allowed, but may fail if the resulting operation doesn't
+ *   map onto the work resources already allocated for the node.
  * - Additional memcpy node restrictions:
  *   - Changing either the source or destination memory type(i.e. CU_MEMORYTYPE_DEVICE,
  *     CU_MEMORYTYPE_ARRAY, etc.) is not supported.
@@ -20776,6 +22073,7 @@ CUresult CUDAAPI cuGraphExecNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hN
  * \param hGraph - Graph which will contain the conditional node using this handle.
  * \param ctx - Context for the handle and associated conditional node.
  * \param defaultLaunchValue - Optional initial value for the conditional variable.
+ *                             Applied at the beginning of each graph execution if CU_GRAPH_COND_ASSIGN_DEFAULT is set in \p flags.
  * \param flags - Currently must be CU_GRAPH_COND_ASSIGN_DEFAULT or 0.
  *
  * \return
@@ -20810,6 +22108,11 @@ CUresult CUDAAPI cuGraphConditionalHandleCreate(CUgraphConditionalHandle *pHandl
  * Returns in \p *numBlocks the number of the maximum active blocks per
  * streaming multiprocessor.
  *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will be the current context.
+ *
  * \param numBlocks - Returned occupancy
  * \param func - Kernel for which occupancy is calculated
  * \param blockSize - Block size the kernel is intended to be launched with
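A sketch of the context-less CUkernel path that this and the following occupancy notes describe. The cubin path and kernel name are hypothetical placeholders.

    #include <cuda.h>

    /* Occupancy for a context-less kernel loaded from a library. */
    static CUresult kernel_occupancy(const char *cubinPath, const char *name, int *numBlocks)
    {
        CUlibrary lib;
        CUkernel kern;
        CUresult rc = cuLibraryLoadFromFile(&lib, cubinPath, NULL, NULL, 0, NULL, NULL, 0);
        if (rc != CUDA_SUCCESS)
            return rc;
        rc = cuLibraryGetKernel(&kern, lib, name);
        if (rc != CUDA_SUCCESS)
            return rc;
        /* The CUkernel is passed by casting to CUfunction; the current context
         * is used for the calculation. */
        return cuOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, (CUfunction)kern,
                                                           /*blockSize=*/256,
                                                           /*dynamicSMemSize=*/0);
    }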
@@ -20851,6 +22154,11 @@ CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUf
  * can be found about this feature in the "Unified L1/Texture Cache"
  * section of the Maxwell tuning guide.
  *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will be the current context.
+ *
  * \param numBlocks - Returned occupancy
  * \param func - Kernel for which occupancy is calculated
  * \param blockSize - Block size the kernel is intended to be launched with
@@ -20902,6 +22210,11 @@ CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBl
  *     size_t blockToSmem(int blockSize);
  * \endcode
  *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will be the current context.
+ *
  * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
  * \param blockSize - Returned maximum block size that can achieve the maximum occupancy
  * \param func - Kernel for which launch configuration is calculated
@@ -20947,6 +22260,11 @@ CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSi
  * can be found about this feature in the "Unified L1/Texture Cache"
  * section of the Maxwell tuning guide.
  *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will be the current context.
+ *
  * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
  * \param blockSize - Returned maximum block size that can achieve the maximum occupancy
  * \param func - Kernel for which launch configuration is calculated
@@ -20974,6 +22292,11 @@ CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int
  *
  * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM.
  *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will be the current context.
+ *
  * \param dynamicSmemSize - Returned maximum dynamic shared memory
  * \param func - Kernel function for which occupancy is calculated
  * \param numBlocks - Number of blocks to fit on SM
@@ -21004,6 +22327,12 @@ CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize
  *
  * This function will respect the compile time launch bounds.
  *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will either be taken from the specified stream \p config->hStream
+ * or the current context in case of NULL stream.
+ *
  * \param clusterSize - Returned maximum cluster size that can be launched
  *                      for the given kernel function and launch configuration
  * \param func - Kernel function for which maximum cluster
@@ -21040,6 +22369,12 @@ CUresult CUDAAPI cuOccupancyMaxPotentialClusterSize(int *clusterSize, CUfunction
  * calculation. Runtime environment may affect how the hardware schedules
  * the clusters, so the calculated occupancy is not guaranteed to be achievable.
  *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will either be taken from the specified stream \p config->hStream
+ * or the current context in case of NULL stream.
+ *
  * \param numClusters - Returned maximum number of clusters that
  *                      could co-exist on the target device
  * \param func - Kernel function for which maximum number
@@ -22004,7 +23339,8 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref
  * \p pResViewDesc is an optional argument that specifies an alternate format for
  * the data described by \p pResDesc, and also describes the subresource region
  * to restrict access to when texturing. \p pResViewDesc can only be specified if
- * the type of resource is a CUDA array or a CUDA mipmapped array
+ * the type of resource is a CUDA array or a CUDA mipmapped array not in a block
+ * compressed format.
  *
  * Texture objects are only supported on devices of compute capability 3.0 or higher.
  * Additionally, a texture object is an opaque value, and, as such, should only be
@@ -22412,7 +23748,7 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
  *
  * Tensor map objects are only supported on devices of compute capability 9.0 or higher.
  * Additionally, a tensor map object is an opaque value, and, as such, should only be
- * accessed through CUDA
+ * accessed through CUDA APIs and PTX.
  *
  * The parameters passed are bound to the following requirements:
  *
@@ -22433,21 +23769,33 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
         CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,      // 2 bytes
         CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,   // 4 bytes
         CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,      // 4 bytes
-        CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ
+        CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ,  // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B,  // 4 bits
+        CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, // 4 bits
+        CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B  // 6 bits
     } CUtensorMapDataType;
  * \endcode
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned as 8 bytes. There are no gaps between packed values.
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned as 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values.
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned as 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values.
  *
  * - \p tensorRank must be non-zero and less than or equal to the maximum supported dimensionality of 5. If \p interleave is not
  * ::CU_TENSOR_MAP_INTERLEAVE_NONE, then \p tensorRank must additionally be greater than or equal to 3.
  *
- * - \p globalAddress, which specifies the starting address of the memory region described, must be
- * ::CU_TENSOR_MAP_INTERLEAVE_32B
+ * - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met:
+ *   - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned.
+ *   - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned.
  *
- * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
- * equal to 2^32.
+ * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
+ * equal to 2^32. Additionally, the following requirements need to be met for the packed data types:
+ *   - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128.
+ *   - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2.
+ *   - Dimension for the packed data types must reflect the number of individual U# values.
  *
  * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
- * multiple of 16 and less than 2^40. Additionally, the
+ * multiple of 16 and less than 2^40. Additionally, the following requirements need to be met:
+ *   - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32.
+ *   - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32.
  * Each following dimension specified includes previous dimension stride:
  * \code
     globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
@@ -22457,9 +23805,9 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
  * \endcode
  *
  * - \p boxDim array, which specifies number of elements to be traversed along each of the \p tensorRank dimensions, must be non-zero
- * and less than or equal to 256.
- * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, { \p boxDim[0] * elementSizeInBytes( \p tensorDataType ) } must be a multiple
- *
+ * and less than or equal to 256. Additionally, the following requirements need to be met:
+ *   - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, { \p boxDim[0] * elementSizeInBytes( \p tensorDataType ) } must be a multiple of 16 bytes.
+ *   - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, boxDim[0] must be 128.
  *
  * - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less
  * than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since
@@ -22480,17 +23828,21 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
  * uses 32 bytes.
  * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE and \p swizzle is not ::CU_TENSOR_MAP_SWIZZLE_NONE, the bounding box inner dimension
  * (computed as \p boxDim[0] multiplied by element size derived from \p tensorDataType) must be less than or equal to the swizzle size.
- * - CU_TENSOR_MAP_SWIZZLE_32B
- * - CU_TENSOR_MAP_SWIZZLE_64B
- * - CU_TENSOR_MAP_SWIZZLE_128B
+ *   - CU_TENSOR_MAP_SWIZZLE_32B requires the bounding box inner dimension to be <= 32.
+ *   - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64.
+ *   - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128.
+ * Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE.
  *
  * - \p swizzle, which specifies the shared memory bank swizzling pattern, has to be of type ::CUtensorMapSwizzle which is defined as:
  * \code
     typedef enum CUtensorMapSwizzle_enum {
         CU_TENSOR_MAP_SWIZZLE_NONE = 0,
-        CU_TENSOR_MAP_SWIZZLE_32B,
-        CU_TENSOR_MAP_SWIZZLE_64B,
-        CU_TENSOR_MAP_SWIZZLE_128B
+        CU_TENSOR_MAP_SWIZZLE_32B,                   // Swizzle 16B chunks within 32B span
+        CU_TENSOR_MAP_SWIZZLE_64B,                   // Swizzle 16B chunks within 64B span
+        CU_TENSOR_MAP_SWIZZLE_128B,                  // Swizzle 16B chunks within 128B span
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B,         // Swizzle 32B chunks within 128B span
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B, // Swizzle 32B chunks within 128B span, additionally swap lower 8B with upper 8B within each 16B for every alternate row
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B          // Swizzle 64B chunks within 128B span
     } CUtensorMapSwizzle;
  * \endcode
  * Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
@@ -22498,6 +23850,15 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
  * problem, data can be loaded to shared memory with shuffling across shared memory banks.
  * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p swizzle must be ::CU_TENSOR_MAP_SWIZZLE_32B.
  * Other interleave modes can have any swizzling pattern.
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported:
+ * - CU_TENSOR_MAP_SWIZZLE_NONE (Load & Store)
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B (Store only)
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported:
+ * - CU_TENSOR_MAP_SWIZZLE_NONE (Load only)
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
  *
  * - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests is filled from DRAM. It must be of
  * type ::CUtensorMapL2promotion, which is defined as:
@@ -22518,7 +23879,8 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
         CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
     } CUtensorMapFloatOOBfill;
  * \endcode
- * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type
+ * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
+ * and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, and ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.
  *
  * \param tensorMap - Tensor map object to create
  * \param tensorDataType - Tensor data type
@@ -22542,11 +23904,11 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
  *
  * \sa
  * ::cuTensorMapEncodeIm2col,
+ * ::cuTensorMapEncodeIm2colWide,
  * ::cuTensorMapReplaceAddress
  */
 CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const cuuint32_t *boxDim, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
 
-
 /**
  * \brief Create a tensor map descriptor object representing im2col memory region
  *
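A minimal sketch of encoding a tiled tensor map that satisfies the constraints listed above (box inner dimension of 128 bytes fits the 128B swizzle, strides are multiples of 16). `devBuf` is an assumed 16-byte-aligned device allocation; the shapes are illustrative.

    #include <cuda.h>

    /* Describe a 64x64 tile of a 1024x256 row-major uint16 tensor. */
    static CUresult encode_tile(CUtensorMap *map, void *devBuf)
    {
        cuuint64_t globalDim[2]      = {1024, 256};
        cuuint64_t globalStrides[1]  = {1024 * sizeof(unsigned short)}; /* multiple of 16 */
        cuuint32_t boxDim[2]         = {64, 64};  /* 64 * 2 bytes == 128B swizzle span */
        cuuint32_t elementStrides[2] = {1, 1};
        return cuTensorMapEncodeTiled(map,
                                      CU_TENSOR_MAP_DATA_TYPE_UINT16,
                                      2,              /* tensorRank */
                                      devBuf,
                                      globalDim,
                                      globalStrides,  /* tensorRank - 1 entries */
                                      boxDim,
                                      elementStrides,
                                      CU_TENSOR_MAP_INTERLEAVE_NONE,
                                      CU_TENSOR_MAP_SWIZZLE_128B,
                                      CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
                                      CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
    }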
@@ -22555,7 +23917,7 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
  *
  * Tensor map objects are only supported on devices of compute capability 9.0 or higher.
  * Additionally, a tensor map object is an opaque value, and, as such, should only be
- * accessed through CUDA
+ * accessed through CUDA APIs and PTX.
  *
  * The parameters passed are bound to the following requirements:
  *
@@ -22577,19 +23939,31 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
         CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,   // 4 bytes
         CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,      // 4 bytes
         CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ   // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B,  // 4 bits
+        CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, // 4 bits
+        CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B  // 6 bits
     } CUtensorMapDataType;
  * \endcode
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned as 8 bytes. There are no gaps between packed values.
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned as 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values.
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned as 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values.
  *
  * - \p tensorRank, which specifies the number of tensor dimensions, must be 3, 4, or 5.
  *
- * - \p globalAddress, which specifies the starting address of the memory region described, must be
- * ::CU_TENSOR_MAP_INTERLEAVE_32B
+ * - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met:
+ *   - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned.
+ *   - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned.
  *
  * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
- * equal to 2^32.
+ * equal to 2^32. Additionally, the following requirements need to be met for the packed data types:
+ *   - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128.
+ *   - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2.
+ *   - Dimension for the packed data types must reflect the number of individual U# values.
  *
  * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
- * multiple of 16 and less than 2^40. Additionally, the
+ * multiple of 16 and less than 2^40. Additionally, the following requirements need to be met:
+ *   - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32.
+ *   - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32.
  * Each following dimension specified includes previous dimension stride:
  * \code
     globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
@@ -22612,6 +23986,7 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
  * The bounding box specified by \p pixelBoxLowerCorner and \p pixelBoxUpperCorner must have non-zero area.
  *
  * - \p channelsPerPixel, which specifies the number of elements which must be accessed along C dimension, must be less than or equal to 256.
+ * Additionally, when \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p channelsPerPixel must be 128.
  *
  * - \p pixelsPerColumn, which specifies the number of elements that must be accessed along the {N, D, H, W} dimensions, must be less than or
  * equal to 1024.
@@ -22634,18 +24009,22 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
  * TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
  * uses 32 bytes.
  * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE and \p swizzle is not ::CU_TENSOR_MAP_SWIZZLE_NONE, the bounding box inner dimension
- * (computed as \p
- * - CU_TENSOR_MAP_SWIZZLE_32B
- * - CU_TENSOR_MAP_SWIZZLE_64B
- * - CU_TENSOR_MAP_SWIZZLE_128B
+ * (computed as \p channelsPerPixel multiplied by element size in bytes derived from \p tensorDataType) must be less than or equal to the swizzle size.
+ * - CU_TENSOR_MAP_SWIZZLE_32B requires the bounding box inner dimension to be <= 32.
+ * - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64.
+ * - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128.
+ * Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE.
  *
  * - \p swizzle, which specifies the shared memory bank swizzling pattern, has to be of type ::CUtensorMapSwizzle which is defined as:
  * \code
      typedef enum CUtensorMapSwizzle_enum {
          CU_TENSOR_MAP_SWIZZLE_NONE = 0,
-         CU_TENSOR_MAP_SWIZZLE_32B,
-         CU_TENSOR_MAP_SWIZZLE_64B,
-         CU_TENSOR_MAP_SWIZZLE_128B
+         CU_TENSOR_MAP_SWIZZLE_32B,                   // Swizzle 16B chunks within 32B span
+         CU_TENSOR_MAP_SWIZZLE_64B,                   // Swizzle 16B chunks within 64B span
+         CU_TENSOR_MAP_SWIZZLE_128B,                  // Swizzle 16B chunks within 128B span
+         CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B,         // Swizzle 32B chunks within 128B span
+         CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B, // Swizzle 32B chunks within 128B span, additionally swap lower 8B with upper 8B within each 16B for every alternate row
+         CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B          // Swizzle 64B chunks within 128B span
      } CUtensorMapSwizzle;
  * \endcode
  * Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
@@ -22653,6 +24032,15 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
  * problem, data can be loaded to shared memory with shuffling across shared memory banks.
  * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p swizzle must be ::CU_TENSOR_MAP_SWIZZLE_32B.
  * Other interleave modes can have any swizzling pattern.
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported:
+ * - CU_TENSOR_MAP_SWIZZLE_NONE (Load & Store)
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B (Store only)
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported:
+ * - CU_TENSOR_MAP_SWIZZLE_NONE (Load only)
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
  *
  * - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
  * type ::CUtensorMapL2promotion, which is defined as:
@@ -22673,7 +24061,8 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
      CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
  } CUtensorMapFloatOOBfill;
  * \endcode
- * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type
+ * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
+ * and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, and ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.
  *
  * \param tensorMap - Tensor map object to create
  * \param tensorDataType - Tensor data type
@@ -22700,12 +24089,197 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
  *
  * \sa
  * ::cuTensorMapEncodeTiled,
+ * ::cuTensorMapEncodeIm2colWide,
  * ::cuTensorMapReplaceAddress
  */
  CUresult CUDAAPI cuTensorMapEncodeIm2col(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const int *pixelBoxLowerCorner, const int *pixelBoxUpperCorner, cuuint32_t channelsPerPixel, cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
 
  /**
- * \brief
+ * \brief Create a tensor map descriptor object representing im2col memory region, but where
+ * the elements are exclusively loaded along the W dimension.
+ *
+ * Creates a descriptor for Tensor Memory Access (TMA) object specified by the parameters
+ * describing an im2col memory layout and where the row is always loaded along the W dimension
+ * and returns it in \p tensorMap. This assumes the tensor layout in memory is either NDHWC,
+ * NHWC, or NWC.
+ *
+ * This API is only supported on devices of compute capability 10.0 or higher.
+ * Additionally, a tensor map object is an opaque value, and, as such, should only be
+ * accessed through CUDA APIs and PTX.
+ *
+ * The parameters passed are bound to the following requirements:
+ *
+ * - \p tensorMap address must be aligned to 64 bytes.
+ *
+ * - \p tensorDataType has to be an enum from ::CUtensorMapDataType which is defined as:
+ * \code
+     typedef enum CUtensorMapDataType_enum {
+         CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0,     // 1 byte
+         CU_TENSOR_MAP_DATA_TYPE_UINT16,        // 2 bytes
+         CU_TENSOR_MAP_DATA_TYPE_UINT32,        // 4 bytes
+         CU_TENSOR_MAP_DATA_TYPE_INT32,         // 4 bytes
+         CU_TENSOR_MAP_DATA_TYPE_UINT64,        // 8 bytes
+         CU_TENSOR_MAP_DATA_TYPE_INT64,         // 8 bytes
+         CU_TENSOR_MAP_DATA_TYPE_FLOAT16,       // 2 bytes
+         CU_TENSOR_MAP_DATA_TYPE_FLOAT32,       // 4 bytes
+         CU_TENSOR_MAP_DATA_TYPE_FLOAT64,       // 8 bytes
+         CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,      // 2 bytes
+         CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,   // 4 bytes
+         CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,      // 4 bytes
+         CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ   // 4 bytes
+         CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B,  // 4 bits
+         CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, // 4 bits
+         CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B  // 6 bits
+     } CUtensorMapDataType;
+ * \endcode
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned as 8 bytes. There are no gaps between packed values.
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned as 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values.
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned as 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values.
+ *
+ * - \p tensorRank, which specifies the number of tensor dimensions, must be 3, 4, or 5.
+ *
+ * - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met:
+ * - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned.
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned.
+ *
+ * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
+ * equal to 2^32. Additionally, the following requirements need to be met for the packed data types:
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128.
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2.
+ * - Dimension for the packed data types must reflect the number of individual U# values.
+ *
+ * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
+ * multiple of 16 and less than 2^40. Additionally, the following requirements need to be met:
+ * - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32.
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32.
+ * Each following dimension specified includes previous dimension stride:
+ * \code
+     globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
+     for (i = 1; i < tensorRank - 1; i++)
+         globalStrides[i] = globalStrides[i - 1] * (globalDim[i] + padding[i]);
+         assert(globalStrides[i] >= globalDim[i]);
+ * \endcode
+ *
+ * - \p pixelBoxLowerCornerWidth specifies the coordinate offset W of the bounding box from left corner. The offset must be
+ * within range [-32768, 32767].
+ *
+ * - \p pixelBoxUpperCornerWidth specifies the coordinate offset W of the bounding box from right corner. The offset must be
+ * within range [-32768, 32767].
+ *
+ * The bounding box specified by \p pixelBoxLowerCornerWidth and \p pixelBoxUpperCornerWidth must have non-zero area. Note
+ * that the size of the box along D and H dimensions is always equal to one.
+ *
+ * - \p channelsPerPixel, which specifies the number of elements which must be accessed along C dimension, must be less than or equal to 256.
+ * Additionally, when \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p channelsPerPixel must be 128.
+ *
+ * - \p pixelsPerColumn, which specifies the number of elements that must be accessed along the W dimension, must be less than or
+ * equal to 1024. This field is ignored when \p mode is ::CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128.
+ *
+ * - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less
+ * than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since
+ * TMA doesn't support the stride for dimension zero.
+ * When all elements of the \p elementStrides array are one, \p boxDim specifies the number of elements to load. However, if \p elementStrides[i]
+ * is not equal to one for some \p i, then TMA loads ceil( \p boxDim[i] / \p elementStrides[i]) number of elements along i-th dimension.
+ * To load N elements along i-th dimension, \p boxDim[i] must be set to N * \p elementStrides[i].
+ *
+ * - \p interleave specifies the interleaved layout of type ::CUtensorMapInterleave, which is defined as:
+ * \code
+     typedef enum CUtensorMapInterleave_enum {
+         CU_TENSOR_MAP_INTERLEAVE_NONE = 0,
+         CU_TENSOR_MAP_INTERLEAVE_16B,
+         CU_TENSOR_MAP_INTERLEAVE_32B
+     } CUtensorMapInterleave;
+ * \endcode
+ * TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
+ * uses 32 bytes.
+ * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the bounding box inner dimension (computed as \p channelsPerPixel multiplied by
+ * element size in bytes derived from \p tensorDataType) must be less than or equal to the swizzle size.
+ * - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64.
+ * - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128.
+ * Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE.
+ *
+ * - \p mode, which describes the loading of elements along the W dimension, has to be one of the following ::CUtensorMapIm2ColWideMode types:
+ * \code
+ *     CU_TENSOR_MAP_IM2COL_WIDE_MODE_W,
+ *     CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128
+ * \endcode
+ * ::CU_TENSOR_MAP_IM2COL_WIDE_MODE_W allows the number of elements loaded along the W dimension to be specified
+ * via the \p pixelsPerColumn field.
+ *
+ * - \p swizzle, which specifies the shared memory bank swizzling pattern, must be one of the following
+ * ::CUtensorMapSwizzle modes (other swizzle modes are not supported):
+ * \code
+     typedef enum CUtensorMapSwizzle_enum {
+         CU_TENSOR_MAP_SWIZZLE_64B,           // Swizzle 16B chunks within 64B span
+         CU_TENSOR_MAP_SWIZZLE_128B,          // Swizzle 16B chunks within 128B span
+         CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B, // Swizzle 32B chunks within 128B span
+     } CUtensorMapSwizzle;
+ * \endcode
+ * Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
+ * in shared memory. This difference in data organization may cause bank conflicts when shared memory is accessed. In order to avoid this
+ * problem, data can be loaded to shared memory with shuffling across shared memory banks.
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported:
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported:
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
+ *
+ * - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
+ * type ::CUtensorMapL2promotion, which is defined as:
+ * \code
+     typedef enum CUtensorMapL2promotion_enum {
+         CU_TENSOR_MAP_L2_PROMOTION_NONE = 0,
+         CU_TENSOR_MAP_L2_PROMOTION_L2_64B,
+         CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
+         CU_TENSOR_MAP_L2_PROMOTION_L2_256B
+     } CUtensorMapL2promotion;
+ * \endcode
+ *
+ * - \p oobFill, which indicates whether zero or a special NaN constant should be used to fill out-of-bound elements, must be of type
+ * ::CUtensorMapFloatOOBfill which is defined as:
+ * \code
+     typedef enum CUtensorMapFloatOOBfill_enum {
+         CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0,
+         CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
+     } CUtensorMapFloatOOBfill;
+ * \endcode
+ * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
+ * and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, and ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.
+ *
+ * \param tensorMap - Tensor map object to create
+ * \param tensorDataType - Tensor data type
+ * \param tensorRank - Dimensionality of tensor; must be at least 3
+ * \param globalAddress - Starting address of memory region described by tensor
+ * \param globalDim - Array containing tensor size (number of elements) along each of the \p tensorRank dimensions
+ * \param globalStrides - Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions
+ * \param pixelBoxLowerCornerWidth - Width offset of left box corner
+ * \param pixelBoxUpperCornerWidth - Width offset of right box corner
+ * \param channelsPerPixel - Number of channels per pixel
+ * \param pixelsPerColumn - Number of pixels per column
+ * \param elementStrides - Array containing traversal stride in each of the \p tensorRank dimensions
+ * \param interleave - Type of interleaved layout the tensor addresses
+ * \param mode - W or W128 mode
+ * \param swizzle - Bank swizzling pattern inside shared memory
+ * \param l2Promotion - L2 promotion size
+ * \param oobFill - Indicate whether zero or special NaN constant will be used to fill out-of-bound elements
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTensorMapEncodeTiled,
+ * ::cuTensorMapEncodeIm2col,
+ * ::cuTensorMapReplaceAddress
+ */
+ CUresult CUDAAPI cuTensorMapEncodeIm2colWide(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, int pixelBoxLowerCornerWidth, int pixelBoxUpperCornerWidth, cuuint32_t channelsPerPixel, cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapIm2ColWideMode mode, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
+
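A hedged sketch of the new W-wide entry point, mirroring the documented NWC layout (rank 3, channels innermost). The helper name and its N/W/C parameters are hypothetical, and the values are chosen so the documented rules hold (a channelsPerPixel of 64 FP16 elements is 128 bytes, matching the 128B swizzle span):

    static CUresult encode_fp16_im2col_w(CUtensorMap *map, CUdeviceptr dptr,
                                         cuuint64_t N, cuuint64_t W, cuuint64_t C)
    {
        cuuint64_t dims[3]    = { C, W, N };           /* channels innermost */
        cuuint64_t strides[2] = { C * 2, C * W * 2 };  /* byte strides of dims 1..2 */
        cuuint32_t step[3]    = { 1, 1, 1 };
        return cuTensorMapEncodeIm2colWide(map, CU_TENSOR_MAP_DATA_TYPE_FLOAT16, 3,
                                           (void *)dptr, dims, strides,
                                           0, 0,  /* pixelBox{Lower,Upper}CornerWidth: no halo */
                                           64,    /* channelsPerPixel: 64 * 2B = 128B */
                                           64,    /* pixelsPerColumn, used in W mode */
                                           step,
                                           CU_TENSOR_MAP_INTERLEAVE_NONE,
                                           CU_TENSOR_MAP_IM2COL_WIDE_MODE_W,
                                           CU_TENSOR_MAP_SWIZZLE_128B,
                                           CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
                                           CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
    }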
+ /**
+ * \brief Modify an existing tensor map descriptor with an updated global address
  *
  * Modifies the descriptor for Tensor Memory Access (TMA) object passed in \p tensorMap with
  * an updated \p globalAddress.
@@ -22727,6 +24301,7 @@ CUresult CUDAAPI cuTensorMapEncodeIm2col(CUtensorMap *tensorMap, CUtensorMapData
  * \sa
  * ::cuTensorMapEncodeTiled,
  * ::cuTensorMapEncodeIm2col
+ * ::cuTensorMapEncodeIm2colWide
  */
  CUresult CUDAAPI cuTensorMapReplaceAddress(CUtensorMap *tensorMap, void *globalAddress);
 
@@ -23261,9 +24836,29 @@ typedef enum CUcoredumpSettings_enum {
      CU_COREDUMP_ENABLE_USER_TRIGGER,
      CU_COREDUMP_FILE,
      CU_COREDUMP_PIPE,
+     CU_COREDUMP_GENERATION_FLAGS,
      CU_COREDUMP_MAX
  } CUcoredumpSettings;
 
+ /**
+  * Flags for controlling coredump contents
+  */
+ typedef enum CUCoredumpGenerationFlags {
+     CU_COREDUMP_DEFAULT_FLAGS = 0,
+     CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES = (1 << 0),
+     CU_COREDUMP_SKIP_GLOBAL_MEMORY = (1 << 1),
+     CU_COREDUMP_SKIP_SHARED_MEMORY = (1 << 2),
+     CU_COREDUMP_SKIP_LOCAL_MEMORY = (1 << 3),
+     CU_COREDUMP_SKIP_ABORT = (1 << 4),
+     CU_COREDUMP_SKIP_CONSTBANK_MEMORY = (1 << 5),
+
+     CU_COREDUMP_LIGHTWEIGHT_FLAGS = CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES
+                                   | CU_COREDUMP_SKIP_GLOBAL_MEMORY
+                                   | CU_COREDUMP_SKIP_SHARED_MEMORY
+                                   | CU_COREDUMP_SKIP_LOCAL_MEMORY
+                                   | CU_COREDUMP_SKIP_CONSTBANK_MEMORY
+ } CUCoredumpGenerationFlags;
+
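To make the flag semantics concrete, a small sketch; it assumes a CUDA 12.5+ driver, and the chosen combination is purely illustrative:

    int flags = CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES
              | CU_COREDUMP_SKIP_GLOBAL_MEMORY
              | CU_COREDUMP_SKIP_SHARED_MEMORY
              | CU_COREDUMP_SKIP_ABORT;  /* keep the host alive on GPU exceptions */
    size_t size = sizeof(flags);
    CUresult rc = cuCoredumpSetAttribute(CU_COREDUMP_GENERATION_FLAGS, &flags, &size);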
  /**
  * \brief Allows caller to fetch a coredump attribute value for the current context
  *
@@ -23280,10 +24875,12 @@ typedef enum CUcoredumpSettings_enum {
  * CU_CTX_USER_COREDUMP_ENABLE flag was set during context creation.
  * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
  * also create a coredump. The default value is ::true unless set to ::false globally or
- * or locally.
+ * locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
+ * flag to disable host device abort() if needed.
  * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
  * will not have a dump of GPU memory or non-reloc ELF images. The default value is
- * ::false unless set to ::true globally or locally.
+ * ::false unless set to ::true globally or locally. This attribute is deprecated as
+ * of CUDA 12.5, please use ::CU_COREDUMP_GENERATION_FLAGS instead.
  * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
  * created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
  * value is ::false unless set to ::true globally or locally.
@@ -23295,6 +24892,22 @@ typedef enum CUcoredumpSettings_enum {
  * that will be monitored if user-triggered coredumps are enabled. The default value is
  * ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
  * the CUDA application and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
+ * contained in a coredump specified as a bitwise OR combination of the following values:
+ *   + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
+ *     default settings of including all memory regions that it is able to access
+ *   + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
+ *     CUDA source modules that are not relocated at runtime.
+ *   + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
+ *     that does not belong to any context.
+ *   + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
+ *     for the warp that the dumped kernel belonged to.
+ *   + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
+ *   + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
+ *     the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
+ *   + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
+ *     process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
+ *     behavior.
  *
  * \param attrib - The enum defining which value to fetch.
  * \param value - void* containing the requested data.
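A matching read-back sketch via cuCoredumpGetAttribute; the third parameter is assumed here to be the value size, consistent with the signature in the hunk header below:

    int flags = 0;
    size_t size = sizeof(flags);
    if (cuCoredumpGetAttribute(CU_COREDUMP_GENERATION_FLAGS, &flags, &size) == CUDA_SUCCESS
        && (flags & CU_COREDUMP_SKIP_ABORT)) {
        /* GPU exceptions will not abort() the host process */
    }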
@@ -23330,10 +24943,13 @@ CUresult CUDAAPI cuCoredumpGetAttribute(CUcoredumpSettings attrib, void* value,
  * this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
  * The default value is ::false.
  * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
- * also create a coredump. The default value is ::true
+ * also create a coredump. The default value is ::true unless set to ::false globally or
+ * locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
+ * flag to disable host device abort() if needed.
  * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
  * will not have a dump of GPU memory or non-reloc ELF images. The default value is
- * ::false.
+ * ::false. This attribute is deprecated as of CUDA 12.5, please use ::CU_COREDUMP_GENERATION_FLAGS
+ * instead.
  * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
  * created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
  * value is ::false.
@@ -23345,6 +24961,22 @@ CUresult CUDAAPI cuCoredumpGetAttribute(CUcoredumpSettings attrib, void* value,
  * that will be monitored if user-triggered coredumps are enabled. The default value is
  * ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
  * the CUDA application and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
+ * contained in a coredump specified as a bitwise OR combination of the following values:
+ *   + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
+ *     default settings of including all memory regions that it is able to access
+ *   + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
+ *     CUDA source modules that are not relocated at runtime.
+ *   + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
+ *     that does not belong to any context.
+ *   + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
+ *     for the warp that the dumped kernel belonged to.
+ *   + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
+ *   + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
+ *     the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
+ *   + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
+ *     process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
+ *     behavior.
  *
  * \param attrib - The enum defining which value to fetch.
  * \param value - void* containing the requested data.
@@ -23369,7 +25001,7 @@ CUresult CUDAAPI cuCoredumpGetAttributeGlobal(CUcoredumpSettings attrib, void *v
  *
  * An important design decision to note is that any coredump environment variable values
  * set before CUDA initializes will take permanent precedence over any values set with this
- *
+ * function. This decision was made to ensure no change in behavior for any users that
  * may be currently using these variables to get coredumps.
  *
  * \p *value shall contain the requested value specified by \p set. It is up to the caller
@@ -23389,14 +25021,33 @@ CUresult CUDAAPI cuCoredumpGetAttributeGlobal(CUcoredumpSettings attrib, void *v
  * this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
  * The default value is ::false.
  * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
- * also create a coredump. The default value is ::true
+ * also create a coredump. The default value is ::true unless set to ::false globally or
+ * locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
+ * flag to disable host device abort() if needed.
  * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
  * will not have a dump of GPU memory or non-reloc ELF images. The default value is
- * ::false.
+ * ::false. This attribute is deprecated as of CUDA 12.5, please use ::CU_COREDUMP_GENERATION_FLAGS
+ * instead.
  * - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where
  * any coredumps generated by this context will be written. The default value is
  * ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
  * the CUDA applications and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
+ * contained in a coredump specified as a bitwise OR combination of the following values:
+ *   + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
+ *     default settings of including all memory regions that it is able to access
+ *   + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
+ *     CUDA source modules that are not relocated at runtime.
+ *   + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
+ *     that does not belong to any context.
+ *   + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
+ *     for the warp that the dumped kernel belonged to.
+ *   + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
+ *   + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
+ *     the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
+ *   + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
+ *     process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
+ *     behavior.
  *
  * \param attrib - The enum defining which value to set.
  * \param value - void* containing the requested data.
@@ -23427,7 +25078,7 @@ CUresult CUDAAPI cuCoredumpSetAttribute(CUcoredumpSettings attrib, void* value,
  *
  * An important design decision to note is that any coredump environment variable values
  * set before CUDA initializes will take permanent precedence over any values set with this
- *
+ * function. This decision was made to ensure no change in behavior for any users that
  * may be currently using these variables to get coredumps.
  *
  * \p *value shall contain the requested value specified by \p set. It is up to the caller
@@ -23441,10 +25092,13 @@ CUresult CUDAAPI cuCoredumpSetAttribute(CUcoredumpSettings attrib, void* value,
  * this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
  * The default value is ::false.
  * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
- * also create a coredump. The default value is ::true
+ * also create a coredump. The default value is ::true unless set to ::false globally or
+ * locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
+ * flag to disable host device abort() if needed.
  * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
  * will not have a dump of GPU memory or non-reloc ELF images. The default value is
- * ::false.
+ * ::false. This attribute is deprecated as of CUDA 12.5, please use ::CU_COREDUMP_GENERATION_FLAGS
+ * instead.
  * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
  * created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
  * value is ::false.
@@ -23457,6 +25111,22 @@ CUresult CUDAAPI cuCoredumpSetAttribute(CUcoredumpSettings attrib, void* value,
  * changed after ::CU_COREDUMP_ENABLE_USER_TRIGGER is set to ::true. The default
  * value is ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine
  * running the CUDA application and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
+ * contained in a coredump specified as a bitwise OR combination of the following values:
+ *   + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
+ *     default settings of including all memory regions that it is able to access
+ *   + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
+ *     CUDA source modules that are not relocated at runtime.
+ *   + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
+ *     that does not belong to any context.
+ *   + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
+ *     for the warp that the dumped kernel belonged to.
+ *   + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
+ *   + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
+ *     the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
+ *   + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
+ *     process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
+ *     behavior.
  *
  * \param attrib - The enum defining which value to set.
  * \param value - void* containing the requested data.
@@ -23523,13 +25193,6 @@ CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExp
  * @{
  */
 
- /*!
- * \typedef typedef struct CUgreenCtx_st* CUgreenCtx
- * A green context handle. This handle can be used safely from only one CPU thread at a time.
- * Created via ::cuGreenCtxCreate
- */
- typedef struct CUgreenCtx_st *CUgreenCtx;
-
  /*!
  * \typedef struct CUdevResourceDesc_st* CUdevResourceDesc;
  * An opaque descriptor handle. The descriptor encapsulates multiple created and configured resources.
@@ -23541,6 +25204,11 @@ typedef enum {
      CU_GREEN_CTX_DEFAULT_STREAM = 0x1, /**< Required. Creates a default stream to use inside the green context */
  } CUgreenCtxCreate_flags;
 
+ typedef enum {
+     CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING = 0x1,
+     CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE = 0x2,
+ } CUdevSmResourceSplit_flags;
+
  #define RESOURCE_ABI_VERSION 1
  #define RESOURCE_ABI_EXTERNAL_BYTES 48
 
@@ -23554,7 +25222,7 @@ typedef enum {
  typedef enum {
      CU_DEV_RESOURCE_TYPE_INVALID = 0,
      CU_DEV_RESOURCE_TYPE_SM = 1, /**< Streaming multiprocessors related information */
- #
+ #if defined(__CUDA_API_VERSION_INTERNAL) && !defined(__CUDA_API_VERSION_INTERNAL_ODR)
      CU_DEV_RESOURCE_TYPE_MAX,
  #endif
  } CUdevResourceType;
@@ -23777,18 +25445,24 @@ CUresult CUDAAPI cuGreenCtxGetDevResource(CUgreenCtx hCtx, CUdevResource* resour
  * first creating a descriptor and a green context with that descriptor.
  *
  * When creating the groups, the API will take into account the performance and functional characteristics of the
- * input resource, and guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to
+ * input resource, and guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to fewer groups created
  * than purely dividing the total SM count by the \p minCount due to cluster requirements or
  * alignment and granularity requirements for the minCount.
  *
- * The \p remainder set
+ * The \p remainder set does not have the same functional or performance guarantees as the groups in \p result.
  * Its use should be carefully planned and future partitions of the \p remainder set are discouraged.
  *
+ * The following flags are supported:
+ * - \p CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING : Lower the minimum SM count and alignment, and treat each SM independent of its hierarchy.
+ *   This allows more fine grained partitions but at the cost of advanced features (such as large clusters on compute capability 9.0+).
+ * - \p CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE : Compute Capability 9.0+ only. Attempt to create groups that may allow
+ *   for maximally sized thread clusters. This can be queried post green context creation using ::cuOccupancyMaxPotentialClusterSize.
+ *
  * A successful API call must either have:
- * - A valid array of \p result pointers of size passed in \p nbGroups, with \p
- * Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining
- * - NULL passed in for \p result, with a valid integer pointer in \p nbGroups and \p
- * Value of \p minCount must be between 0 and the SM count specified in \p input.
+ * - A valid array of \p result pointers of size passed in \p nbGroups, with \p input of type \p CU_DEV_RESOURCE_TYPE_SM.
+ *   Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining may be NULL.
+ * - NULL passed in for \p result, with a valid integer pointer in \p nbGroups and \p input of type \p CU_DEV_RESOURCE_TYPE_SM.
+ *   Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining may be NULL.
  * This queries the number of groups that would be created by the API.
  *
  * Note: The API is not supported on 32-bit platforms.
@@ -23798,7 +25472,7 @@ CUresult CUDAAPI cuGreenCtxGetDevResource(CUgreenCtx hCtx, CUdevResource* resour
  * \param input - Input SM resource to be split. Must be a valid \p CU_DEV_RESOURCE_TYPE_SM resource.
  * \param remaining - If the input resource cannot be cleanly split among \p nbGroups, the remaining is placed in here.
  * Can be omitted (NULL) if the user does not need the remaining set.
- * \param useFlags - Flags specifying how these partitions are used or which constraints to abide by when splitting the input.
+ * \param useFlags - Flags specifying how these partitions are used or which constraints to abide by when splitting the input. Zero is valid for default behavior.
  * \param minCount - Minimum number of SMs required
  *
  * \return
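A sketch of the two-call pattern just described — first query the group count, then fill the groups. `dev` is a hypothetical CUdevice from cuDeviceGet, the fixed-size array is an assumption, and the full parameter order follows current CUDA driver documentation rather than this diff (which truncates the declaration):

    CUdevResource input, remaining;
    CUdevResource groups[64];  /* assumes at most 64 groups here */
    unsigned int nbGroups = 0;
    cuDeviceGetDevResource(dev, &input, CU_DEV_RESOURCE_TYPE_SM);
    /* query pass: how many disjoint groups of >= 16 SMs can be formed? */
    cuDevSmResourceSplitByCount(NULL, &nbGroups, &input, NULL,
                                CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING, 16);
    /* fill pass: perform the split, leftover SMs land in `remaining` */
    cuDevSmResourceSplitByCount(groups, &nbGroups, &input, &remaining,
                                CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING, 16);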
@@ -23821,10 +25495,18 @@ CUresult CUDAAPI cuDevSmResourceSplitByCount(
  /**
  * \brief Generate a resource descriptor
  *
- * Generates a resource descriptor with the set of resources specified in \p resources.
+ * Generates a single resource descriptor with the set of resources specified in \p resources.
  * The generated resource descriptor is necessary for the creation of green contexts via the ::cuGreenCtxCreate API.
- *
- *
+ * Resources of the same type can be passed in, provided they meet the requirements as noted below.
+ *
+ * A successful API call must have:
+ * - A valid output pointer for the \p phDesc descriptor as well as a valid array of \p resources pointers,
+ *   with the array size passed in \p nbResources.
+ * If multiple resources are provided in \p resources, the device they came from must be the same,
+ * otherwise CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned.
+ * If multiple resources are provided in \p resources and they are of type ::CU_DEV_RESOURCE_TYPE_SM,
+ * they must be outputs (whether \p result or \p remaining) from the same split API instance,
+ * otherwise CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned.
  *
  * Note: The API is not supported on 32-bit platforms.
  *
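Continuing the split sketch above: wrap one split result in a descriptor and build a green context on it. The cuGreenCtxCreate call shape is taken from current CUDA docs (an assumption here, since this diff does not show its declaration); CU_GREEN_CTX_DEFAULT_STREAM is the required flag per the enum earlier in this diff:

    CUdevResourceDesc desc;
    CUgreenCtx gctx;
    cuDevResourceGenerateDesc(&desc, &groups[0], 1);  /* one SM group -> one descriptor */
    cuGreenCtxCreate(&gctx, desc, dev, CU_GREEN_CTX_DEFAULT_STREAM);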
@@ -23848,15 +25530,16 @@ CUresult CUDAAPI cuDevResourceGenerateDesc(CUdevResourceDesc *phDesc, CUdevResou
  /**
  * \brief Records an event.
  *
- * Captures in \
- * at the time of this call. \
- *
+ * Captures in \p hEvent all the activities of the green context of \p hCtx
+ * at the time of this call. \p hEvent and \p hCtx must be from the same
+ * primary context otherwise ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * Calls such as ::cuEventQuery() or ::cuGreenCtxWaitEvent() will
  * then examine or wait for completion of the work that was captured. Uses of
  * \p hCtx after this call do not modify \p hEvent.
  *
- * \note The API will return
- * has a stream in the capture mode. In such
- * all the conflicting captures.
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED if the
+ * specified green context \p hCtx has a stream in the capture mode. In such
+ * a case, the call will invalidate all the conflicting captures.
  *
  * \param hCtx - Green context to record event for
  * \param hEvent - Event to record
@@ -23866,39 +25549,49 @@ CUresult CUDAAPI cuDevResourceGenerateDesc(CUdevResourceDesc *phDesc, CUdevResou
  * ::CUDA_ERROR_DEINITIALIZED,
  * ::CUDA_ERROR_NOT_INITIALIZED,
  * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
  *
  * \sa
  * ::cuGreenCtxWaitEvent,
- * ::cuEventRecord
+ * ::cuEventRecord,
+ * ::cuCtxRecordEvent,
+ * ::cuCtxWaitEvent
  */
  CUresult CUDAAPI cuGreenCtxRecordEvent(CUgreenCtx hCtx, CUevent hEvent);
 
  /**
  * \brief Make a green context wait on an event
  *
- * Makes all future work submitted to green context \
- * captured in \
+ * Makes all future work submitted to green context \p hCtx wait for all work
+ * captured in \p hEvent. The synchronization will be performed on the device
  * and will not block the calling CPU thread. See ::cuGreenCtxRecordEvent()
- * for details on what is captured by an event.
+ * or ::cuEventRecord(), for details on what is captured by an event.
+ *
+ * \note \p hEvent may be from a different context or device than \p hCtx.
+ *
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED and
+ * invalidate the capture if the specified event \p hEvent is part of an
+ * ongoing capture sequence or if the specified green context \p hCtx has
+ * a stream in the capture mode.
  *
- * \note The API will return an error and invalidate the capture if the specified
- * event \p hEvent is part of an ongoing capture sequence.
- *
  * \param hCtx - Green context to wait
- * \param hEvent - Event to wait on
+ * \param hEvent - Event to wait on
  *
  * \return
  * ::CUDA_SUCCESS,
  * ::CUDA_ERROR_DEINITIALIZED,
  * ::CUDA_ERROR_NOT_INITIALIZED,
  * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
  *
  * \sa
  * ::cuGreenCtxRecordEvent,
  * ::cuStreamWaitEvent
-
+ * ::cuCtxRecordEvent,
+ * ::cuCtxWaitEvent
+ */
  CUresult CUDAAPI cuGreenCtxWaitEvent(CUgreenCtx hCtx, CUevent hEvent);
 
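Taken together, the record/wait pair orders work across green contexts entirely on the device side. A sketch with hypothetical handles — `producer` and `consumer` are green contexts from the earlier sketches, `ev` an event from cuEventCreate:

    cuGreenCtxRecordEvent(producer, ev);  /* capture the producer's pending work */
    cuGreenCtxWaitEvent(consumer, ev);    /* consumer's future work waits; host not blocked */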
  /**
@@ -23910,7 +25603,9 @@ CUresult CUDAAPI cuGreenCtxWaitEvent(CUgreenCtx hCtx, CUevent hEvent);
  * The stream handle \p hStream can refer to any of the following:
  * <ul>
  * <li>
- * a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
+ * a stream created via any of the CUDA driver APIs such as ::cuStreamCreate, ::cuStreamCreateWithPriority
+ * and ::cuGreenCtxStreamCreate, or their runtime API equivalents such as
+ * ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
  * If during stream creation the context that was active in the calling thread was obtained
  * with cuCtxFromGreenCtx, that green context is returned in \p phCtx.
  * Otherwise, \p *phCtx is set to NULL instead.
@@ -23936,9 +25631,13 @@ CUresult CUDAAPI cuGreenCtxWaitEvent(CUgreenCtx hCtx, CUevent hEvent);
  * \notefnerr
  *
  * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate,
  * ::cuStreamCreateWithPriority,
+ * ::cuStreamGetCtx_v2,
+ * ::cuGreenCtxStreamCreate,
  * ::cuStreamGetPriority,
  * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice
  * ::cuStreamWaitEvent,
  * ::cuStreamQuery,
  * ::cuStreamSynchronize,
@@ -23948,6 +25647,62 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
  */
  CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
 
+ /**
+ * \brief Create a stream for use in the green context
+ *
+ * Creates a stream for use in the specified green context \p greenCtx and returns a handle in \p phStream.
+ * The stream can be destroyed by calling ::cuStreamDestroy(). Note that the API ignores the context that
+ * is current to the calling thread and creates a stream in the specified green context \p greenCtx.
+ *
+ * The supported values for \p flags are:
+ * - ::CU_STREAM_NON_BLOCKING: This must be specified. It indicates that work running in the created
+ *   stream may run concurrently with work in the default stream, and that
+ *   the created stream should perform no implicit synchronization with the default stream.
+ *
+ * Specifying \p priority affects the scheduling priority of work in the stream. Priorities provide a
+ * hint to preferentially run work with higher priority when possible, but do not preempt
+ * already-running work or provide any other functional guarantee on execution order.
+ * \p priority follows a convention where lower numbers represent higher priorities.
+ * '0' represents default priority. The range of meaningful numerical priorities can
+ * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is
+ * outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
+ * it will automatically be clamped to the lowest or the highest number in the range.
+ *
+ * \param phStream - Returned newly created stream
+ * \param greenCtx - Green context for which to create the stream
+ * \param flags    - Flags for stream creation. \p CU_STREAM_NON_BLOCKING must be specified.
+ * \param priority - Stream priority. Lower numbers represent higher priorities.
+ *                   See ::cuCtxGetStreamPriorityRange for more information about
+ *                   meaningful stream priorities that can be passed.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \note In the current implementation, only compute kernels launched in
+ * priority streams are affected by the stream's priority. Stream priorities have
+ * no effect on host-to-device and device-to-host memory operations.
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuGreenCtxCreate
+ * ::cuStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cuCtxGetStreamPriorityRange,
+ * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreateWithPriority
+ */
+ CUresult CUDAAPI cuGreenCtxStreamCreate(CUstream* phStream, CUgreenCtx greenCtx, unsigned int flags, int priority);
+
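A one-line sketch of the new stream constructor, reusing `gctx` from the earlier green-context sketch; per the documentation above, CU_STREAM_NON_BLOCKING is mandatory and 0 is the default priority:

    CUstream s;
    cuGreenCtxStreamCreate(&s, gctx, CU_STREAM_NON_BLOCKING, 0);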
  /** @} */
 
  /*
@@ -23991,6 +25746,8 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
  #undef cuMemcpyDtoDAsync
  #undef cuMemcpy2DAsync
  #undef cuMemcpy3DAsync
+ #undef cuMemcpyBatchAsync
+ #undef cuMemcpy3DBatchAsync
  #undef cuMemsetD8
  #undef cuMemsetD16
  #undef cuMemsetD32
@@ -24025,6 +25782,7 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
  #undef cuStreamGetPriority
  #undef cuStreamGetId
  #undef cuStreamGetFlags
+ #undef cuStreamGetDevice
  #undef cuStreamGetCtx
  #undef cuStreamWaitEvent
  #undef cuStreamAddCallback
@@ -24083,6 +25841,8 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
 #undef cuStreamUpdateCaptureDependencies
 #undef cuStreamUpdateCaptureDependencies_v2
 #undef cuGetProcAddress
+#undef cuStreamGetCtx_v2
+#undef cuMemBatchDecompressAsync
 
 CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
 CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
@@ -24250,7 +26010,11 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
 CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
 CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);
 CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
-
+CUresult CUDAAPI cuMemcpyBatchAsync(CUdeviceptr *dsts, CUdeviceptr *srcs, size_t *sizes, size_t count,
+                                    CUmemcpyAttributes *attrs, size_t *attrsIdxs, size_t numAttrs,
+                                    size_t *failIdx, CUstream hStream);
+CUresult CUDAAPI cuMemcpy3DBatchAsync(size_t numOps, CUDA_MEMCPY3D_BATCH_OP *opList,
+                                      size_t *failIdx, unsigned long long flags, CUstream hStream);
 CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
 CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream);
 CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
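Usage sketch (editor's illustration, not from the diff): two device-to-device copies submitted as one batched call. The attribute semantics are an assumption here: a single zero-initialized CUmemcpyAttributes entry whose index entry of 0 makes it apply from the first operation onward, with `failIdx` receiving the index of the first failing operation.

```c
static CUresult copy_two_buffers(CUdeviceptr dstA, CUdeviceptr srcA,
                                 CUdeviceptr dstB, CUdeviceptr srcB,
                                 size_t bytes, CUstream stream)
{
    CUdeviceptr dsts[2]  = { dstA, dstB };
    CUdeviceptr srcs[2]  = { srcA, srcB };
    size_t      sizes[2] = { bytes, bytes };

    CUmemcpyAttributes attrs = { 0 };  /* zero-init; real code should fill in
                                          the access-order/hint fields */
    size_t attrsIdx = 0;               /* attrs applies from operation 0 (assumed) */
    size_t failIdx  = 0;

    return cuMemcpyBatchAsync(dsts, srcs, sizes, 2,
                              &attrs, &attrsIdx, 1, &failIdx, stream);
}
```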
@@ -24261,7 +26025,9 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
 CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
 CUresult CUDAAPI cuStreamGetId(CUstream hStream, unsigned long long *streamId);
 CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags);
+CUresult CUDAAPI cuStreamGetDevice(CUstream hStream, CUdevice *device);
 CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
+CUresult CUDAAPI cuStreamGetCtx_v2(CUstream hStream, CUcontext *pCtx, CUgreenCtx *pGreenCtx);
 CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
 CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
 CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags);
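Usage sketch (editor's illustration): the two added getters let a caller map a stream back to its device and, via the `_v2` variant, to the green context it was created under. It is assumed here that `*pGreenCtx` comes back NULL for streams not created in a green context.

```c
static CUresult describe_stream(CUstream hStream)
{
    CUdevice   dev  = 0;
    CUcontext  ctx  = NULL;
    CUgreenCtx gctx = NULL;

    CUresult rc = cuStreamGetDevice(hStream, &dev);
    if (rc != CUDA_SUCCESS)
        return rc;
    /* The _v2 getter additionally reports the green context, if any. */
    return cuStreamGetCtx_v2(hStream, &ctx, &gctx);
}
```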
@@ -24330,6 +26096,15 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
 
 CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
 CUresult CUDAAPI cuStreamUpdateCaptureDependencies_v2(CUstream hStream, CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags);
+
+CUresult CUDAAPI cuMemBatchDecompressAsync(
+    CUmemDecompressParams *paramsArray,
+    size_t count,
+    unsigned int flags,
+    size_t *errorIndex,
+    CUstream stream
+);
+
 CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags);
 
 #elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
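Usage sketch (editor's illustration): the layout of CUmemDecompressParams is not shown in this diff, so the caller is assumed to have filled `paramsArray` (source, destination, sizes, algorithm) already; `errorIndex` is assumed to receive the index of the first failing operation.

```c
static CUresult decompress_batch(CUmemDecompressParams *paramsArray,
                                 size_t count, CUstream stream)
{
    size_t errorIndex = 0;
    return cuMemBatchDecompressAsync(paramsArray, count, 0 /* flags */,
                                     &errorIndex, stream);
}
```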
@@ -24344,6 +26119,152 @@ static inline CUresult cuGetProcAddress_v2_ptsz(const char *symbol, void **funcP
 #define cuGetProcAddress_v2 cuGetProcAddress_v2_ptsz
 #endif
 
+/**
+ * \defgroup CUDA_CHECKPOINT CUDA Checkpointing
+ *
+ * ___MANBRIEF___ CUDA checkpoint and restore functionality of the low-level
+ * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the checkpoint and restore functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * The CUDA checkpoint and restore APIs provide a way to save and restore GPU
+ * state for full process checkpoints when used with CPU-side process
+ * checkpointing solutions. They can also be used to pause GPU work and suspend
+ * a CUDA process to allow other applications to make use of GPU resources.
+ *
+ * Checkpoint and restore capabilities are currently restricted to Linux.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the restore thread ID for a CUDA process
+ *
+ * Returns in \p *tid the thread ID of the CUDA restore thread for the process
+ * specified by \p pid.
+ *
+ * \param pid - The process ID of the CUDA process
+ * \param tid - Returned restore thread ID
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuCheckpointProcessGetRestoreThreadId(int pid, int *tid);
+
+/**
+ * \brief Returns the process state of a CUDA process
+ *
+ * Returns in \p *state the current state of the CUDA process specified by \p pid.
+ *
+ * \param pid   - The process ID of the CUDA process
+ * \param state - Returned CUDA process state
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuCheckpointProcessGetState(int pid, CUprocessState *state);
+
+/**
+ * \brief Lock a running CUDA process
+ *
+ * Locks the CUDA process specified by \p pid, which blocks further CUDA API
+ * calls. The process must be in the RUNNING state in order to lock.
+ *
+ * Upon successful return the process will be in the LOCKED state.
+ *
+ * If \p timeoutMs is specified and the timeout is reached, the process will be
+ * left in the RUNNING state upon return.
+ *
+ * \param pid  - The process ID of the CUDA process
+ * \param args - Optional lock operation arguments
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * ::CUDA_ERROR_NOT_READY
+ */
+CUresult CUDAAPI cuCheckpointProcessLock(int pid, CUcheckpointLockArgs *args);
+
+/**
+ * \brief Checkpoint a CUDA process's GPU memory contents
+ *
+ * Checkpoints a CUDA process specified by \p pid that is in the LOCKED
+ * state. The GPU memory contents will be brought into host memory and all
+ * underlying references will be released. The process must be in the LOCKED
+ * state to checkpoint.
+ *
+ * Upon successful return the process will be in the CHECKPOINTED state.
+ *
+ * \param pid  - The process ID of the CUDA process
+ * \param args - Optional checkpoint operation arguments
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuCheckpointProcessCheckpoint(int pid, CUcheckpointCheckpointArgs *args);
+
+/**
+ * \brief Restore a CUDA process's GPU memory contents from its last checkpoint
+ *
+ * Restores a CUDA process specified by \p pid from its last checkpoint. The
+ * process must be in the CHECKPOINTED state to restore.
+ *
+ * Upon successful return the process will be in the LOCKED state.
+ *
+ * CUDA process restore requires persistence mode to be enabled or ::cuInit to
+ * have been called before execution.
+ *
+ * \param pid  - The process ID of the CUDA process
+ * \param args - Optional restore operation arguments
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa
+ * ::cuInit
+ */
+CUresult CUDAAPI cuCheckpointProcessRestore(int pid, CUcheckpointRestoreArgs *args);
+
+/**
+ * \brief Unlock a CUDA process to allow CUDA API calls
+ *
+ * Unlocks a process specified by \p pid, allowing it to resume making CUDA API
+ * calls. The process must be in the LOCKED state.
+ *
+ * Upon successful return the process will be in the RUNNING state.
+ *
+ * \param pid  - The process ID of the CUDA process
+ * \param args - Optional unlock operation arguments
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuCheckpointProcessUnlock(int pid, CUcheckpointUnlockArgs *args);
+
+/** @} */ /* End CUDA_CHECKPOINT */
+
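The doc comments above define a small state machine: RUNNING -> LOCKED -> CHECKPOINTED -> LOCKED -> RUNNING. A round-trip sketch (editor's illustration; the `args` structs are documented as optional and are assumed to accept NULL, and every CUresult should be checked in real code):

```c
static CUresult checkpoint_round_trip(int pid)
{
    CUprocessState state;
    CUresult rc = cuCheckpointProcessGetState(pid, &state);
    if (rc != CUDA_SUCCESS)
        return rc;

    rc = cuCheckpointProcessLock(pid, NULL);         /* RUNNING -> LOCKED */
    if (rc != CUDA_SUCCESS)
        return rc;
    rc = cuCheckpointProcessCheckpoint(pid, NULL);   /* LOCKED -> CHECKPOINTED */
    if (rc != CUDA_SUCCESS)
        return rc;
    /* GPU memory now resides on the host; a CPU-side checkpointer (e.g. CRIU)
     * could snapshot the whole process here. */
    rc = cuCheckpointProcessRestore(pid, NULL);      /* CHECKPOINTED -> LOCKED */
    if (rc != CUDA_SUCCESS)
        return rc;
    return cuCheckpointProcessUnlock(pid, NULL);     /* LOCKED -> RUNNING */
}
```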
 #ifdef __cplusplus
 }
 #endif