triton-windows 3.2.0.post11__cp312-cp312-win_amd64.whl → 3.3.0a0.post11__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (68)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +3 -3
  3. triton/_internal_testing.py +59 -4
  4. triton/_utils.py +35 -0
  5. triton/backends/amd/compiler.py +121 -74
  6. triton/backends/amd/driver.py +77 -43
  7. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +28 -49
  8. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +35 -9
  9. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +761 -284
  10. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +9 -3
  11. triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +1391 -0
  12. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +3 -3
  13. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +44 -0
  14. triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +288 -0
  15. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +110 -14
  16. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +504 -103
  17. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +2 -1
  18. triton/backends/amd/include/hip/amd_detail/host_defines.h +4 -0
  19. triton/backends/amd/include/hip/hip_ext.h +4 -2
  20. triton/backends/amd/include/hip/hip_fp8.h +33 -0
  21. triton/backends/amd/include/hip/hip_runtime_api.h +375 -33
  22. triton/backends/amd/include/hip/hip_version.h +3 -3
  23. triton/backends/amd/include/hip/hiprtc.h +25 -25
  24. triton/backends/amd/include/hsa/amd_hsa_elf.h +40 -14
  25. triton/backends/amd/include/hsa/hsa.h +11 -2
  26. triton/backends/amd/include/hsa/hsa_api_trace.h +30 -17
  27. triton/backends/amd/include/hsa/hsa_api_trace_version.h +68 -0
  28. triton/backends/amd/include/hsa/hsa_ext_amd.h +83 -27
  29. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +46 -46
  30. triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +416 -0
  31. triton/backends/amd/include/roctracer/hip_ostream_ops.h +84 -4
  32. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +260 -0
  33. triton/backends/amd/include/roctracer/hsa_prof_str.h +51 -19
  34. triton/backends/amd/lib/asanrtl.bc +0 -0
  35. triton/backends/compiler.py +25 -225
  36. triton/backends/driver.py +7 -2
  37. triton/backends/nvidia/bin/ptxas.exe +0 -0
  38. triton/backends/nvidia/compiler.py +135 -90
  39. triton/backends/nvidia/driver.c +0 -1
  40. triton/backends/nvidia/driver.py +135 -49
  41. triton/backends/nvidia/include/cuda.h +2162 -241
  42. triton/backends/nvidia/lib/x64/cuda.lib +0 -0
  43. triton/compiler/__init__.py +2 -2
  44. triton/compiler/code_generator.py +334 -231
  45. triton/compiler/compiler.py +77 -66
  46. triton/language/__init__.py +22 -5
  47. triton/language/core.py +448 -74
  48. triton/language/extra/cuda/_experimental_tma.py +3 -5
  49. triton/language/math.py +1 -1
  50. triton/language/random.py +2 -1
  51. triton/language/semantic.py +206 -52
  52. triton/language/standard.py +35 -18
  53. triton/runtime/_allocation.py +32 -0
  54. triton/runtime/autotuner.py +27 -32
  55. triton/runtime/build.py +1 -48
  56. triton/runtime/cache.py +6 -6
  57. triton/runtime/errors.py +10 -0
  58. triton/runtime/interpreter.py +179 -45
  59. triton/runtime/jit.py +149 -190
  60. triton/testing.py +39 -11
  61. triton/tools/compile.py +27 -20
  62. triton/tools/{compile.c → extra/cuda/compile.c} +1 -0
  63. triton/tools/mxfp.py +301 -0
  64. {triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/METADATA +5 -2
  65. {triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/RECORD +68 -59
  66. {triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/top_level.txt +2 -0
  67. /triton/tools/{compile.h → extra/cuda/compile.h} +0 -0
  68. {triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/WHEEL +0 -0
@@ -89,6 +89,7 @@ typedef uint64_t cuuint64_t;
  #define cuDeviceTotalMem cuDeviceTotalMem_v2
  #define cuCtxCreate cuCtxCreate_v2
  #define cuCtxCreate_v3 cuCtxCreate_v3
+ #define cuCtxCreate_v4 cuCtxCreate_v4
  #define cuModuleGetGlobal cuModuleGetGlobal_v2
  #define cuMemGetInfo cuMemGetInfo_v2
  #define cuMemAlloc cuMemAlloc_v2
@@ -115,6 +116,8 @@ typedef uint64_t cuuint64_t;
  #define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2)
  #define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2)
  #define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2)
+ #define cuMemcpyBatchAsync __CUDA_API_PTSZ(cuMemcpyBatchAsync)
+ #define cuMemcpy3DBatchAsync __CUDA_API_PTSZ(cuMemcpy3DBatchAsync)
  #define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2)
  #define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2)
  #define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2)
@@ -183,7 +186,9 @@ typedef uint64_t cuuint64_t;
  #define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority)
  #define cuStreamGetId __CUDA_API_PTSZ(cuStreamGetId)
  #define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags)
+ #define cuStreamGetDevice __CUDA_API_PTSZ(cuStreamGetDevice)
  #define cuStreamGetCtx __CUDA_API_PTSZ(cuStreamGetCtx)
+ #define cuStreamGetCtx_v2 __CUDA_API_PTSZ(cuStreamGetCtx_v2)
  #define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent)
  #define cuStreamEndCapture __CUDA_API_PTSZ(cuStreamEndCapture)
  #define cuStreamIsCapturing __CUDA_API_PTSZ(cuStreamIsCapturing)
@@ -202,6 +207,7 @@ typedef uint64_t cuuint64_t;
  #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources)
  #define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources)
 
+
  #define cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel)
 
  #define cuSignalExternalSemaphoresAsync __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync)
@@ -223,6 +229,8 @@ typedef uint64_t cuuint64_t;
 
  #endif
 
+ #define cuMemBatchDecompressAsync __CUDA_API_PTSZ(cuMemBatchDecompressAsync)
+
  /**
  * \file cuda.h
  * \brief Header file for the CUDA Toolkit application programming interface.
@@ -244,7 +252,7 @@ typedef uint64_t cuuint64_t;
  /**
  * CUDA API version number
  */
- #define CUDA_VERSION 12040
+ #define CUDA_VERSION 12080
 
  #ifdef __cplusplus
  extern "C" {
@@ -263,7 +271,7 @@ typedef CUdeviceptr_v2 CUdeviceptr; /**< CUDA device po
 
  typedef int CUdevice_v1; /**< CUDA device */
  typedef CUdevice_v1 CUdevice; /**< CUDA device */
- typedef struct CUctx_st *CUcontext; /**< CUDA context */
+ typedef struct CUctx_st *CUcontext; /**< A regular context handle */
  typedef struct CUmod_st *CUmodule; /**< CUDA module */
  typedef struct CUfunc_st *CUfunction; /**< CUDA function */
  typedef struct CUlib_st *CUlibrary; /**< CUDA library */
@@ -289,6 +297,12 @@ typedef struct CUuserObject_st *CUuserObject; /**< CUDA user obje
  typedef cuuint64_t CUgraphConditionalHandle; /**< CUDA graph conditional handle */
  typedef struct CUgraphDeviceUpdatableNode_st *CUgraphDeviceNode; /**< CUDA graph device node handle */
  typedef struct CUasyncCallbackEntry_st *CUasyncCallbackHandle; /**< CUDA async notification callback handle */
+ /*!
+ * \typedef typedef struct CUgreenCtx_st* CUgreenCtx
+ * A green context handle. This handle can be used safely from only one CPU thread at a time.
+ * Created via ::cuGreenCtxCreate
+ */
+ typedef struct CUgreenCtx_st *CUgreenCtx;
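The new CUgreenCtx handle is consumed by the green-context entry points, which are not part of this hunk. A minimal sketch of carving a green context out of a device's SM resources, assuming the CUDA 12.4+ helpers (cuDeviceGetDevResource, cuDevSmResourceSplitByCount, cuDevResourceGenerateDesc, cuGreenCtxCreate); error checking elided:

#include <cuda.h>

/* Sketch: build a green context over a 16-SM partition of `dev`. */
static CUresult makeGreenCtx(CUdevice dev, CUgreenCtx *phCtx) {
    CUdevResource sm, split, remaining;
    CUdevResourceDesc desc;
    unsigned int nbGroups = 1;
    cuDeviceGetDevResource(dev, &sm, CU_DEV_RESOURCE_TYPE_SM);
    /* Ask for one group of at least 16 SMs; the rest lands in `remaining`. */
    cuDevSmResourceSplitByCount(&split, &nbGroups, &sm, &remaining, 0, 16);
    cuDevResourceGenerateDesc(&desc, &split, 1);
    /* The resulting handle must be used from one CPU thread at a time. */
    return cuGreenCtxCreate(phCtx, desc, dev, CU_GREEN_CTX_DEFAULT_STREAM);
}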
 
  #ifndef CU_UUID_HAS_BEEN_DEFINED
  #define CU_UUID_HAS_BEEN_DEFINED
@@ -617,41 +631,58 @@ typedef void (*CUasyncCallback)(CUasyncNotificationInfo *info, void *userData, C
  * Array formats
  */
  typedef enum CUarray_format_enum {
- CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */
- CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
- CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
- CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */
- CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */
- CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */
- CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */
- CU_AD_FORMAT_FLOAT = 0x20, /**< 32-bit floating point */
- CU_AD_FORMAT_NV12 = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */
- CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */
- CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */
- CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */
- CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */
- CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */
- CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */
- CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, /**< 1 channel signed 8-bit normalized integer */
- CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, /**< 2 channel signed 8-bit normalized integer */
- CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, /**< 4 channel signed 8-bit normalized integer */
- CU_AD_FORMAT_SNORM_INT16X1 = 0xc9, /**< 1 channel signed 16-bit normalized integer */
- CU_AD_FORMAT_SNORM_INT16X2 = 0xca, /**< 2 channel signed 16-bit normalized integer */
- CU_AD_FORMAT_SNORM_INT16X4 = 0xcb, /**< 4 channel signed 16-bit normalized integer */
- CU_AD_FORMAT_BC1_UNORM = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */
- CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/
- CU_AD_FORMAT_BC2_UNORM = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */
- CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding*/
- CU_AD_FORMAT_BC3_UNORM = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */
- CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding*/
- CU_AD_FORMAT_BC4_UNORM = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */
- CU_AD_FORMAT_BC4_SNORM = 0x98, /**< 1 channel signed normalized block-compressed (BC4 compression) format */
- CU_AD_FORMAT_BC5_UNORM = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */
- CU_AD_FORMAT_BC5_SNORM = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */
- CU_AD_FORMAT_BC6H_UF16 = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */
- CU_AD_FORMAT_BC6H_SF16 = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */
- CU_AD_FORMAT_BC7_UNORM = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */
- CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */
+ CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */
+ CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
+ CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
+ CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */
+ CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */
+ CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */
+ CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */
+ CU_AD_FORMAT_FLOAT = 0x20, /**< 32-bit floating point */
+ CU_AD_FORMAT_NV12 = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */
+ CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */
+ CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */
+ CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */
+ CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */
+ CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */
+ CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */
+ CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, /**< 1 channel signed 8-bit normalized integer */
+ CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, /**< 2 channel signed 8-bit normalized integer */
+ CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, /**< 4 channel signed 8-bit normalized integer */
+ CU_AD_FORMAT_SNORM_INT16X1 = 0xc9, /**< 1 channel signed 16-bit normalized integer */
+ CU_AD_FORMAT_SNORM_INT16X2 = 0xca, /**< 2 channel signed 16-bit normalized integer */
+ CU_AD_FORMAT_SNORM_INT16X4 = 0xcb, /**< 4 channel signed 16-bit normalized integer */
+ CU_AD_FORMAT_BC1_UNORM = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */
+ CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/
+ CU_AD_FORMAT_BC2_UNORM = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */
+ CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding*/
+ CU_AD_FORMAT_BC3_UNORM = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */
+ CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding*/
+ CU_AD_FORMAT_BC4_UNORM = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */
+ CU_AD_FORMAT_BC4_SNORM = 0x98, /**< 1 channel signed normalized block-compressed (BC4 compression) format */
+ CU_AD_FORMAT_BC5_UNORM = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */
+ CU_AD_FORMAT_BC5_SNORM = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */
+ CU_AD_FORMAT_BC6H_UF16 = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */
+ CU_AD_FORMAT_BC6H_SF16 = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */
+ CU_AD_FORMAT_BC7_UNORM = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */
+ CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */
+ CU_AD_FORMAT_P010 = 0x9f, /**< 10-bit YUV planar format, with 4:2:0 sampling */
+ CU_AD_FORMAT_P016 = 0xa1, /**< 16-bit YUV planar format, with 4:2:0 sampling */
+ CU_AD_FORMAT_NV16 = 0xa2, /**< 8-bit YUV planar format, with 4:2:2 sampling */
+ CU_AD_FORMAT_P210 = 0xa3, /**< 10-bit YUV planar format, with 4:2:2 sampling */
+ CU_AD_FORMAT_P216 = 0xa4, /**< 16-bit YUV planar format, with 4:2:2 sampling */
+ CU_AD_FORMAT_YUY2 = 0xa5, /**< 2 channel, 8-bit YUV packed planar format, with 4:2:2 sampling */
+ CU_AD_FORMAT_Y210 = 0xa6, /**< 2 channel, 10-bit YUV packed planar format, with 4:2:2 sampling */
+ CU_AD_FORMAT_Y216 = 0xa7, /**< 2 channel, 16-bit YUV packed planar format, with 4:2:2 sampling */
+ CU_AD_FORMAT_AYUV = 0xa8, /**< 4 channel, 8-bit YUV packed planar format, with 4:4:4 sampling */
+ CU_AD_FORMAT_Y410 = 0xa9, /**< 10-bit YUV packed planar format, with 4:4:4 sampling */
+ CU_AD_FORMAT_Y416 = 0xb1, /**< 4 channel, 12-bit YUV packed planar format, with 4:4:4 sampling */
+ CU_AD_FORMAT_Y444_PLANAR8 = 0xb2, /**< 3 channel 8-bit YUV planar format, with 4:4:4 sampling */
+ CU_AD_FORMAT_Y444_PLANAR10 = 0xb3, /**< 3 channel 10-bit YUV planar format, with 4:4:4 sampling */
+ CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4, /**< 3 channel 8-bit YUV semi-planar format, with 4:4:4 sampling */
+ CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5, /**< 3 channel 16-bit YUV semi-planar format, with 4:4:4 sampling */
+ CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50, /**< 4 channel unorm R10G10B10A2 RGB format */
+ CU_AD_FORMAT_MAX = 0x7FFFFFFF
  } CUarray_format;
 
  /**
@@ -811,11 +842,17 @@ typedef enum CUdevice_attribute_enum {
  CU_DEVICE_ATTRIBUTE_TENSOR_MAP_ACCESS_SUPPORTED = 127, /**< Device supports accessing memory using Tensor Map. */
  CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED = 128, /**< Device supports exporting memory to a fabric handle with cuMemExportToShareableHandle() or requested with cuMemCreate() */
  CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS = 129, /**< Device supports unified function pointers. */
- CU_DEVICE_ATTRIBUTE_NUMA_CONFIG = 130,
- CU_DEVICE_ATTRIBUTE_NUMA_ID = 131,
+ CU_DEVICE_ATTRIBUTE_NUMA_CONFIG = 130, /**< NUMA configuration of a device: value is of type ::CUdeviceNumaConfig enum */
+ CU_DEVICE_ATTRIBUTE_NUMA_ID = 131, /**< NUMA node ID of the GPU memory */
  CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED = 132, /**< Device supports switch multicast and reduction operations. */
  CU_DEVICE_ATTRIBUTE_MPS_ENABLED = 133, /**< Indicates if contexts created on this device will be shared via MPS */
  CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID = 134, /**< NUMA ID of the host node closest to the device. Returns -1 when system does not support NUMA. */
+ CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED = 135, /**< Device supports CIG with D3D12. */
+ CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK = 136, /**< The returned valued shall be interpreted as a bitmask, where the individual bits are described by the ::CUmemDecompressAlgorithm enum. */
+ CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_MAXIMUM_LENGTH = 137, /**< The returned valued is the maximum length in bytes of a single decompress operation that is allowed. */
+ CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID = 139, /**< The combined 16-bit PCI device ID and 16-bit PCI vendor ID. */
+ CU_DEVICE_ATTRIBUTE_GPU_PCI_SUBSYSTEM_ID = 140, /**< The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID. */
+ CU_DEVICE_ATTRIBUTE_HOST_NUMA_MULTINODE_IPC_SUPPORTED = 143, /**< Device supports HOST_NUMA location IPC between nodes in a multi-node system. */
  CU_DEVICE_ATTRIBUTE_MAX
  } CUdevice_attribute;
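The new attributes are ordinary integer queries through the existing cuDeviceGetAttribute entry point; a small sketch (the bit meanings of the mask come from the ::CUmemDecompressAlgorithm enum, which this diff does not show):

#include <cuda.h>
#include <stdio.h>

int main(void) {
    CUdevice dev;
    int algoMask = 0, maxLen = 0, pciIds = 0;
    cuInit(0);
    cuDeviceGet(&dev, 0);
    /* Bitmask of hardware decompression algorithms the device supports. */
    cuDeviceGetAttribute(&algoMask, CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK, dev);
    /* Largest single decompress operation, in bytes. */
    cuDeviceGetAttribute(&maxLen, CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_MAXIMUM_LENGTH, dev);
    /* 16-bit PCI device ID combined with the 16-bit vendor ID. */
    cuDeviceGetAttribute(&pciIds, CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID, dev);
    printf("decompress mask=0x%x, max op=%d bytes, pci=0x%08x\n", algoMask, maxLen, pciIds);
    return 0;
}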
 
@@ -860,6 +897,7 @@ typedef enum CUpointer_attribute_enum {
  CU_POINTER_ATTRIBUTE_MAPPING_SIZE = 18, /**< Size of the actual underlying mapping that the pointer belongs to **/
  CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR = 19, /**< The start address of the mapping that the pointer belongs to **/
  CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID = 20 /**< A process-wide unique id corresponding to the physical allocation the pointer belongs to **/
+ , CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE = 21 /**< Returns in \p *data a boolean that indicates whether the pointer points to memory that is capable to be used for hardware accelerated decompression. */
  } CUpointer_attribute;
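The new attribute is queried like the existing boolean pointer attributes; returning the boolean as an int is an assumption carried over from attributes such as CU_POINTER_ATTRIBUTE_IS_MANAGED:

/* Sketch: can this allocation feed the hardware decompression engines? */
static int isHwDecompressCapable(CUdeviceptr ptr) {
    int capable = 0;
    cuPointerGetAttribute(&capable, CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE, ptr);
    return capable;
}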
 
  /**
@@ -1449,27 +1487,36 @@ typedef enum CUjit_option_enum
  */
  typedef enum CUjit_target_enum
  {
- CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */
- CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */
- CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */
- CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */
- CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */
- CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */
- CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */
- CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/
- CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/
- CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/
- CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/
- CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2.*/
- CU_TARGET_COMPUTE_75 = 75, /**< Compute device class 7.5.*/
- CU_TARGET_COMPUTE_80 = 80, /**< Compute device class 8.0.*/
- CU_TARGET_COMPUTE_86 = 86, /**< Compute device class 8.6.*/
- CU_TARGET_COMPUTE_87 = 87, /**< Compute device class 8.7.*/
- CU_TARGET_COMPUTE_89 = 89, /**< Compute device class 8.9.*/
- CU_TARGET_COMPUTE_90 = 90, /**< Compute device class 9.0.*/
+ CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */
+ CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */
+ CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */
+ CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */
+ CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */
+ CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */
+ CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */
+ CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/
+ CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/
+ CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/
+ CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/
+ CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2.*/
+ CU_TARGET_COMPUTE_75 = 75, /**< Compute device class 7.5.*/
+ CU_TARGET_COMPUTE_80 = 80, /**< Compute device class 8.0.*/
+ CU_TARGET_COMPUTE_86 = 86, /**< Compute device class 8.6.*/
+ CU_TARGET_COMPUTE_87 = 87, /**< Compute device class 8.7.*/
+ CU_TARGET_COMPUTE_89 = 89, /**< Compute device class 8.9.*/
+ CU_TARGET_COMPUTE_90 = 90, /**< Compute device class 9.0.*/
+ CU_TARGET_COMPUTE_100 = 100, /**< Compute device class 10.0.*/
+ CU_TARGET_COMPUTE_101 = 101, /**< Compute device class 10.1.*/
+ CU_TARGET_COMPUTE_120 = 120, /**< Compute device class 12.0.*/
 
  /**< Compute device class 9.0. with accelerated features.*/
  CU_TARGET_COMPUTE_90A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_90,
+ /**< Compute device class 10.0. with accelerated features.*/
+ CU_TARGET_COMPUTE_100A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_100,
+ /**< Compute device class 10.1 with accelerated features.*/
+ CU_TARGET_COMPUTE_101A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_101,
+ /**< Compute device class 12.0. with accelerated features.*/
+ CU_TARGET_COMPUTE_120A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_120,
  } CUjit_target;
 
  /**
@@ -1585,6 +1632,9 @@ typedef enum CUlimit_enum {
  CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */
  CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint */
  CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x06, /**< A size in bytes for L2 persisting lines cache size */
+ CU_LIMIT_SHMEM_SIZE = 0x07, /**< A maximum size in bytes of shared memory available to CUDA kernels on a CIG context. Can only be queried, cannot be set */
+ CU_LIMIT_CIG_ENABLED = 0x08, /**< A non-zero value indicates this CUDA context is a CIG-enabled context. Can only be queried, cannot be set */
+ CU_LIMIT_CIG_SHMEM_FALLBACK_ENABLED = 0x09, /**< When set to zero, CUDA will fail to launch a kernel on a CIG context, instead of using the fallback path, if the kernel uses more shared memory than available */
  CU_LIMIT_MAX
  } CUlimit;
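Since both CIG limits are query-only, a context can detect CIG mode at runtime with the existing cuCtxGetLimit:

/* Sketch: report whether the current context is CIG-enabled and, if so,
   the shared-memory ceiling kernels must respect on it. */
static void reportCig(void) {
    size_t cigEnabled = 0, shmemLimit = 0;
    cuCtxGetLimit(&cigEnabled, CU_LIMIT_CIG_ENABLED);
    if (cigEnabled) {
        cuCtxGetLimit(&shmemLimit, CU_LIMIT_SHMEM_SIZE);
        /* Kernels requesting more than shmemLimit bytes take the fallback
           path, or fail if CU_LIMIT_CIG_SHMEM_FALLBACK_ENABLED is zero. */
    }
}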
 
@@ -1748,8 +1798,9 @@ typedef struct CUDA_HOST_NODE_PARAMS_v2_st {
  * Conditional node types
  */
  typedef enum CUgraphConditionalNodeType_enum {
- CU_GRAPH_COND_TYPE_IF = 0, /**< Conditional 'if' Node. Body executed once if condition value is non-zero. */
+ CU_GRAPH_COND_TYPE_IF = 0, /**< Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If \p size == 2, an optional ELSE graph is created and this is executed if the condition is zero. */
  CU_GRAPH_COND_TYPE_WHILE = 1, /**< Conditional 'while' Node. Body executed repeatedly while condition value is non-zero. */
+ CU_GRAPH_COND_TYPE_SWITCH = 2, /**< Conditional 'switch' Node. Body[n] is executed once, where 'n' is the value of the condition. If the condition does not match a body index, no body is launched. */
  } CUgraphConditionalNodeType;
 
  /**
@@ -1760,7 +1811,8 @@ typedef struct CUDA_CONDITIONAL_NODE_PARAMS {
  Handles must be created in advance of creating the node
  using ::cuGraphConditionalHandleCreate. */
  CUgraphConditionalNodeType type; /**< Type of conditional node. */
- unsigned int size; /**< Size of graph output array. Must be 1. */
+ unsigned int size; /**< Size of graph output array. Allowed values are 1 for CU_GRAPH_COND_TYPE_WHILE, 1 or 2
+ for CU_GRAPH_COND_TYPE_IF, or any value greater than zero for CU_GRAPH_COND_TYPE_SWITCH. */
  CUgraph *phGraph_out; /**< CUDA-owned array populated with conditional node child graphs during creation of the node.
  Valid for the lifetime of the conditional node.
  The contents of the graph(s) are subject to the following constraints:
@@ -1770,7 +1822,17 @@ typedef struct CUDA_CONDITIONAL_NODE_PARAMS {
  - All kernels, including kernels in nested conditionals or child graphs at any level,
  must belong to the same CUDA context.
 
- These graphs may be populated using graph node creation APIs or ::cuStreamBeginCaptureToGraph. */
+ These graphs may be populated using graph node creation APIs or ::cuStreamBeginCaptureToGraph.
+
+ CU_GRAPH_COND_TYPE_IF:
+ phGraph_out[0] is executed when the condition is non-zero. If \p size == 2, phGraph_out[1] will
+ be executed when the condition is zero.
+ CU_GRAPH_COND_TYPE_WHILE:
+ phGraph_out[0] is executed as long as the condition is non-zero.
+ CU_GRAPH_COND_TYPE_SWITCH:
+ phGraph_out[n] is executed when the condition is equal to n. If the condition >= \p size,
+ no body graph is executed.
+ */
  CUcontext ctx; /**< Context on which to run the node. Must match context used to create the handle and all body nodes. */
  } CUDA_CONDITIONAL_NODE_PARAMS;
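A sketch of a three-way 'switch' node built on these params, assuming the generic cuGraphAddNode path and the `conditional` member of CUgraphNodeParams (neither is shown in this hunk):

/* Sketch: add a switch conditional node with three body graphs. */
static void addSwitchNode(CUgraph graph, CUcontext ctx) {
    CUgraphConditionalHandle handle;
    /* Default condition value 0 selects phGraph_out[0] unless device code
       overrides it via cudaGraphSetConditional(). */
    cuGraphConditionalHandleCreate(&handle, graph, ctx, 0, CU_GRAPH_COND_ASSIGN_DEFAULT);

    CUgraphNodeParams params = {0};
    params.type = CU_GRAPH_NODE_TYPE_CONDITIONAL;
    params.conditional.handle = handle;
    params.conditional.type = CU_GRAPH_COND_TYPE_SWITCH;
    params.conditional.size = 3;   /* values 0..2 pick a body; >= 3 runs nothing */
    params.conditional.ctx = ctx;

    CUgraphNode node;
    cuGraphAddNode(&node, graph, NULL, 0, &params);
    /* params.conditional.phGraph_out[0..2] can now be populated with the
       per-case body graphs. */
}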
 
@@ -1790,23 +1852,22 @@ typedef enum CUgraphNodeType_enum {
  CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT = 9, /**< External semaphore wait node */
  CU_GRAPH_NODE_TYPE_MEM_ALLOC = 10,/**< Memory Allocation Node */
  CU_GRAPH_NODE_TYPE_MEM_FREE = 11,/**< Memory Free Node */
- CU_GRAPH_NODE_TYPE_BATCH_MEM_OP = 12 /**< Batch MemOp Node */
- ,
+ CU_GRAPH_NODE_TYPE_BATCH_MEM_OP = 12,/**< Batch MemOp Node */
  CU_GRAPH_NODE_TYPE_CONDITIONAL = 13 /**< Conditional Node
-
+
  May be used to implement a conditional execution path or loop
  inside of a graph. The graph(s) contained within the body of the conditional node
  can be selectively executed or iterated upon based on the value of a conditional
  variable.
-
+
  Handles must be created in advance of creating the node
  using ::cuGraphConditionalHandleCreate.
-
+
  The following restrictions apply to graphs which contain conditional nodes:
  The graph cannot be used in a child node.
  Only one instantiation of the graph may exist at any point in time.
  The graph cannot be cloned.
-
+
  To set the control value, supply a default value when creating the handle and/or
  call ::cudaGraphSetConditional from device code.*/
  } CUgraphNodeType;
@@ -1878,7 +1939,8 @@ typedef enum CUgraphInstantiateResult_enum
  CUDA_GRAPH_INSTANTIATE_ERROR = 1, /**< Instantiation failed for an unexpected reason which is described in the return value of the function */
  CUDA_GRAPH_INSTANTIATE_INVALID_STRUCTURE = 2, /**< Instantiation failed due to invalid structure, such as cycles */
  CUDA_GRAPH_INSTANTIATE_NODE_OPERATION_NOT_SUPPORTED = 3, /**< Instantiation for device launch failed because the graph contained an unsupported operation */
- CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED = 4 /**< Instantiation for device launch failed due to the nodes belonging to different contexts */
+ CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED = 4, /**< Instantiation for device launch failed due to the nodes belonging to different contexts */
+ CUDA_GRAPH_INSTANTIATE_CONDITIONAL_HANDLE_UNUSED = 5, /**< One or more conditional handles are not associated with conditional nodes */
  } CUgraphInstantiateResult;
 
  /**
@@ -2004,6 +2066,42 @@ typedef enum CUlaunchAttributeID_enum {
  ::CUlaunchAttributeValue::memSyncDomainMap. */
  , CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN = 10 /**< Valid for streams, graph nodes, launches. See
  ::CUlaunchAttributeValue::memSyncDomain. */
+ , CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION = 11 /**< Valid for graph nodes, launches. Set
+ ::CUlaunchAttributeValue::preferredClusterDim
+ to allow the kernel launch to specify a preferred substitute
+ cluster dimension. Blocks may be grouped according to either
+ the dimensions specified with this attribute (grouped into a
+ "preferred substitute cluster"), or the one specified with
+ ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION attribute (grouped
+ into a "regular cluster"). The cluster dimensions of a
+ "preferred substitute cluster" shall be an integer multiple
+ greater than zero of the regular cluster dimensions. The
+ device will attempt - on a best-effort basis - to group
+ thread blocks into preferred clusters over grouping them
+ into regular clusters. When it deems necessary (primarily
+ when the device temporarily runs out of physical resources
+ to launch the larger preferred clusters), the device may
+ switch to launch the regular clusters instead to attempt to
+ utilize as much of the physical device resources as possible.
+ <br>
+ Each type of cluster will have its enumeration / coordinate
+ setup as if the grid consists solely of its type of cluster.
+ For example, if the preferred substitute cluster dimensions
+ double the regular cluster dimensions, there might be
+ simultaneously a regular cluster indexed at (1,0,0), and a
+ preferred cluster indexed at (1,0,0). In this example, the
+ preferred substitute cluster (1,0,0) replaces regular
+ clusters (2,0,0) and (3,0,0) and groups their blocks.
+ <br>
+ This attribute will only take effect when a regular cluster
+ dimension has been specified. The preferred substitute
+ cluster dimension must be an integer multiple greater than
+ zero of the regular cluster dimension and must divide the
+ grid. It must also be no more than `maxBlocksPerCluster`, if
+ it is set in the kernel's `__launch_bounds__`. Otherwise it
+ must be less than the maximum value the driver can support.
+ Otherwise, setting this attribute to a value physically
+ unable to fit on any particular device is permitted. */
  , CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT = 12 /**< Valid for launches. Set
  ::CUlaunchAttributeValue::launchCompletionEvent to record the
  event.
@@ -2054,7 +2152,14 @@ typedef enum CUlaunchAttributeID_enum {
  from within the graph, the graph must be uploaded with ::cuGraphUpload before it
  is launched. For such a graph, if host-side executable graph updates are made to the
  device-updatable nodes, the graph must be uploaded before it is launched again. */
- #ifdef __CUDA_API_VERSION_INTERNAL
+ , CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 14 /**< Valid for launches. On devices where the L1 cache and shared memory use the
+ same hardware resources, setting ::CUlaunchAttributeValue::sharedMemCarveout to a
+ percentage between 0-100 signals the CUDA driver to set the shared memory carveout
+ preference, in percent of the total shared memory for that kernel launch.
+ This attribute takes precedence over ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT.
+ This is only a hint, and the CUDA driver can choose a different configuration if
+ required for the launch. */
+ #if defined(__CUDA_API_VERSION_INTERNAL) && !defined(__CUDA_API_VERSION_INTERNAL_ODR)
  , CU_LAUNCH_ATTRIBUTE_MAX
  #endif
  } CUlaunchAttributeID;
@@ -2092,27 +2197,64 @@ typedef union CUlaunchAttributeValue_union {
  scheduling policy preference for the kernel. */
  int programmaticStreamSerializationAllowed; /**< Value of launch attribute
  ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION. */
+ /**
+ * Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
+ * with the following fields:
+ * - \p CUevent event - Event to fire when all blocks trigger it.
+ * - \p Event record flags, see ::cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
+ * - \p triggerAtBlockStart - If this is set to non-0, each block launch will automatically trigger the event.
+ */
  struct {
- CUevent event; /**< Event to fire when all blocks trigger it */
- int flags; /**< Event record flags, see ::cuEventRecordWithFlags. Does not accept
- ::CU_EVENT_RECORD_EXTERNAL. */
- int triggerAtBlockStart; /**< If this is set to non-0, each block launch will automatically trigger the event */
- } programmaticEvent; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT. */
+ CUevent event;
+ int flags;
+ int triggerAtBlockStart;
+ } programmaticEvent;
+ /**
+ * Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT
+ * with the following fields:
+ * - \p CUevent event - Event to fire when the last block launches
+ * - \p int flags; - Event record flags, see ::cuEventRecordWithFlags. Does not accept ::CU_EVENT_RECORD_EXTERNAL.
+ */
  struct {
- CUevent event; /**< Event to fire when the last block launches */
- int flags; /**< Event record flags, see ::cuEventRecordWithFlags. Does not accept ::CU_EVENT_RECORD_EXTERNAL. */
- } launchCompletionEvent; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT. */
+ CUevent event;
+ int flags;
+ } launchCompletionEvent;
  int priority; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PRIORITY. Execution priority of the kernel. */
  CUlaunchMemSyncDomainMap memSyncDomainMap; /**< Value of launch attribute
  ::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP. See
  ::CUlaunchMemSyncDomainMap. */
  CUlaunchMemSyncDomain memSyncDomain; /**< Value of launch attribute
  ::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN. See::CUlaunchMemSyncDomain */
+ /**
+ * Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION
+ * that represents the desired preferred cluster dimensions for the kernel.
+ * Opaque type with the following fields:
+ * - \p x - The X dimension of the preferred cluster, in blocks. Must
+ * be a divisor of the grid X dimension, and must be a
+ * multiple of the \p x field of ::CUlaunchAttributeValue::clusterDim.
+ * - \p y - The Y dimension of the preferred cluster, in blocks. Must
+ * be a divisor of the grid Y dimension, and must be a
+ * multiple of the \p y field of ::CUlaunchAttributeValue::clusterDim.
+ * - \p z - The Z dimension of the preferred cluster, in blocks. Must be
+ * equal to the \p z field of ::CUlaunchAttributeValue::clusterDim.
+ */
+ struct {
+ unsigned int x;
+ unsigned int y;
+ unsigned int z;
+ } preferredClusterDim;
 
+ /**
+ * Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE.
+ * with the following fields:
+ * - \p int deviceUpdatable - Whether or not the resulting kernel node should be device-updatable.
+ * - \p CUgraphDeviceNode devNode - Returns a handle to pass to the various device-side update functions.
+ */
  struct {
- int deviceUpdatable; /**< Whether or not the resulting kernel node should be device-updatable. */
- CUgraphDeviceNode devNode; /**< Returns a handle to pass to the various device-side update functions. */
- } deviceUpdatableKernelNode; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. */
+ int deviceUpdatable;
+ CUgraphDeviceNode devNode;
+ } deviceUpdatableKernelNode;
+ unsigned int sharedMemCarveout; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT. */
  } CUlaunchAttributeValue;
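A launch-side sketch combining the regular and preferred cluster attributes through the existing CUDA 12 extensible-launch API (cuLaunchKernelEx and CUlaunchConfig):

/* Sketch: regular clusters of 2 blocks, preferred substitute clusters of 4. */
static void launchWithPreferredClusters(CUfunction f, CUstream s, void **args) {
    CUlaunchAttribute attrs[2];
    attrs[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
    attrs[0].value.clusterDim.x = 2;
    attrs[0].value.clusterDim.y = 1;
    attrs[0].value.clusterDim.z = 1;
    attrs[1].id = CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION;
    attrs[1].value.preferredClusterDim.x = 4; /* multiple of clusterDim.x, divides gridDimX */
    attrs[1].value.preferredClusterDim.y = 1; /* multiple of clusterDim.y */
    attrs[1].value.preferredClusterDim.z = 1; /* must equal clusterDim.z */

    CUlaunchConfig cfg = {0};
    cfg.gridDimX = 64;  cfg.gridDimY = 1;  cfg.gridDimZ = 1; /* divisible by both cluster sizes */
    cfg.blockDimX = 128; cfg.blockDimY = 1; cfg.blockDimZ = 1;
    cfg.hStream = s;
    cfg.attrs = attrs;
    cfg.numAttrs = 2;
    cuLaunchKernelEx(&cfg, f, args, NULL);
}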
 
  /**
@@ -2148,7 +2290,9 @@ typedef CUlaunchAttributeID CUkernelNodeAttrID;
  #define CU_KERNEL_NODE_ATTRIBUTE_PRIORITY CU_LAUNCH_ATTRIBUTE_PRIORITY
  #define CU_KERNEL_NODE_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP
  #define CU_KERNEL_NODE_ATTRIBUTE_MEM_SYNC_DOMAIN CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN
+ #define CU_KERNEL_NODE_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION
  #define CU_KERNEL_NODE_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE
+ #define CU_KERNEL_NODE_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
 
  typedef CUlaunchAttributeValue CUkernelNodeAttrValue_v1;
  typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue;
@@ -2231,6 +2375,29 @@ typedef struct CUexecAffinityParam_st {
  */
  typedef CUexecAffinityParam_v1 CUexecAffinityParam;
 
+ typedef enum CUcigDataType_enum {
+ CIG_DATA_TYPE_D3D12_COMMAND_QUEUE = 0x1, /** D3D12 Command Queue Handle */
+ } CUcigDataType;
+
+ /**
+ * CIG Context Create Params
+ */
+ typedef struct CUctxCigParam_st {
+ CUcigDataType sharedDataType;
+ void* sharedData;
+ } CUctxCigParam;
+
+ /**
+ * Params for creating CUDA context
+ * Exactly one of execAffinityParams and cigParams
+ * must be non-NULL.
+ */
+ typedef struct CUctxCreateParams_st {
+ CUexecAffinityParam *execAffinityParams;
+ int numExecAffinityParams;
+ CUctxCigParam *cigParams;
+ } CUctxCreateParams;
+
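These params feed cuCtxCreate_v4, documented near the end of this diff. A sketch for the D3D12 case; the (params, flags, dev) ordering of cuCtxCreate_v4 and the `queue` handle are assumptions:

/* Sketch: create a CIG context that shares a D3D12 command queue.
   `queue` is a hypothetical ID3D12CommandQueue* obtained from the graphics
   side; exactly one of cigParams/execAffinityParams may be non-NULL. */
static CUresult makeCigContext(CUdevice dev, void *queue, CUcontext *pctx) {
    CUctxCigParam cig;
    cig.sharedDataType = CIG_DATA_TYPE_D3D12_COMMAND_QUEUE;
    cig.sharedData = queue;

    CUctxCreateParams params = {0};
    params.cigParams = &cig;       /* execAffinityParams stays NULL */
    return cuCtxCreate_v4(pctx, &params, 0, dev);
}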
  /**
  * Library options to be specified with ::cuLibraryLoadData() or ::cuLibraryLoadFromFile()
  */
@@ -2502,6 +2669,17 @@ typedef enum cudaError_enum {
  */
  CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC = 225,
 
+ /**
+ * This indicates that an exception occurred on the device that is now
+ * contained by the GPU's error containment capability. Common causes are -
+ * a. Certain types of invalid accesses of peer GPU memory over nvlink
+ * b. Certain classes of hardware errors
+ * This leaves the process in an inconsistent state and any further CUDA
+ * work will return the same error. To continue using CUDA, the process must
+ * be terminated and relaunched.
+ */
+ CUDA_ERROR_CONTAINED = 226,
+
  /**
  * This indicates that the device kernel source is invalid. This includes
  * compilation/linker errors encountered in device code or user error.
@@ -2718,6 +2896,14 @@ typedef enum cudaError_enum {
  */
  CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720,
 
+ /**
+ * An exception occurred on the device while exiting a kernel using tensor memory: the
+ * tensor memory was not completely deallocated. This leaves the process in an inconsistent
+ * state and any further CUDA work will return the same error. To continue using CUDA, the
+ * process must be terminated and relaunched.
+ */
+ CUDA_ERROR_TENSOR_MEMORY_LEAK = 721,
+
  /**
  * This error indicates that the attempted operation is not permitted.
  */
@@ -2894,6 +3080,12 @@ typedef enum cudaError_enum {
  */
  CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION = 915,
 
+ /**
+ * This error indicates that an error happened during the key rotation
+ * sequence.
+ */
+ CUDA_ERROR_KEY_ROTATION = 916,
+
  /**
  * This indicates that an unknown internal error has occurred.
  */
@@ -3307,7 +3499,10 @@ typedef enum CUtensorMapDataType_enum {
  CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
  CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,
  CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,
- CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ
+ CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ,
+ CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B,
+ CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B,
+ CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B
  } CUtensorMapDataType;
 
  /**
@@ -3327,6 +3522,9 @@ typedef enum CUtensorMapSwizzle_enum {
  CU_TENSOR_MAP_SWIZZLE_32B,
  CU_TENSOR_MAP_SWIZZLE_64B,
  CU_TENSOR_MAP_SWIZZLE_128B,
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B,
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B,
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B
  } CUtensorMapSwizzle;
 
  /**
@@ -3347,6 +3545,14 @@ typedef enum CUtensorMapFloatOOBfill_enum {
  CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
  } CUtensorMapFloatOOBfill;
 
+ /**
+ * Tensor map Im2Col wide mode
+ */
+ typedef enum CUtensorMapIm2ColWideMode_enum {
+ CU_TENSOR_MAP_IM2COL_WIDE_MODE_W = 0,
+ CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128
+ } CUtensorMapIm2ColWideMode;
+
  /**
  * GPU Direct v3 tokens
  */
@@ -3418,7 +3624,7 @@ typedef enum CUexternalMemoryHandleType_enum {
  /**
  * Handle is an NvSciBuf object
  */
- CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8
+ CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8,
  } CUexternalMemoryHandleType;
 
  /**
@@ -3862,6 +4068,13 @@ typedef enum CUmemRangeHandleType_enum
  CU_MEM_RANGE_HANDLE_TYPE_MAX = 0x7FFFFFFF
  } CUmemRangeHandleType;
 
+ /**
+ * Flag for requesting handle type for address range.
+ */
+ typedef enum CUmemRangeFlags_enum {
+ CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE = 0x1 /**< Indicates that DMA_BUF handle should be mapped via PCIe BAR1 */
+ } CUmemRangeFlags;
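The flag plugs into the existing address-range handle export; a sketch, assuming the pre-existing CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD handle type on Linux:

/* Sketch: export a dma_buf fd for [dptr, dptr+size), forcing the PCIe
   BAR1 mapping path with the new flag. Returns -1 on failure. */
static int exportDmaBufPcie(CUdeviceptr dptr, size_t size) {
    int fd = -1;
    cuMemGetHandleForAddressRange(&fd, dptr, size,
                                  CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
                                  CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
    return fd;
}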
+
  /**
  * Sparse subresource types
  */
@@ -3951,6 +4164,11 @@ typedef enum CUmemAllocationCompType_enum {
  * This flag if set indicates that the memory will be used as a tile pool.
  */
  #define CU_MEM_CREATE_USAGE_TILE_POOL 0x1
+ /**
+ * This flag, if set, indicates that the memory will be used as a buffer for
+ * hardware accelerated decompression.
+ */
+ #define CU_MEM_CREATE_USAGE_HW_DECOMPRESS 0x2
 
  /**
  * Specifies the allocation properties for a allocation.
@@ -4137,6 +4355,12 @@ typedef enum CUmemPool_attribute_enum {
  CU_MEMPOOL_ATTR_USED_MEM_HIGH
  } CUmemPool_attribute;
 
+ /**
+ * This flag, if set, indicates that the memory will be used as a buffer for
+ * hardware accelerated decompression.
+ */
+ #define CU_MEM_POOL_CREATE_USAGE_HW_DECOMPRESS 0x2
+
  /**
  * Specifies the properties of allocations made from the pool.
  */
@@ -4152,7 +4376,8 @@ typedef struct CUmemPoolProps_st {
  */
  void *win32SecurityAttributes;
  size_t maxSize; /**< Maximum pool size. When set to 0, defaults to a system dependent value. */
- unsigned char reserved[56]; /**< reserved for future use, must be 0 */
+ unsigned short usage; /**< Bitmask indicating intended usage for the pool. */
+ unsigned char reserved[54]; /**< reserved for future use, must be 0 */
  } CUmemPoolProps_v1;
  typedef CUmemPoolProps_v1 CUmemPoolProps;
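The new `usage` field lines up with the pool-level decompress flag defined a few lines above; a sketch using the existing cuMemPoolCreate:

/* Sketch: a device-local memory pool whose allocations may be used as
   hardware-decompression buffers. */
static CUresult makeDecompressPool(int deviceOrdinal, CUmemoryPool *pool) {
    CUmemPoolProps props = {0};   /* reserved[] must stay zeroed */
    props.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
    props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    props.location.id = deviceOrdinal;
    props.usage = CU_MEM_POOL_CREATE_USAGE_HW_DECOMPRESS;
    return cuMemPoolCreate(pool, &props);
}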
 
@@ -4350,6 +4575,12 @@ typedef struct CUgraphNodeParams_st {
  */
  #define CUDA_ARRAY3D_DEFERRED_MAPPING 0x80
 
+ /**
+ * This flag indicates that the CUDA array will be used for hardware accelerated
+ * video encode/decode operations.
+ */
+ #define CUDA_ARRAY3D_VIDEO_ENCODE_DECODE 0x100
+
  /**
  * Override the texref format with a format inferred from the array.
  * Flag for ::cuTexRefSetArray()
@@ -4494,9 +4725,9 @@ typedef enum CUgraphDebugDot_flags_enum {
  CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES = 1<<10, /**< Adds node handles and every kernel function handle to output */
  CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS = 1<<11, /**< Adds memory alloc node parameters to output */
  CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS = 1<<12, /**< Adds memory free node parameters to output */
- CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS = 1<<13 /**< Adds batch mem op node parameters to output */
- , CU_GRAPH_DEBUG_DOT_FLAGS_EXTRA_TOPO_INFO = 1<<14 /**< Adds edge numbering information */
- , CU_GRAPH_DEBUG_DOT_FLAGS_CONDITIONAL_NODE_PARAMS = 1<<15 /**< Adds conditional node parameters to output */
+ CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS = 1<<13, /**< Adds batch mem op node parameters to output */
+ CU_GRAPH_DEBUG_DOT_FLAGS_EXTRA_TOPO_INFO = 1<<14, /**< Adds edge numbering information */
+ CU_GRAPH_DEBUG_DOT_FLAGS_CONDITIONAL_NODE_PARAMS = 1<<15 /**< Adds conditional node parameters to output */
  } CUgraphDebugDot_flags;
 
  /**
@@ -4528,11 +4759,180 @@ typedef enum CUgraphInstantiate_flags_enum {
  priority of the stream it is launched into. */
  } CUgraphInstantiate_flags;
 
+ /**
+ * CUDA device NUMA configuration
+ */
  typedef enum CUdeviceNumaConfig_enum {
  CU_DEVICE_NUMA_CONFIG_NONE = 0, /**< The GPU is not a NUMA node */
  CU_DEVICE_NUMA_CONFIG_NUMA_NODE, /**< The GPU is a NUMA node, CU_DEVICE_ATTRIBUTE_NUMA_ID contains its NUMA ID */
  } CUdeviceNumaConfig;
 
+ /**
+ * CUDA Process States
+ */
+ typedef enum CUprocessState_enum {
+ CU_PROCESS_STATE_RUNNING = 0, /**< Default process state */
+ CU_PROCESS_STATE_LOCKED, /**< CUDA API locks are taken so further CUDA API calls will block */
+ CU_PROCESS_STATE_CHECKPOINTED, /**< Application memory contents have been checkpointed and underlying allocations and device handles have been released */
+ CU_PROCESS_STATE_FAILED, /**< Application entered an uncorrectable error during the checkpoint/restore process */
+ } CUprocessState;
+
+ /**
+ * CUDA checkpoint optional lock arguments
+ */
+ typedef struct CUcheckpointLockArgs_st {
+ unsigned int timeoutMs; /**< Timeout in milliseconds to attempt to lock the process, 0 indicates no timeout */
+ unsigned int reserved0; /**< Reserved for future use, must be zero */
+ cuuint64_t reserved1[7]; /**< Reserved for future use, must be zeroed */
+ } CUcheckpointLockArgs;
+
+ /**
+ * CUDA checkpoint optional checkpoint arguments
+ */
+ typedef struct CUcheckpointCheckpointArgs_st {
+ cuuint64_t reserved[8]; /**< Reserved for future use, must be zeroed */
+ } CUcheckpointCheckpointArgs;
+
+ /**
+ * CUDA checkpoint optional restore arguments
+ */
+ typedef struct CUcheckpointRestoreArgs_st {
+ cuuint64_t reserved[8]; /**< Reserved for future use, must be zeroed */
+ } CUcheckpointRestoreArgs;
+
+ /**
+ * CUDA checkpoint optional unlock arguments
+ */
+ typedef struct CUcheckpointUnlockArgs_st {
+ cuuint64_t reserved[8]; /**< Reserved for future use, must be zeroed */
+ } CUcheckpointUnlockArgs;
+
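These argument structs parameterize the CUDA 12.8 process-checkpoint entry points (cuCheckpointProcessLock and friends); their existence and the (pid, args) shape are assumptions, since the functions themselves are not part of this hunk. A sketch of the state machine they drive:

/* Sketch: checkpoint/restore another CUDA process from a supervisor.
   State moves RUNNING -> LOCKED -> CHECKPOINTED -> (restore) -> RUNNING. */
static void checkpointRoundTrip(int pid) {
    CUcheckpointLockArgs lock = {0};
    lock.timeoutMs = 5000;                    /* give in-flight API calls 5 s */
    cuCheckpointProcessLock(pid, &lock);      /* -> CU_PROCESS_STATE_LOCKED */
    cuCheckpointProcessCheckpoint(pid, NULL); /* -> CU_PROCESS_STATE_CHECKPOINTED */
    /* ... device memory is released here; migrate or suspend the process ... */
    cuCheckpointProcessRestore(pid, NULL);
    cuCheckpointProcessUnlock(pid, NULL);     /* -> CU_PROCESS_STATE_RUNNING */
}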
4809
+
4810
+ /**
4811
+ * Flags to specify for copies within a batch. For more details see ::cuMemcpyBatchAsync.
4812
+ */
4813
+ typedef enum CUmemcpyFlags_enum {
4814
+ CU_MEMCPY_FLAG_DEFAULT = 0x0,
4815
+
4816
+ /**
4817
+ * Hint to the driver to try and overlap the copy with compute work on the SMs.
4818
+ */
4819
+ CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE = 0x1
4820
+ } CUmemcpyFlags;
4821
+
4822
+ /**
4823
+ * These flags allow applications to convey the source access ordering CUDA must maintain.
4824
+ * The destination will always be accessed in stream order.
4825
+ */
4826
+ typedef enum CUmemcpySrcAccessOrder_enum {
4827
+ /**
4828
+ * Default invalid.
4829
+ */
4830
+ CU_MEMCPY_SRC_ACCESS_ORDER_INVALID = 0x0,
4831
+
4832
+ /**
4833
+ * Indicates that access to the source pointer must be in stream order.
4834
+ */
4835
+ CU_MEMCPY_SRC_ACCESS_ORDER_STREAM = 0x1,
4836
+
4837
+ /**
4838
+ * Indicates that access to the source pointer can be out of stream order and
4839
+ * all accesses must be complete before the API call returns. This flag is suited for
4840
+ * ephemeral sources (ex., stack variables) when it's known that no prior operations
4841
+ * in the stream can be accessing the memory and also that the lifetime of the memory
4842
+ * is limited to the scope that the source variable was declared in. Specifying
4843
+ * this flag allows the driver to optimize the copy and removes the need for the user
4844
+ * to synchronize the stream after the API call.
4845
+ */
4846
+ CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL = 0x2,
4847
+
4848
+ /**
4849
+ * Indicates that access to the source pointer can be out of stream order and the accesses
4850
+ * can happen even after the API call returns. This flag is suited for host pointers
4851
+ * allocated outside CUDA (ex., via malloc) when it's known that no prior operations
4852
+ * in the stream can be accessing the memory. Specifying this flag allows the driver
4853
+ * to optimize the copy on certain platforms.
4854
+ */
4855
+ CU_MEMCPY_SRC_ACCESS_ORDER_ANY = 0x3,
4856
+
4857
+ CU_MEMCPY_SRC_ACCESS_ORDER_MAX = 0x7FFFFFFF
4858
+ } CUmemcpySrcAccessOrder;
4859
+
4860
+ /**
4861
+ * Attributes specific to copies within a batch. For more details on usage see ::cuMemcpyBatchAsync.
4862
+ */
4863
+ typedef struct CUmemcpyAttributes_st {
4864
+ CUmemcpySrcAccessOrder srcAccessOrder; /**< Source access ordering to be observed for copies with this attribute. */
4865
+ CUmemLocation srcLocHint; /**< Hint location for the source operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
4866
+ CUmemLocation dstLocHint; /**< Hint location for the destination operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
4867
+ unsigned int flags; /**< Additional flags for copies with this attribute. See ::CUmemcpyFlags */
4868
+ } CUmemcpyAttributes_v1;
4869
+ typedef CUmemcpyAttributes_v1 CUmemcpyAttributes;
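Only the attribute struct above is definitive; a sketch of how it would parameterize cuMemcpyBatchAsync, whose exact argument list is an assumption here:

/* Sketch: batch of `count` copies from ephemeral host staging buffers.
   DURING_API_CALL lets the driver read the sources out of stream order,
   as long as all reads finish before the call returns. */
static void batchCopy(CUdeviceptr *dsts, CUdeviceptr *srcs, size_t *sizes,
                      size_t count, CUstream stream) {
    CUmemcpyAttributes attr = {0};
    attr.srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL;
    attr.flags = CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE;
    size_t attrIdx = 0;   /* attr applies from copy index 0 onward */
    size_t failIdx = 0;   /* on error, index of the failing copy */
    cuMemcpyBatchAsync(dsts, srcs, sizes, count,
                       &attr, &attrIdx, 1, &failIdx, stream);
}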
4870
+
4871
+ /**
4872
+ * These flags allow applications to convey the operand type for individual copies specified in ::cuMemcpy3DBatchAsync.
4873
+ */
4874
+ typedef enum CUmemcpy3DOperandType_enum {
4875
+ CU_MEMCPY_OPERAND_TYPE_POINTER = 0x1, /**< Memcpy operand is a valid pointer. */
4876
+ CU_MEMCPY_OPERAND_TYPE_ARRAY = 0x2, /**< Memcpy operand is a CUarray. */
4877
+ CU_MEMCPY_OPERAND_TYPE_MAX = 0x7FFFFFFF
4878
+ } CUmemcpy3DOperandType;
4879
+
4880
+ /**
4881
+ * Struct representing offset into a CUarray in elements
4882
+ */
4883
+ typedef struct CUoffset3D_st {
4884
+ size_t x;
4885
+ size_t y;
4886
+ size_t z;
4887
+ } CUoffset3D_v1;
4888
+ typedef CUoffset3D_v1 CUoffset3D;
4889
+
4890
+ /**
4891
+ * Struct representing width/height/depth of a CUarray in elements
4892
+ */
4893
+ typedef struct CUextent3D_st {
4894
+ size_t width;
4895
+ size_t height;
4896
+ size_t depth;
4897
+ } CUextent3D_v1;
4898
+ typedef CUextent3D_v1 CUextent3D;
4899
+
4900
+ /**
4901
+ * Struct representing an operand for copy with ::cuMemcpy3DBatchAsync
4902
+ */
4903
+ typedef struct CUmemcpy3DOperand_st {
4904
+ CUmemcpy3DOperandType type;
4905
+ union {
4906
+ /**
4907
+ * Struct representing an operand when ::CUmemcpy3DOperand::type is ::CU_MEMCPY_OPERAND_TYPE_POINTER
4908
+ */
4909
+ struct {
4910
+ CUdeviceptr ptr;
4911
+ size_t rowLength; /**< Length of each row in elements. */
4912
+ size_t layerHeight; /**< Height of each layer in elements. */
4913
+ CUmemLocation locHint; /**< Hint location for the operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
4914
+ } ptr;
4915
+
4916
+ /**
4917
+ * Struct representing an operand when ::CUmemcpy3DOperand::type is ::CU_MEMCPY_OPERAND_TYPE_ARRAY
4918
+ */
4919
+ struct {
4920
+ CUarray array;
4921
+ CUoffset3D offset;
4922
+ } array;
4923
+ } op;
4924
+ } CUmemcpy3DOperand_v1;
4925
+ typedef CUmemcpy3DOperand_v1 CUmemcpy3DOperand;
4926
+
4927
+ typedef struct CUDA_MEMCPY3D_BATCH_OP_st {
4928
+ CUmemcpy3DOperand src; /**< Source memcpy operand. */
4929
+ CUmemcpy3DOperand dst; /**< Destination memcpy operand. */
4930
+ CUextent3D extent; /**< Extents of the memcpy between src and dst. The width, height and depth components must not be 0.*/
4931
+ CUmemcpySrcAccessOrder srcAccessOrder; /**< Source access ordering to be observed for copy from src to dst. */
4932
+ unsigned int flags; /**< Additional flags for copies with this attribute. See ::CUmemcpyFlags */
4933
+ } CUDA_MEMCPY3D_BATCH_OP_v1;
4934
+ typedef CUDA_MEMCPY3D_BATCH_OP_v1 CUDA_MEMCPY3D_BATCH_OP;
4935
+
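The struct layout above is verbatim; the (numOps, opList, failIdx, flags, stream) ordering of cuMemcpy3DBatchAsync used below is an assumption:

/* Sketch: one linear-pointer -> CUarray copy expressed as a batch op. */
static void copyPtrToArray(CUdeviceptr src, CUarray dst,
                           size_t width, size_t height, CUstream stream) {
    CUDA_MEMCPY3D_BATCH_OP op = {0};
    op.src.type = CU_MEMCPY_OPERAND_TYPE_POINTER;
    op.src.op.ptr.ptr = src;
    op.src.op.ptr.rowLength = width;    /* elements per row */
    op.src.op.ptr.layerHeight = height; /* rows per layer */
    op.dst.type = CU_MEMCPY_OPERAND_TYPE_ARRAY;
    op.dst.op.array.array = dst;        /* offset stays (0,0,0) */
    op.extent.width = width;
    op.extent.height = height;
    op.extent.depth = 1;                /* extents must be non-zero */
    op.srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;

    size_t failIdx = 0;
    cuMemcpy3DBatchAsync(1, &op, &failIdx, 0ULL, stream);
}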
  /** @} */ /* END CUDA_TYPES */
 
  #if defined(__GNUC__)
@@ -5124,6 +5524,12 @@ CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements,
  * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here.
  * - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC
  * - ::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED: Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays.
+ * - ::CU_DEVICE_ATTRIBUTE_NUMA_CONFIG: NUMA configuration of a device: value is of type ::CUdeviceNumaConfig enum
+ * - ::CU_DEVICE_ATTRIBUTE_NUMA_ID: NUMA node ID of the GPU memory
+ * - ::CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED: Device supports switch multicast and reduction operations.
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID: The combined 16-bit PCI device ID and 16-bit PCI vendor ID.
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_PCI_SUBSYSTEM_ID: The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor
+ * ID.
  *
  * \param pi - Returned device attribute value
  * \param attrib - Device attribute to query
  * \param attrib - Device attribute to query
@@ -5310,6 +5716,15 @@ CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type
5310
5716
  * determined by comparing the numerical values between the two enums, with
5311
5717
  * smaller scopes having smaller values.
5312
5718
  *
5719
+ * On platforms that support GPUDirect RDMA writes via more than one path in
5720
+ * hardware (see ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE), the user should
5721
+ * consider those paths as belonging to separate ordering domains. Note that in
5722
+ * such cases the CUDA driver will report both RDMA writes ordering and RDMA write
5723
+ * scope as ALL_DEVICES and a call to ::cuFlushGPUDirectRDMAWrites() will be a no-op,
5724
+ * but when these multiple paths are used simultaneously, it is the user's
5725
+ * responsibility to ensure ordering by using mechanisms outside the scope of
5726
+ * CUDA.
5727
+ *
5313
5728
  * Users may query support for this API via
5314
5729
  * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS.
5315
5730
  *
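A hedged usage sketch for the flush path described above: query whether the device exposes host-initiated flushing and, if so, flush writes targeting the current context. The attribute and enum names are taken from the flush-writes API family and should be checked against the header; dev is assumed to be a valid CUdevice:

    int flushOpts = 0;
    cuDeviceGetAttribute(&flushOpts,
                         CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, dev);
    if (flushOpts & CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
        /* Make prior GPUDirect RDMA writes visible to consumers in the current context. */
        cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
                                   CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER);
    }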
@@ -5991,6 +6406,161 @@ CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
5991
6406
  */
5992
6407
  CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev);
5993
6408
 
6409
+ /**
6410
+ * \brief Create a CUDA context
6411
+ *
6412
+ * Creates a new CUDA context and associates it with the calling thread. The
6413
+ * \p flags parameter is described below. The context is created with a usage
6414
+ * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy()
6415
+ * when done using the context. If a context is already current to the thread,
6416
+ * it is supplanted by the newly created context and may be restored by a subsequent
6417
+ * call to ::cuCtxPopCurrent().
6418
+ *
6419
+ * A CUDA context can be created with execution affinity. The type and the amount of
6420
+ * execution resources the context can use are limited by \p paramsArray and \p numExecAffinityParams
6421
+ * in \p execAffinity. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numExecAffinityParams
6422
+ * describes the size of the paramsArray. If two \p CUexecAffinityParam in the array have the same type,
6423
+ * the latter execution affinity parameter overrides the former execution affinity parameter.
6424
+ * The supported execution affinity types are:
6425
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion
6426
+ * of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally
6427
+ * rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution
6428
+ * affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute
6429
+ * is only supported under Volta+ MPS.
6430
+ *
6431
+ * A CUDA context can be created in CIG (CUDA in Graphics) mode by setting \p cigParams.
6432
+ * Data from graphics client is shared with CUDA via the \p sharedData in \p cigParams.
6433
+ * Support for D3D12 graphics client can be determined using ::cuDeviceGetAttribute() with
6434
+ * ::CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED. \p sharedData is an ID3D12CommandQueue handle.
6435
+ * Either \p execAffinityParams or \p cigParams can be set to a non-null value. Setting both to a
6436
+ * non-null value will result in undefined behavior.
6437
+ *
6438
+ * The three LSBs of the \p flags parameter can be used to control how the OS
6439
+ * thread, which owns the CUDA context at the time of an API call, interacts
6440
+ * with the OS scheduler when waiting for results from the GPU. Only one of
6441
+ * the scheduling flags can be set when creating a context.
6442
+ *
6443
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
6444
+ * results from the GPU. This can decrease latency when waiting for the GPU,
6445
+ * but may lower the performance of CPU threads if they are performing work in
6446
+ * parallel with the CUDA thread.
6447
+ *
6448
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
6449
+ * results from the GPU. This can increase latency when waiting for the GPU,
6450
+ * but can increase the performance of CPU threads performing work in parallel
6451
+ * with the GPU.
6452
+ *
6453
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
6454
+ * synchronization primitive when waiting for the GPU to finish work.
6455
+ *
6456
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
6457
+ * synchronization primitive when waiting for the GPU to finish work. <br>
6458
+ * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
6459
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
6460
+ *
6461
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
6462
+ * uses a heuristic based on the number of active CUDA contexts in the
6463
+ * process \e C and the number of logical processors in the system \e P. If
6464
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
6465
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
6466
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
6467
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
6468
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
6469
+ * for low-powered devices.
6470
+ *
6471
+ * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
6472
+ * This flag must be set in order to allocate pinned host memory that is
6473
+ * accessible to the GPU.
6474
+ *
6475
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
6476
+ * after resizing local memory for a kernel. This can prevent thrashing by
6477
+ * local memory allocations when launching many kernels with high local
6478
+ * memory usage at the cost of potentially increased memory usage. <br>
6479
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
6480
+ * by this flag is now the default and cannot be disabled.
6481
+ * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
6482
+ *
6483
+ * - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally
6484
+ * with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can
6485
+ * be set during context creation to instruct CUDA to create a coredump if
6486
+ * this context raises an exception during execution. These environment variables
6487
+ * are described in the CUDA-GDB user guide under the "GPU core dump support"
6488
+ * section.
6489
+ * The initial attributes will be taken from the global attributes at the time of
6490
+ * context creation. The other attributes that control coredump output can be
6491
+ * modified by calling ::cuCoredumpSetAttribute from the created context after
6492
+ * it becomes current. This flag is not supported when CUDA context is created in
6493
+ * CIG (CUDA in Graphics) mode.
6494
+ *
6495
+ * - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not
6496
+ * been enabled globally with ::cuCoredumpSetAttributeGlobal or environment
6497
+ * variables, this flag can be set during context creation to instruct CUDA to
6498
+ * create a coredump if data is written to a certain pipe that is present in the
6499
+ * OS space. These environment variables are described in the CUDA-GDB user
6500
+ * guide under the "GPU core dump support" section.
6501
+ * It is important to note that the pipe name *must* be set with
6502
+ * ::cuCoredumpSetAttributeGlobal before creating the context if this flag is
6503
+ * used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set.
6504
+ * The initial attributes will be taken from the global attributes at the time of
6505
+ * context creation. The other attributes that control coredump output can be
6506
+ * modified by calling ::cuCoredumpSetAttribute from the created context after
6507
+ * it becomes current.
6508
+ * Setting this flag on any context creation is equivalent to setting the
6509
+ * ::CU_COREDUMP_ENABLE_USER_TRIGGER attribute to \p true globally.
6510
+ * This flag is not supported when CUDA context is created in
6511
+ * CIG (CUDA in Graphics) mode.
6512
+ *
6513
+ * - ::CU_CTX_SYNC_MEMOPS: Ensures that synchronous memory operations initiated
6514
+ * on this context will always synchronize. See further documentation in the
6515
+ * section titled "API Synchronization behavior" to learn more about cases when
6516
+ * synchronous memory operations can exhibit asynchronous behavior.
6517
+ *
6518
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
6519
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
6520
+ * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
6521
+ * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
6522
+ * the compute mode for devices.
6523
+ * Documentation for <i>nvidia-smi</i> can be obtained by passing a
6524
+ * -h option to it.
6525
+ *
6526
+ * Context creation will fail with ::CUDA_ERROR_INVALID_VALUE if an invalid parameter was
6527
+ * passed by the client to create the CUDA context.
6528
+ *
6529
+ * Context creation in CIG mode will fail with ::CUDA_ERROR_NOT_SUPPORTED if CIG is not supported
6530
+ * by the device or the driver.
6531
+ * \param pctx - Returned context handle of the new context
6532
+ * \param ctxCreateParams - Context creation parameters
6533
+ * \param flags - Context creation flags
6534
+ * \param dev - Device to create context on
6535
+ *
6536
+ * \return
6537
+ * ::CUDA_SUCCESS,
6538
+ * ::CUDA_ERROR_DEINITIALIZED,
6539
+ * ::CUDA_ERROR_NOT_INITIALIZED,
6540
+ * ::CUDA_ERROR_INVALID_CONTEXT,
6541
+ * ::CUDA_ERROR_INVALID_DEVICE,
6542
+ * ::CUDA_ERROR_INVALID_VALUE,
6543
+ * ::CUDA_ERROR_NOT_SUPPORTED,
6544
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
6545
+ * ::CUDA_ERROR_UNKNOWN
6546
+ * \notefnerr
6547
+ *
6548
+ * \sa ::cuCtxDestroy,
6549
+ * ::cuCtxGetApiVersion,
6550
+ * ::cuCtxGetCacheConfig,
6551
+ * ::cuCtxGetDevice,
6552
+ * ::cuCtxGetFlags,
6553
+ * ::cuCtxGetLimit,
6554
+ * ::cuCtxPopCurrent,
6555
+ * ::cuCtxPushCurrent,
6556
+ * ::cuCtxSetCacheConfig,
6557
+ * ::cuCtxSetLimit,
6558
+ * ::cuCoredumpSetAttributeGlobal,
6559
+ * ::cuCoredumpSetAttribute,
6560
+ * ::cuCtxSynchronize
6561
+ */
6562
+ CUresult CUDAAPI cuCtxCreate_v4(CUcontext *pctx, CUctxCreateParams *ctxCreateParams, unsigned int flags, CUdevice dev);
6563
+
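A minimal creation sketch for the _v4 entry point (a fragment, assuming <cuda.h> and <string.h> are included and \p dev was obtained via ::cuDeviceGet). A zero-filled CUctxCreateParams requests neither execution affinity nor CIG mode; the struct's field layout is not shown in this hunk, so zero-initialization is the safe illustration:

    CUcontext ctx = NULL;
    CUctxCreateParams createParams;
    memset(&createParams, 0, sizeof(createParams));  /* no affinity, no CIG */
    CUresult rc = cuCtxCreate_v4(&ctx, &createParams,
                                 CU_CTX_SCHED_BLOCKING_SYNC, dev);
    if (rc == CUDA_SUCCESS) {
        /* ... use the context ... */
        cuCtxDestroy(ctx);
    }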
5994
6564
  /**
5995
6565
  * \brief Destroy a CUDA context
5996
6566
  *
@@ -6002,9 +6572,11 @@ CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArra
6002
6572
  * Destroys and cleans up all resources associated with the context.
6003
6573
  * It is the caller's responsibility to ensure that the context or its resources
6004
6574
  * are not accessed or passed in subsequent API calls and doing so will result in undefined behavior.
6005
- * These resources include CUDA types such as ::CUmodule, ::CUfunction, ::CUstream, ::CUevent,
6575
+ * These resources include CUDA types ::CUmodule, ::CUfunction, ::CUstream, ::CUevent,
6006
6576
  * ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref,
6007
6577
  * ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore.
6578
+ * These resources also include memory allocations by ::cuMemAlloc(), ::cuMemAllocHost(),
6579
+ * ::cuMemAllocManaged() and ::cuMemAllocPitch().
6008
6580
  *
6009
6581
  * If \p ctx is current to the calling thread then \p ctx will also be
6010
6582
  * popped from the current thread's context stack (as though ::cuCtxPopCurrent()
@@ -6012,6 +6584,10 @@ CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArra
6012
6584
  * remain current to those threads, and attempting to access \p ctx from
6013
6585
  * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED.
6014
6586
  *
6587
+ * \note ::cuCtxDestroy() will not destroy memory allocations by ::cuMemCreate(), ::cuMemAllocAsync() and
6588
+ * ::cuMemAllocFromPoolAsync(). These memory allocations are not associated with any CUDA context and need to
6589
+ * be destroyed explicitly.
6590
+ *
6015
6591
  * \param ctx - Context to destroy
6016
6592
  *
6017
6593
  * \return
@@ -6158,11 +6734,11 @@ CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx);
6158
6734
  CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx);
6159
6735
 
6160
6736
  /**
6161
- * \brief Returns the device ID for the current context
6737
+ * \brief Returns the device handle for the current context
6162
6738
  *
6163
- * Returns in \p *device the ordinal of the current context's device.
6739
+ * Returns in \p *device the handle of the current context's device.
6164
6740
  *
6165
- * \param device - Returned device ID for the current context
6741
+ * \param device - Returned device handle for the current context
6166
6742
  *
6167
6743
  * \return
6168
6744
  * ::CUDA_SUCCESS,
@@ -6278,9 +6854,11 @@ CUresult CUDAAPI cuCtxSetFlags(unsigned int flags);
6278
6854
  CUresult CUDAAPI cuCtxGetId(CUcontext ctx, unsigned long long *ctxId);
6279
6855
 
6280
6856
  /**
6281
- * \brief Block for a context's tasks to complete
6857
+ * \brief Block for the current context's tasks to complete
6282
6858
  *
6283
- * Blocks until the device has completed all preceding requested tasks.
6859
+ * Blocks until the current context has completed all preceding requested tasks.
6860
+ * If the current context is the primary context, green contexts that have been
6861
+ * created will also be synchronized.
6284
6862
  * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed.
6285
6863
  * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the
6286
6864
  * CPU thread will block until the GPU context has finished its work.
@@ -6662,14 +7240,87 @@ CUresult CUDAAPI cuCtxResetPersistingL2Cache(void);
6662
7240
  */
6663
7241
  CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);
6664
7242
 
6665
-
6666
- /** @} */ /* END CUDA_CTX */
6667
-
6668
7243
  /**
6669
- * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED]
7244
+ * \brief Records an event.
6670
7245
  *
6671
- * ___MANBRIEF___ deprecated context management functions of the low-level CUDA
6672
- * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
7246
+ * Captures in \p hEvent all the activities of the context \p hCtx
7247
+ * at the time of this call. \p hEvent and \p hCtx must be from the same
7248
+ * CUDA context, otherwise ::CUDA_ERROR_INVALID_HANDLE will be returned.
7249
+ * Calls such as ::cuEventQuery() or ::cuCtxWaitEvent() will then examine
7250
+ * or wait for completion of the work that was captured.
7251
+ * Uses of \p hCtx after this call do not modify \p hEvent.
7252
+ * If the context passed to \p hCtx is the primary context, \p hEvent will
7253
+ * capture all the activities of the primary context and its green contexts.
7254
+ * If the context passed to \p hCtx is a context converted from a green context
7255
+ * via ::cuCtxFromGreenCtx(), \p hEvent will capture only the activities of the green context.
7256
+ *
7257
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED if the
7258
+ * specified context \p hCtx has a stream in the capture mode. In such a case,
7259
+ * the call will invalidate all the conflicting captures.
7260
+ *
7261
+ * \param hCtx - Context to record event for
7262
+ * \param hEvent - Event to record
7263
+ *
7264
+ * \return
7265
+ * ::CUDA_SUCCESS,
7266
+ * ::CUDA_ERROR_DEINITIALIZED,
7267
+ * ::CUDA_ERROR_NOT_INITIALIZED,
7268
+ * ::CUDA_ERROR_INVALID_CONTEXT,
7269
+ * ::CUDA_ERROR_INVALID_HANDLE,
7270
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
7271
+ *
7272
+ * \sa
7273
+ * ::cuCtxWaitEvent,
7274
+ * ::cuGreenCtxRecordEvent,
7275
+ * ::cuGreenCtxWaitEvent,
7276
+ * ::cuEventRecord
7277
+ */
7278
+ CUresult CUDAAPI cuCtxRecordEvent(CUcontext hCtx, CUevent hEvent);
7279
+
7280
+ /**
7281
+ * \brief Make a context wait on an event
7282
+ *
7283
+ * Makes all future work submitted to context \p hCtx wait for all work
7284
+ * captured in \p hEvent. The synchronization will be performed on the device
7285
+ * and will not block the calling CPU thread. See ::cuCtxRecordEvent()
7286
+ * for details on what is captured by an event.
7287
+ * If the context passed to \p hCtx is the primary context, the primary context
7288
+ * and its green contexts will wait for \p hEvent.
7289
+ * If the context passed to \p hCtx is a context converted from a green context
7290
+ * via ::cuCtxFromGreenCtx(), the green context will wait for \p hEvent.
7291
+ *
7292
+ * \note \p hEvent may be from a different context or device than \p hCtx.
7293
+ *
7294
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED and
7295
+ * invalidate the capture if the specified event \p hEvent is part of an ongoing
7296
+ * capture sequence or if the specified context \p hCtx has a stream in the capture mode.
7297
+ *
7298
+ * \param hCtx - Context to wait
7299
+ * \param hEvent - Event to wait on
7300
+ *
7301
+ * \return
7302
+ * ::CUDA_SUCCESS,
7303
+ * ::CUDA_ERROR_DEINITIALIZED,
7304
+ * ::CUDA_ERROR_NOT_INITIALIZED,
7305
+ * ::CUDA_ERROR_INVALID_CONTEXT,
7306
+ * ::CUDA_ERROR_INVALID_HANDLE,
7307
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
7308
+ *
7309
+ * \sa
7310
+ * ::cuCtxRecordEvent,
7311
+ * ::cuGreenCtxRecordEvent,
7312
+ * ::cuGreenCtxWaitEvent,
7313
+ * ::cuStreamWaitEvent
7314
+ */
7315
+ CUresult CUDAAPI cuCtxWaitEvent(CUcontext hCtx, CUevent hEvent);
7316
+
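A small ordering sketch for the pair above, assuming ctxA and ctxB are existing contexts and ev was created while ctxA was current:

    cuCtxRecordEvent(ctxA, ev);  /* capture all work submitted to ctxA so far */
    cuCtxWaitEvent(ctxB, ev);    /* future work in ctxB waits for that capture */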
7317
+ /** @} */ /* END CUDA_CTX */
7318
+
7319
+ /**
7320
+ * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED]
7321
+ *
7322
+ * ___MANBRIEF___ deprecated context management functions of the low-level CUDA
7323
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
6673
7324
  *
6674
7325
  * This section describes the deprecated context management functions of the low-level
6675
7326
  * CUDA driver application programming interface.
@@ -7203,6 +7854,11 @@ CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hm
7203
7854
  * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
7204
7855
  * will accumulate data until the CUlinkState is destroyed.
7205
7856
  *
7857
+ * The data passed in via ::cuLinkAddData and ::cuLinkAddFile will be treated
7858
+ * as relocatable (-rdc=true to nvcc) when linking the final cubin during
7859
+ * ::cuLinkComplete and will have similar consequences as offline relocatable
7860
+ * device code linking.
7861
+ *
7206
7862
  * \p optionValues must remain valid for the life of the CUlinkState if output
7207
7863
  * options are used. No other references to inputs are maintained after this
7208
7864
  * call returns.
@@ -7471,6 +8127,7 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmod
7471
8127
  *
7472
8128
  * The \p code may be a \e cubin or \e fatbin as output by \b nvcc,
7473
8129
  * or a NULL-terminated \e PTX, either as output by \b nvcc or hand-written.
8130
+ * A fatbin should also contain relocatable code when doing separate compilation.
7474
8131
  *
7475
8132
  * Options are passed as an array via \p jitOptions and any corresponding parameters are passed in
7476
8133
  * \p jitOptionsValues. The number of total JIT options is supplied via \p numJitOptions.
@@ -7479,6 +8136,9 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmod
7479
8136
  * Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in
7480
8137
  * \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions.
7481
8138
  *
8139
+ * \note If the library contains managed variables and no device in the system
8140
+ * supports managed variables this call is expected to return ::CUDA_ERROR_NOT_SUPPORTED
8141
+ *
7482
8142
  * \param library - Returned library
7483
8143
  * \param code - Code to load
7484
8144
  * \param jitOptions - Options for JIT
@@ -7499,7 +8159,8 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmod
7499
8159
  * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
7500
8160
  * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
7501
8161
  * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
7502
- * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
8162
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND,
8163
+ * ::CUDA_ERROR_NOT_SUPPORTED
7503
8164
  *
7504
8165
  * \sa ::cuLibraryLoadFromFile,
7505
8166
  * ::cuLibraryUnload,
@@ -7528,6 +8189,7 @@ CUresult CUDAAPI cuLibraryLoadData(CUlibrary *library, const void *code,
7528
8189
  *
7529
8190
  * The file should be a \e cubin file as output by \b nvcc, or a \e PTX file either
7530
8191
  * as output by \b nvcc or handwritten, or a \e fatbin file as output by \b nvcc.
8192
+ * A fatbin should also contain relocatable code when doing separate compilation.
7531
8193
  *
7532
8194
  * Options are passed as an array via \p jitOptions and any corresponding parameters are
7533
8195
  * passed in \p jitOptionsValues. The number of total options is supplied via \p numJitOptions.
@@ -7536,6 +8198,9 @@ CUresult CUDAAPI cuLibraryLoadData(CUlibrary *library, const void *code,
7536
8198
  * Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in
7537
8199
  * \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions.
7538
8200
  *
8201
+ * \note If the library contains managed variables and no device in the system
8202
+ * supports managed variables this call is expected to return ::CUDA_ERROR_NOT_SUPPORTED
8203
+ *
7539
8204
  * \param library - Returned library
7540
8205
  * \param fileName - File to load from
7541
8206
  * \param jitOptions - Options for JIT
@@ -7556,7 +8221,8 @@ CUresult CUDAAPI cuLibraryLoadData(CUlibrary *library, const void *code,
7556
8221
  * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
7557
8222
  * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
7558
8223
  * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
7559
- * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
8224
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND,
8225
+ * ::CUDA_ERROR_NOT_SUPPORTED
7560
8226
  *
7561
8227
  * \sa ::cuLibraryLoadData,
7562
8228
  * ::cuLibraryUnload,
@@ -7702,6 +8368,29 @@ CUresult CUDAAPI cuLibraryGetModule(CUmodule *pMod, CUlibrary library);
7702
8368
  */
7703
8369
  CUresult CUDAAPI cuKernelGetFunction(CUfunction *pFunc, CUkernel kernel);
7704
8370
 
8371
+ /**
8372
+ * \brief Returns a library handle
8373
+ *
8374
+ * Returns in \p pLib the handle of the library for the requested kernel \p kernel
8375
+ *
8376
+ * \param pLib - Returned library handle
8377
+ * \param kernel - Kernel to retrieve library handle
8378
+ *
8379
+ * \return
8380
+ * ::CUDA_SUCCESS,
8381
+ * ::CUDA_ERROR_DEINITIALIZED,
8382
+ * ::CUDA_ERROR_NOT_INITIALIZED,
8383
+ * ::CUDA_ERROR_INVALID_VALUE,
8384
+ * ::CUDA_ERROR_INVALID_HANDLE,
8385
+ * ::CUDA_ERROR_NOT_FOUND
8386
+ *
8387
+ * \sa ::cuLibraryLoadData,
8388
+ * ::cuLibraryLoadFromFile,
8389
+ * ::cuLibraryUnload,
8390
+ * ::cuLibraryGetKernel
8391
+ */
8392
+ CUresult CUDAAPI cuKernelGetLibrary(CUlibrary *pLib, CUkernel kernel);
8393
+
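For symmetry with ::cuLibraryGetKernel, a sketch that walks back from a kernel to its owning library, assuming \p kernel came from a previously loaded library:

    CUlibrary owningLib = NULL;
    if (cuKernelGetLibrary(&owningLib, kernel) == CUDA_SUCCESS) {
        /* owningLib now identifies the library that kernel was loaded from. */
    }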
7705
8394
  /**
7706
8395
  * \brief Returns a global device pointer
7707
8396
  *
@@ -7744,9 +8433,6 @@ CUresult CUDAAPI cuLibraryGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUlibrary
7744
8433
  * Note that managed memory for library \p library is shared across devices and is registered
7745
8434
  * when the library is loaded into at least one context.
7746
8435
  *
7747
- * \note The API requires a CUDA context to be present and initialized on at least one device.
7748
- * If no context is present, the call returns ::CUDA_ERROR_NOT_FOUND.
7749
- *
7750
8436
  * \param dptr - Returned pointer to the managed memory
7751
8437
  * \param bytes - Returned memory size in bytes
7752
8438
  * \param library - Library to retrieve managed memory from
@@ -7923,6 +8609,9 @@ CUresult CUDAAPI cuKernelGetAttribute(int *pi, CUfunction_attribute attrib, CUke
7923
8609
  * positive. The validity of the cluster dimensions is checked at launch time.
7924
8610
  * If the value is set during compile time, it cannot be set at runtime.
7925
8611
  * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
8612
+ * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
8613
+ * the function can be launched with non-portable cluster size. 1 is allowed,
8614
+ * 0 is disallowed.
7926
8615
  * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
7927
8616
  * scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
7928
8617
  *
@@ -8222,9 +8911,10 @@ CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t Width
8222
8911
  * ::cuMemAllocPitch(), ::cuMemAllocManaged(), ::cuMemAllocAsync(), ::cuMemAllocFromPoolAsync()
8223
8912
  *
8224
8913
  * Note - This API will not perform any implicit synchronization when the pointer was allocated with
8225
- * ::cuMemAllocAsync or ::cuMemAllocFromPoolAsync. Callers must ensure that all accesses to the
8914
+ * ::cuMemAllocAsync or ::cuMemAllocFromPoolAsync. Callers must ensure that all accesses to these
8226
8915
  * pointers have completed before invoking ::cuMemFree. For best performance and memory reuse, users
8227
8916
  * should use ::cuMemFreeAsync to free memory allocated via the stream ordered memory allocator.
8917
+ * For all other pointers, this API may perform implicit synchronization.
8228
8918
  *
8229
8919
  * \param dptr - Pointer to memory to free
8230
8920
  *
@@ -8776,7 +9466,8 @@ CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev);
8776
9466
  *
8777
9467
  * IPC functionality is restricted to devices with support for unified
8778
9468
  * addressing on Linux and Windows operating systems.
8779
- * IPC functionality on Windows is restricted to GPUs in TCC mode
9469
+ * IPC functionality on Windows is supported for compatibility purposes
9470
+ * but is not recommended as it comes with a performance cost.
8780
9471
  * Users can test their device for IPC functionality by calling
8781
9472
  * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
8782
9473
  *
@@ -8819,7 +9510,8 @@ CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event);
8819
9510
  *
8820
9511
  * IPC functionality is restricted to devices with support for unified
8821
9512
  * addressing on Linux and Windows operating systems.
8822
- * IPC functionality on Windows is restricted to GPUs in TCC mode
9513
+ * IPC functionality on Windows is supported for compatibility purposes
9514
+ * but is not recommended as it comes with a performance cost.
8823
9515
  * Users can test their device for IPC functionality by calling
8824
9516
  * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
8825
9517
  *
@@ -8864,7 +9556,8 @@ CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle)
8864
9556
  *
8865
9557
  * IPC functionality is restricted to devices with support for unified
8866
9558
  * addressing on Linux and Windows operating systems.
8867
- * IPC functionality on Windows is restricted to GPUs in TCC mode
9559
+ * IPC functionality on Windows is supported for compatibility purposes
9560
+ * but is not recommended as it comes with a performance cost.
8868
9561
  * Users can test their device for IPC functionality by calling
8869
9562
  * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
8870
9563
  *
@@ -8919,7 +9612,8 @@ CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
8919
9612
  *
8920
9613
  * IPC functionality is restricted to devices with support for unified
8921
9614
  * addressing on Linux and Windows operating systems.
8922
- * IPC functionality on Windows is restricted to GPUs in TCC mode
9615
+ * IPC functionality on Windows is supported for compatibility purposes
9616
+ * but is not recommended as it comes with a performance cost.
8923
9617
  * Users can test their device for IPC functionality by calling
8924
9618
  * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
8925
9619
  *
@@ -8964,7 +9658,8 @@ CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, u
8964
9658
  *
8965
9659
  * IPC functionality is restricted to devices with support for unified
8966
9660
  * addressing on Linux and Windows operating systems.
8967
- * IPC functionality on Windows is restricted to GPUs in TCC mode
9661
+ * IPC functionality on Windows is supported for compatibility purposes
9662
+ * but is not recommended as it comes with a performance cost.
8968
9663
  * Users can test their device for IPC functionality by calling
8969
9664
  * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
8970
9665
  *
@@ -10643,6 +11338,153 @@ CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
10643
11338
  */
10644
11339
  CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
10645
11340
 
11341
+ /**
11342
+ * \brief Performs a batch of memory copies asynchronously.
11343
+ *
11344
+ * Performs a batch of memory copies. The batch as a whole executes in stream order but copies within a
11345
+ * batch are not guaranteed to execute in any specific order. This API only supports pointer-to-pointer copies.
11346
+ * For copies involving CUDA arrays, please see ::cuMemcpy3DBatchAsync.
11347
+ *
11348
+ * Performs memory copies from source buffers specified in \p srcs to destination buffers specified in \p dsts.
11349
+ * The size of each copy is specified in \p sizes. All three arrays must be of the same length as specified
11350
+ * by \p count. Since there are no ordering guarantees for copies within a batch, specifying any dependent copies
11351
+ * within a batch will result in undefined behavior.
11352
+ *
11353
+ * Every copy in the batch has to be associated with a set of attributes specified in the \p attrs array.
11354
+ * Each entry in this array can apply to more than one copy. This can be done by specifying in the \p attrsIdxs array,
11355
+ * the index of the first copy that the corresponding entry in the \p attrs array applies to. Both \p attrs and
11356
+ * \p attrsIdxs must be of the same length as specified by \p numAttrs. For example, if a batch has 10 copies listed
11357
+ * in dst/src/sizes, the first 6 of which have one set of attributes and the remaining 4 another, then \p numAttrs
11358
+ * will be 2, \p attrsIdxs will be {0, 6} and \p attrs will contain the two sets of attributes. Note that the first entry
11359
+ * in \p attrsIdxs must always be 0. Also, each entry must be greater than the previous entry and the last entry should be
11360
+ * less than \p count. Furthermore, \p numAttrs must be less than or equal to \p count.
11361
+ *
11362
+ * The ::CUmemcpyAttributes::srcAccessOrder indicates the source access ordering to be observed for copies associated
11363
+ * with the attribute. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM, then the source will
11364
+ * be accessed in stream order. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL then
11365
+ * it indicates that access to the source pointer can be out of stream order and all accesses must be complete before
11366
+ * the API call returns. This flag is suited for ephemeral sources (ex., stack variables) when it's known that no prior
11367
+ * operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope
11368
+ * that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the
11369
+ * need for the user to synchronize the stream after the API call. If the source access order is set to
11370
+ * ::CU_MEMCPY_SRC_ACCESS_ORDER_ANY then it indicates that access to the source pointer can be out of stream order and the
11371
+ * accesses can happen even after the API call returns. This flag is suited for host pointers allocated
11372
+ * outside CUDA (ex., via malloc) when it's known that no prior operations in the stream can be accessing the memory.
11373
+ * Specifying this flag allows the driver to optimize the copy on certain platforms. Each memcpy operation in the batch must
11374
+ * have a valid ::CUmemcpyAttributes corresponding to it including the appropriate srcAccessOrder setting, otherwise the API
11375
+ * will return ::CUDA_ERROR_INVALID_VALUE.
11376
+ *
11377
+ * The ::CUmemcpyAttributes::srcLocHint and ::CUmemcpyAttributes::dstLocHint allows applications to specify hint locations
11378
+ * for operands of a copy when the operand doesn't have a fixed location. That is, these hints are
11379
+ * only applicable for managed memory pointers on devices where ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS is true or
11380
+ * system-allocated pageable memory on devices where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS is true.
11381
+ * For other cases, these hints are ignored.
11382
+ *
11383
+ * The ::CUmemcpyAttributes::flags field can be used to specify certain flags for copies. Setting the
11384
+ * ::CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE flag indicates that the associated copies should preferably overlap with
11385
+ * any compute work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy.
11386
+ *
11387
+ * If any error is encountered while parsing the batch, the index within the batch where the error was encountered
11388
+ * will be returned in \p failIdx.
11389
+ *
11390
+ * \param dsts - Array of destination pointers.
11391
+ * \param srcs - Array of memcpy source pointers.
11392
+ * \param sizes - Array of sizes for memcpy operations.
11393
+ * \param count - Size of \p dsts, \p srcs and \p sizes arrays
11394
+ * \param attrs - Array of memcpy attributes.
11395
+ * \param attrsIdxs - Array of indices to specify which copies each entry in the \p attrs array applies to.
11396
+ The attributes specified in attrs[k] will be applied to copies starting from attrsIdxs[k]
11397
+ through attrsIdxs[k+1] - 1. Also attrs[numAttrs-1] will apply to copies starting from
11398
+ attrsIdxs[numAttrs-1] through count - 1.
11399
+ * \param numAttrs - Size of \p attrs and \p attrsIdxs arrays.
11400
+ * \param failIdx - Pointer to a location to return the index of the copy where a failure was encountered.
11401
+ The value will be SIZE_MAX if the error doesn't pertain to any specific copy.
11402
+ * \param hStream - The stream to enqueue the operations in. Must not be the legacy NULL stream.
11403
+ *
11404
+ * \return
11405
+ * ::CUDA_SUCCESS
11406
+ * ::CUDA_ERROR_DEINITIALIZED
11407
+ * ::CUDA_ERROR_NOT_INITIALIZED
11408
+ * ::CUDA_ERROR_INVALID_VALUE
11409
+ * \notefnerr
11410
+ * \note_async
11411
+ * \note_memcpy
11412
+ */
11413
+ CUresult CUDAAPI cuMemcpyBatchAsync(CUdeviceptr *dsts, CUdeviceptr *srcs, size_t *sizes, size_t count,
11414
+ CUmemcpyAttributes *attrs, size_t *attrsIdxs, size_t numAttrs,
11415
+ size_t *failIdx, CUstream hStream);
11416
+
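A sketch of the attrs/attrsIdxs layout described above: three copies where the first two share one attribute set and the third uses another. The pointers, sizes and stream are assumed to exist, <string.h> is assumed included, and the stream must not be the legacy NULL stream:

    CUdeviceptr dsts[3]  = { dstA, dstB, dstC };
    CUdeviceptr srcs[3]  = { srcA, srcB, srcC };
    size_t      sizes[3] = { bytesA, bytesB, bytesC };

    CUmemcpyAttributes attrs[2];
    memset(attrs, 0, sizeof(attrs));
    attrs[0].srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;          /* copies 0..1 */
    attrs[1].srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL; /* copy 2 */
    size_t attrsIdxs[2] = { 0, 2 };  /* attrs[k] applies from index attrsIdxs[k] onward */

    size_t failIdx = 0;
    CUresult rc = cuMemcpyBatchAsync(dsts, srcs, sizes, 3,
                                     attrs, attrsIdxs, 2, &failIdx, stream);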
11417
+ /**
11418
+ * \brief Performs a batch of 3D memory copies asynchronously.
11419
+ *
11420
+ * Performs a batch of memory copies. The batch as a whole executes in stream order but copies within a
11421
+ * batch are not guaranteed to execute in any specific order. Note that this means specifying any dependent
11422
+ * copies within a batch will result in undefined behavior.
11423
+ *
11424
+ * Performs memory copies as specified in the \p opList array. The length of this array is specified in \p numOps.
11425
+ * Each entry in this array describes a copy operation. This includes among other things, the source and destination
11426
+ * operands for the copy as specified in ::CUDA_MEMCPY3D_BATCH_OP::src and ::CUDA_MEMCPY3D_BATCH_OP::dst respectively.
11427
+ * The source and destination operands of a copy can either be a pointer or a CUDA array. The width, height and depth
11428
+ * of a copy is specified in ::CUDA_MEMCPY3D_BATCH_OP::extent. The width, height and depth of a copy are specified in
11429
+ * elements and must not be zero. For pointer-to-pointer copies, the element size is considered to be 1. For pointer
11430
+ * to CUDA array or vice versa copies, the element size is determined by the CUDA array. For CUDA array to CUDA array copies,
11431
+ * the element size of the two CUDA arrays must match.
11432
+ *
11433
+ * For a given operand, if ::CUmemcpy3DOperand::type is specified as ::CU_MEMCPY_OPERAND_TYPE_POINTER, then
11434
+ * ::CUmemcpy3DOperand::op::ptr will be used. The ::CUmemcpy3DOperand::op::ptr::ptr field must contain the pointer where
11435
+ * the copy should begin. The ::CUmemcpy3DOperand::op::ptr::rowLength field specifies the length of each row in elements and
11436
+ * must either be zero or be greater than or equal to the width of the copy specified in ::CUDA_MEMCPY3D_BATCH_OP::extent::width.
11437
+ * The ::CUmemcpy3DOperand::op::ptr::layerHeight field specifies the height of each layer and must either be zero or be greater than
11438
+ * or equal to the height of the copy specified in ::CUDA_MEMCPY3D_BATCH_OP::extent::height. When either of these values is zero,
11439
+ * that aspect of the operand is considered to be tightly packed according to the copy extent. For managed memory pointers on devices where
11440
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS is true or system-allocated pageable memory on devices where
11441
+ * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS is true, the ::CUmemcpy3DOperand::op::ptr::locHint field can be used to hint
11442
+ * the location of the operand.
11443
+ *
11444
+ * If an operand's type is specified as ::CU_MEMCPY_OPERAND_TYPE_ARRAY, then ::CUmemcpy3DOperand::op::array will be used.
11445
+ * The ::CUmemcpy3DOperand::op::array::array field specifies the CUDA array and ::CUmemcpy3DOperand::op::array::offset specifies
11446
+ * the 3D offset into that array where the copy begins.
11447
+ *
11448
+ * The ::CUmemcpyAttributes::srcAccessOrder indicates the source access ordering to be observed for copies associated
11449
+ * with the attribute. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM, then the source will
11450
+ * be accessed in stream order. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL then
11451
+ * it indicates that access to the source pointer can be out of stream order and all accesses must be complete before
11452
+ * the API call returns. This flag is suited for ephemeral sources (ex., stack variables) when it's known that no prior
11453
+ * operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope
11454
+ * that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the
11455
+ * need for the user to synchronize the stream after the API call. If the source access order is set to
11456
+ * ::CU_MEMCPY_SRC_ACCESS_ORDER_ANY then it indicates that access to the source pointer can be out of stream order and the
11457
+ * accesses can happen even after the API call returns. This flag is suited for host pointers allocated
11458
+ * outside CUDA (ex., via malloc) when it's known that no prior operations in the stream can be accessing the memory.
11459
+ * Specifying this flag allows the driver to optimize the copy on certain platforms. Each memcpy operation in \p opList must
11460
+ * have a valid srcAccessOrder setting, otherwise this API will return ::CUDA_ERROR_INVALID_VALUE.
11461
+ *
11462
+ * The ::CUmemcpyAttributes::flags field can be used to specify certain flags for copies. Setting the
11463
+ * ::CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE flag indicates that the associated copies should preferably overlap with
11464
+ * any compute work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy.
11465
+ *
11466
+ * If any error is encountered while parsing the batch, the index within the batch where the error was encountered
11467
+ * will be returned in \p failIdx.
11468
+ *
11469
+ * \param numOps - Total number of memcpy operations.
11470
+ * \param opList - Array of size \p numOps containing the actual memcpy operations.
11471
+ * \param failIdx - Pointer to a location to return the index of the copy where a failure was encountered.
11472
+ * The value will be SIZE_MAX if the error doesn't pertain to any specific copy.
11473
+ * \param flags - Flags for future use, must be zero now.
11474
+ * \param hStream - The stream to enqueue the operations in. Must not be the default NULL stream.
11475
+ *
11476
+ * \return
11477
+ * ::CUDA_SUCCESS
11478
+ * ::CUDA_ERROR_DEINITIALIZED
11479
+ * ::CUDA_ERROR_NOT_INITIALIZED
11480
+ * ::CUDA_ERROR_INVALID_VALUE
11481
+ * \notefnerr
11482
+ * \note_async
11483
+ * \note_memcpy
11484
+ */
11485
+ CUresult CUDAAPI cuMemcpy3DBatchAsync(size_t numOps, CUDA_MEMCPY3D_BATCH_OP *opList,
11486
+ size_t *failIdx, unsigned long long flags, CUstream hStream);
11487
+
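A single-op batch sketch copying a 64x64 region from a linear device buffer into a CUDA array, per the operand rules above. It assumes d_src came from ::cuMemAlloc, hArray from ::cuArrayCreate with a matching format, stream is a non-default stream, and <string.h> is included:

    CUDA_MEMCPY3D_BATCH_OP op;
    memset(&op, 0, sizeof(op));
    op.src.type = CU_MEMCPY_OPERAND_TYPE_POINTER;
    op.src.op.ptr.ptr = d_src;              /* rowLength/layerHeight 0 = tightly packed */
    op.dst.type = CU_MEMCPY_OPERAND_TYPE_ARRAY;
    op.dst.op.array.array = hArray;         /* offset left as (0,0,0) */
    op.extent.width  = 64;                  /* element size comes from the array */
    op.extent.height = 64;
    op.extent.depth  = 1;
    op.srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;

    size_t failIdx = 0;
    CUresult rc = cuMemcpy3DBatchAsync(1, &op, &failIdx, 0ULL, stream);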
10646
11488
  /**
10647
11489
  * \brief Initializes device memory
10648
11490
  *
@@ -11139,8 +11981,51 @@ CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsi
11139
11981
  CU_AD_FORMAT_SIGNED_INT16 = 0x09,
11140
11982
  CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
11141
11983
  CU_AD_FORMAT_HALF = 0x10,
11142
- CU_AD_FORMAT_FLOAT = 0x20
11143
- } CUarray_format;
11984
+ CU_AD_FORMAT_FLOAT = 0x20,
11985
+ CU_AD_FORMAT_NV12 = 0xb0,
11986
+ CU_AD_FORMAT_UNORM_INT8X1 = 0xc0,
11987
+ CU_AD_FORMAT_UNORM_INT8X2 = 0xc1,
11988
+ CU_AD_FORMAT_UNORM_INT8X4 = 0xc2,
11989
+ CU_AD_FORMAT_UNORM_INT16X1 = 0xc3,
11990
+ CU_AD_FORMAT_UNORM_INT16X2 = 0xc4,
11991
+ CU_AD_FORMAT_UNORM_INT16X4 = 0xc5,
11992
+ CU_AD_FORMAT_SNORM_INT8X1 = 0xc6,
11993
+ CU_AD_FORMAT_SNORM_INT8X2 = 0xc7,
11994
+ CU_AD_FORMAT_SNORM_INT8X4 = 0xc8,
11995
+ CU_AD_FORMAT_SNORM_INT16X1 = 0xc9,
11996
+ CU_AD_FORMAT_SNORM_INT16X2 = 0xca,
11997
+ CU_AD_FORMAT_SNORM_INT16X4 = 0xcb,
11998
+ CU_AD_FORMAT_BC1_UNORM = 0x91,
11999
+ CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92,
12000
+ CU_AD_FORMAT_BC2_UNORM = 0x93,
12001
+ CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94,
12002
+ CU_AD_FORMAT_BC3_UNORM = 0x95,
12003
+ CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96,
12004
+ CU_AD_FORMAT_BC4_UNORM = 0x97,
12005
+ CU_AD_FORMAT_BC4_SNORM = 0x98,
12006
+ CU_AD_FORMAT_BC5_UNORM = 0x99,
12007
+ CU_AD_FORMAT_BC5_SNORM = 0x9a,
12008
+ CU_AD_FORMAT_BC6H_UF16 = 0x9b,
12009
+ CU_AD_FORMAT_BC6H_SF16 = 0x9c,
12010
+ CU_AD_FORMAT_BC7_UNORM = 0x9d,
12011
+ CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e,
12012
+ CU_AD_FORMAT_P010 = 0x9f,
12013
+ CU_AD_FORMAT_P016 = 0xa1,
12014
+ CU_AD_FORMAT_NV16 = 0xa2,
12015
+ CU_AD_FORMAT_P210 = 0xa3,
12016
+ CU_AD_FORMAT_P216 = 0xa4,
12017
+ CU_AD_FORMAT_YUY2 = 0xa5,
12018
+ CU_AD_FORMAT_Y210 = 0xa6,
12019
+ CU_AD_FORMAT_Y216 = 0xa7,
12020
+ CU_AD_FORMAT_AYUV = 0xa8,
12021
+ CU_AD_FORMAT_Y410 = 0xa9,
12022
+ CU_AD_FORMAT_Y416 = 0xb1,
12023
+ CU_AD_FORMAT_Y444_PLANAR8 = 0xb2,
12024
+ CU_AD_FORMAT_Y444_PLANAR10 = 0xb3,
12025
+ CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4,
12026
+ CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5,
12027
+ CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50,
12028
+ } CUarray_format;
11144
12029
  * \endcode
11145
12030
  * - \p NumChannels specifies the number of packed components per CUDA array
11146
12031
  * element; it may be 1, 2, or 4;
@@ -11459,7 +12344,50 @@ CUresult CUDAAPI cuArrayDestroy(CUarray hArray);
11459
12344
  CU_AD_FORMAT_SIGNED_INT16 = 0x09,
11460
12345
  CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
11461
12346
  CU_AD_FORMAT_HALF = 0x10,
11462
- CU_AD_FORMAT_FLOAT = 0x20
12347
+ CU_AD_FORMAT_FLOAT = 0x20,
12348
+ CU_AD_FORMAT_NV12 = 0xb0,
12349
+ CU_AD_FORMAT_UNORM_INT8X1 = 0xc0,
12350
+ CU_AD_FORMAT_UNORM_INT8X2 = 0xc1,
12351
+ CU_AD_FORMAT_UNORM_INT8X4 = 0xc2,
12352
+ CU_AD_FORMAT_UNORM_INT16X1 = 0xc3,
12353
+ CU_AD_FORMAT_UNORM_INT16X2 = 0xc4,
12354
+ CU_AD_FORMAT_UNORM_INT16X4 = 0xc5,
12355
+ CU_AD_FORMAT_SNORM_INT8X1 = 0xc6,
12356
+ CU_AD_FORMAT_SNORM_INT8X2 = 0xc7,
12357
+ CU_AD_FORMAT_SNORM_INT8X4 = 0xc8,
12358
+ CU_AD_FORMAT_SNORM_INT16X1 = 0xc9,
12359
+ CU_AD_FORMAT_SNORM_INT16X2 = 0xca,
12360
+ CU_AD_FORMAT_SNORM_INT16X4 = 0xcb,
12361
+ CU_AD_FORMAT_BC1_UNORM = 0x91,
12362
+ CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92,
12363
+ CU_AD_FORMAT_BC2_UNORM = 0x93,
12364
+ CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94,
12365
+ CU_AD_FORMAT_BC3_UNORM = 0x95,
12366
+ CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96,
12367
+ CU_AD_FORMAT_BC4_UNORM = 0x97,
12368
+ CU_AD_FORMAT_BC4_SNORM = 0x98,
12369
+ CU_AD_FORMAT_BC5_UNORM = 0x99,
12370
+ CU_AD_FORMAT_BC5_SNORM = 0x9a,
12371
+ CU_AD_FORMAT_BC6H_UF16 = 0x9b,
12372
+ CU_AD_FORMAT_BC6H_SF16 = 0x9c,
12373
+ CU_AD_FORMAT_BC7_UNORM = 0x9d,
12374
+ CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e,
12375
+ CU_AD_FORMAT_P010 = 0x9f,
12376
+ CU_AD_FORMAT_P016 = 0xa1,
12377
+ CU_AD_FORMAT_NV16 = 0xa2,
12378
+ CU_AD_FORMAT_P210 = 0xa3,
12379
+ CU_AD_FORMAT_P216 = 0xa4,
12380
+ CU_AD_FORMAT_YUY2 = 0xa5,
12381
+ CU_AD_FORMAT_Y210 = 0xa6,
12382
+ CU_AD_FORMAT_Y216 = 0xa7,
12383
+ CU_AD_FORMAT_AYUV = 0xa8,
12384
+ CU_AD_FORMAT_Y410 = 0xa9,
12385
+ CU_AD_FORMAT_Y416 = 0xb1,
12386
+ CU_AD_FORMAT_Y444_PLANAR8 = 0xb2,
12387
+ CU_AD_FORMAT_Y444_PLANAR10 = 0xb3,
12388
+ CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4,
12389
+ CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5,
12390
+ CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50,
11463
12391
  } CUarray_format;
11464
12392
  * \endcode
11465
12393
  *
@@ -11680,7 +12608,50 @@ CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescripto
11680
12608
  CU_AD_FORMAT_SIGNED_INT16 = 0x09,
11681
12609
  CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
11682
12610
  CU_AD_FORMAT_HALF = 0x10,
11683
- CU_AD_FORMAT_FLOAT = 0x20
12611
+ CU_AD_FORMAT_FLOAT = 0x20,
12612
+ CU_AD_FORMAT_NV12 = 0xb0,
12613
+ CU_AD_FORMAT_UNORM_INT8X1 = 0xc0,
12614
+ CU_AD_FORMAT_UNORM_INT8X2 = 0xc1,
12615
+ CU_AD_FORMAT_UNORM_INT8X4 = 0xc2,
12616
+ CU_AD_FORMAT_UNORM_INT16X1 = 0xc3,
12617
+ CU_AD_FORMAT_UNORM_INT16X2 = 0xc4,
12618
+ CU_AD_FORMAT_UNORM_INT16X4 = 0xc5,
12619
+ CU_AD_FORMAT_SNORM_INT8X1 = 0xc6,
12620
+ CU_AD_FORMAT_SNORM_INT8X2 = 0xc7,
12621
+ CU_AD_FORMAT_SNORM_INT8X4 = 0xc8,
12622
+ CU_AD_FORMAT_SNORM_INT16X1 = 0xc9,
12623
+ CU_AD_FORMAT_SNORM_INT16X2 = 0xca,
12624
+ CU_AD_FORMAT_SNORM_INT16X4 = 0xcb,
12625
+ CU_AD_FORMAT_BC1_UNORM = 0x91,
12626
+ CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92,
12627
+ CU_AD_FORMAT_BC2_UNORM = 0x93,
12628
+ CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94,
12629
+ CU_AD_FORMAT_BC3_UNORM = 0x95,
12630
+ CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96,
12631
+ CU_AD_FORMAT_BC4_UNORM = 0x97,
12632
+ CU_AD_FORMAT_BC4_SNORM = 0x98,
12633
+ CU_AD_FORMAT_BC5_UNORM = 0x99,
12634
+ CU_AD_FORMAT_BC5_SNORM = 0x9a,
12635
+ CU_AD_FORMAT_BC6H_UF16 = 0x9b,
12636
+ CU_AD_FORMAT_BC6H_SF16 = 0x9c,
12637
+ CU_AD_FORMAT_BC7_UNORM = 0x9d,
12638
+ CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e,
12639
+ CU_AD_FORMAT_P010 = 0x9f,
12640
+ CU_AD_FORMAT_P016 = 0xa1,
12641
+ CU_AD_FORMAT_NV16 = 0xa2,
12642
+ CU_AD_FORMAT_P210 = 0xa3,
12643
+ CU_AD_FORMAT_P216 = 0xa4,
12644
+ CU_AD_FORMAT_YUY2 = 0xa5,
12645
+ CU_AD_FORMAT_Y210 = 0xa6,
12646
+ CU_AD_FORMAT_Y216 = 0xa7,
12647
+ CU_AD_FORMAT_AYUV = 0xa8,
12648
+ CU_AD_FORMAT_Y410 = 0xa9,
12649
+ CU_AD_FORMAT_Y416 = 0xb1,
12650
+ CU_AD_FORMAT_Y444_PLANAR8 = 0xb2,
12651
+ CU_AD_FORMAT_Y444_PLANAR10 = 0xb3,
12652
+ CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4,
12653
+ CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5,
12654
+ CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50,
11684
12655
  } CUarray_format;
11685
12656
  * \endcode
11686
12657
  *
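A creation sketch using the descriptor form shown above (a fragment, assuming <cuda.h> and <string.h> are included). A classic 4-channel float array is used here; the newer entries, for example ::CU_AD_FORMAT_UNORM_INT8X4 or the video formats, slot into the same \p Format field subject to their own channel-count rules:

    CUDA_ARRAY_DESCRIPTOR desc;
    memset(&desc, 0, sizeof(desc));
    desc.Format = CU_AD_FORMAT_FLOAT;
    desc.NumChannels = 4;       /* RGBA float texels */
    desc.Width  = 1024;
    desc.Height = 768;

    CUarray hArray = NULL;
    CUresult rc = cuArrayCreate(&hArray, &desc);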
@@ -11842,12 +12813,18 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
11842
12813
  * have identical allocation properties. Users are also expected to retrieve a
11843
12814
  * new handle every time the underlying physical allocation(s) corresponding
11844
12815
  * to a previously queried VA range are changed.
12816
+ *
12817
+ * For CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, users may set
12818
+ * flags to ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE, which, when set on a
12819
+ * supported platform, will give a DMA_BUF handle mapped via PCIe BAR1, and will
12820
+ * return an error otherwise.
11845
12821
  *
11846
12822
  * \param[out] handle - Pointer to the location where the returned handle will be stored.
11847
12823
  * \param[in] dptr - Pointer to a valid CUDA device allocation. Must be aligned to host page size.
11848
12824
  * \param[in] size - Length of the address range. Must be aligned to host page size.
11849
12825
  * \param[in] handleType - Type of handle requested (defines type and size of the \p handle output parameter)
11850
- * \param[in] flags - Reserved, must be zero
12826
+ * \param[in] flags - When requesting CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, the value may be
12827
+ * ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE, otherwise 0.
11851
12828
  *
11852
12829
  * \return
11853
12830
  * CUDA_SUCCESS
@@ -11856,6 +12833,112 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
11856
12833
  */
11857
12834
  CUresult CUDAAPI cuMemGetHandleForAddressRange(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags);
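A request sketch for the PCIe mapping flag discussed above, assuming a Linux platform and that dptr and size are host-page-aligned as required:

    int dmabufFd = -1;  /* CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD yields a file descriptor */
    CUresult rc = cuMemGetHandleForAddressRange(&dmabufFd, dptr, size,
                      CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
                      CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
    /* An error here may simply mean the platform cannot map via PCIe BAR1. */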
11858
12835
 
12836
+ /**
12837
+ * \brief Bitmasks for CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK.
12838
+ */
12839
+ typedef enum CUmemDecompressAlgorithm_enum {
12840
+ CU_MEM_DECOMPRESS_UNSUPPORTED = 0, /**< Decompression is unsupported. */
12841
+ CU_MEM_DECOMPRESS_ALGORITHM_DEFLATE = 1<<0, /**< Deflate is supported. */
12842
+ CU_MEM_DECOMPRESS_ALGORITHM_SNAPPY = 1<<1 /**< Snappy is supported. */
12843
+ } CUmemDecompressAlgorithm;
12844
+
12845
+ /**
12846
+ * \brief Structure describing the parameters that compose a single
12847
+ * decompression operation.
12848
+ */
12849
+ typedef struct CUmemDecompressParams_st {
12850
+ /** The number of bytes to be read and decompressed from
12851
+ * ::CUmemDecompressParams_st.src. */
12852
+ size_t srcNumBytes;
12853
+ /** The number of bytes that the decompression operation will be expected to
12854
+ * write to ::CUmemDecompressParams_st.dst. This value is optional; if
12855
+ * present, it may be used by the CUDA driver as a heuristic for scheduling
12856
+ * the individual decompression operations. */
12857
+ size_t dstNumBytes;
12858
+ /** After the decompression operation has completed, the actual number of
12859
+ * bytes written to ::CUmemDecompressParams.dst will be recorded as a 32-bit
12860
+ * unsigned integer in the memory at this address. */
12861
+ cuuint32_t *dstActBytes;
12862
+ /** Pointer to a buffer of at least ::CUmemDecompressParams_st.srcNumBytes
12863
+ * compressed bytes. */
12864
+ const void *src;
12865
+ /** Pointer to a buffer where the decompressed data will be written. The
12866
+ * number of bytes written to this location will be recorded in the memory
12867
+ * pointed to by ::CUmemDecompressParams_st.dstActBytes */
12868
+ void *dst;
12869
+ /** The decompression algorithm to use. */
12870
+ CUmemDecompressAlgorithm algo;
12871
+ /* These bytes are unused and must be zeroed. This ensures compatibility if
12872
+ * additional fields are added in the future. */
12873
+ unsigned char padding[20];
12874
+ } CUmemDecompressParams;
12875
+
12876
+ /**
12877
+ * \brief Submit a batch of \p count independent decompression operations.
12878
+ *
12879
+ * \details Each of the \p count decompression operations is described by a
12880
+ * single entry in the \p paramsArray array. Once the batch has been
12881
+ * submitted, the function will return, and decompression will happen
12882
+ * asynchronously w.r.t. the CPU. To the work completion tracking
12883
+ * mechanisms in the CUDA driver, the batch will be considered a single
12884
+ * unit of work and processed according to stream semantics, i.e., it
12885
+ * is not possible to query the completion of individual decompression
12886
+ * operations within a batch.
12887
+ *
12888
+ * The memory pointed to by each of ::CUmemDecompressParams.src,
12889
+ * ::CUmemDecompressParams.dst, and ::CUmemDecompressParams.dstActBytes,
12890
+ * must be capable of usage with the hardware decompress feature. That
12891
+ * is, for each of said pointers, the pointer attribute
12892
+ * ::CU_POINTER_ATTRIBUTE_IS_MEM_DECOMPRESS_CAPABLE should give a
12893
+ * non-zero value. To ensure this, the memory backing the pointers
12894
+ * should have been allocated using one of the following CUDA memory
12895
+ * allocators:
12896
+ * * ::cuMemAlloc()
12897
+ * * ::cuMemCreate() with the usage flag ::CU_MEM_CREATE_USAGE_HW_DECOMPRESS
12898
+ * * ::cuMemAllocFromPoolAsync() from a pool that was created with
12899
+ * the usage flag ::CU_MEM_POOL_CREATE_USAGE_HW_DECOMPRESS
12900
+ * Additionally, ::CUmemDecompressParams.src, ::CUmemDecompressParams.dst,
12901
+ * and ::CUmemDecompressParams.dstActBytes, must all be accessible from
12902
+ * the device associated with the context where \p stream was created.
12903
+ * For information on how to ensure this, see the documentation for the
12904
+ * allocator of interest.
12905
+ *
12906
+ * \param[in] paramsArray The array of structures describing the independent
12907
+ * decompression operations.
12908
+ * \param[in] count The number of entries in \p paramsArray array.
12909
+ * \param[in] flags Must be 0.
12910
+ * \param[out] errorIndex The index into \p paramsArray of the decompression
12911
+ * operation to which the error returned by this
12912
+ * function pertains. If \p errorIndex is SIZE_MAX and
12913
+ * the value returned is not ::CUDA_SUCCESS, then the
12914
+ * error returned by this function should be considered
12915
+ * a general error that does not pertain to a
12916
+ * particular decompression operation. May be \p NULL,
12917
+ * in which case, no index will be recorded in the
12918
+ * event of error.
12919
+ * \param[in] stream The stream where the work will be enqueued.
12920
+ *
12921
+ * \return
12922
+ * ::CUDA_SUCCESS,
12923
+ * ::CUDA_ERROR_DEINITIALIZED,
12924
+ * ::CUDA_ERROR_NOT_INITIALIZED,
12925
+ * ::CUDA_ERROR_INVALID_CONTEXT,
12926
+ * ::CUDA_ERROR_INVALID_VALUE,
12927
+ * ::CUDA_ERROR_INVALID_HANDLE
12928
+ * \notefnerr
12929
+ * \note_async
12930
+ * \note_null_stream
12931
+ *
12932
+ * \sa ::cuMemAlloc, ::cuMemPoolCreate, ::cuMemAllocFromPoolAsync
12933
+ */
12934
+ CUresult CUDAAPI cuMemBatchDecompressAsync(
12935
+ CUmemDecompressParams *paramsArray,
12936
+ size_t count,
12937
+ unsigned int flags,
12938
+ size_t *errorIndex,
12939
+ CUstream stream
12940
+ );
12941
+
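A single-op batch sketch, assuming d_comp, d_out and d_actBytes were allocated with one of the decompress-capable allocators listed above, that unified addressing makes the CUdeviceptr-to-pointer casts valid, and that <string.h>, <stdint.h> and <stddef.h> are included:

    CUmemDecompressParams p;
    memset(&p, 0, sizeof(p));          /* also zeroes the reserved padding */
    p.src         = (const void *)(uintptr_t)d_comp;
    p.srcNumBytes = compBytes;
    p.dst         = (void *)(uintptr_t)d_out;
    p.dstNumBytes = outCapacity;       /* optional scheduling hint */
    p.dstActBytes = (cuuint32_t *)(uintptr_t)d_actBytes;
    p.algo        = CU_MEM_DECOMPRESS_ALGORITHM_DEFLATE;

    size_t errIdx = SIZE_MAX;
    CUresult rc = cuMemBatchDecompressAsync(&p, 1, 0, &errIdx, stream);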
11859
12942
  /** @} */ /* END CUDA_MEM */
11860
12943
 
11861
12944
  /**
@@ -11937,17 +13020,23 @@ CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size);
11937
13020
  * set ::CUmemAllocationProp::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
11938
13021
  * ::CUmemAllocationProp::CUmemLocation::id must specify the NUMA ID of the CPU.
11939
13022
  * On systems where NUMA is not available ::CUmemAllocationProp::CUmemLocation::id must be set to 0.
13023
+ * Specifying ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT or ::CU_MEM_LOCATION_TYPE_HOST as the
13024
+ * ::CUmemLocation::type will result in ::CUDA_ERROR_INVALID_VALUE.
13025
+ *
13026
+ * Applications that intend to use ::CU_MEM_HANDLE_TYPE_FABRIC based memory sharing must ensure:
13027
+ * (1) the `nvidia-caps-imex-channels` character device is created by the driver and is listed under /proc/devices, and
13028
+ * (2) at least one IMEX channel file is accessible to the user launching the application.
13029
+ *
13030
+ * When exporter and importer CUDA processes have been granted access to the same IMEX channel, they can securely
13031
+ * share memory.
11940
13032
  *
11941
- * Applications can set ::CUmemAllocationProp::requestedHandleTypes to
11942
- * ::CU_MEM_HANDLE_TYPE_FABRIC in order to create allocations suitable for sharing
11943
- * within an IMEX domain. An IMEX domain is either an OS instance or a group of securely
11944
- * connected OS instances using the NVIDIA IMEX daemon. An IMEX channel is a global resource
11945
- * within the IMEX domain that represents a logical entity that aims to provide fine grained
11946
- * accessibility control for the participating processes. When exporter and importer CUDA processes
11947
- * have been granted access to the same IMEX channel, they can securely share memory.
11948
- * If the allocating process does not have access setup for an IMEX channel, attempting to create
11949
- * a ::CUmemGenericAllocationHandle with ::CU_MEM_HANDLE_TYPE_FABRIC will result in ::CUDA_ERROR_NOT_PERMITTED.
11950
- * The nvidia-modprobe CLI provides more information regarding setting up of IMEX channels.
13033
+ * The IMEX channel security model works on a per-user basis, which means all processes under a user can share
13034
+ * memory if the user has access to a valid IMEX channel. When multi-user isolation is desired, a separate IMEX
13035
+ * channel is required for each user.
13036
+ *
13037
+ * These channel files exist in /dev/nvidia-caps-imex-channels/channel* and can be created using standard OS
13038
+ * native calls like mknod on Linux. For example, to create channel0 with the major number from /proc/devices,
13039
+ * users can execute the following command: `mknod /dev/nvidia-caps-imex-channels/channel0 c <major number> 0`
11951
13040
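Once a channel is accessible, a fabric-shareable allocation can be requested roughly as follows (a sketch assuming `device` holds a valid ::CUdevice; error checks abbreviated):

    CUmemAllocationProp prop = {0};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = device;
    prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;

    size_t granularity = 0;
    cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    CUmemGenericAllocationHandle handle;
    CUresult rc = cuMemCreate(&handle, granularity, &prop, 0);
    /* Expect failure here if the calling process has no IMEX channel access. */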
  *
11952
13041
  * If ::CUmemAllocationProp::allocFlags::usage contains ::CU_MEM_CREATE_USAGE_TILE_POOL flag then
11953
13042
  * the memory allocation is intended only to be used as backing tile pool for sparse CUDA arrays
@@ -12637,25 +13726,31 @@ CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPo
12637
13726
  * Creates a CUDA memory pool and returns the handle in \p pool. The \p poolProps determines
12638
13727
  * the properties of the pool such as the backing device and IPC capabilities.
12639
13728
  *
12640
- * To create a memory pool targeting a specific host NUMA node, applications must
12641
- * set ::CUmemPoolProps::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
12642
- * ::CUmemPoolProps::CUmemLocation::id must specify the NUMA ID of the host memory node.
13729
+ * To create a memory pool targeting a specific host NUMA node, applications must
13730
+ * set ::CUmemPoolProps::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
13731
+ * ::CUmemPoolProps::CUmemLocation::id must specify the NUMA ID of the host memory node.
13732
+ * Specifying ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT or ::CU_MEM_LOCATION_TYPE_HOST as the
13733
+ * ::CUmemPoolProps::CUmemLocation::type will result in ::CUDA_ERROR_INVALID_VALUE.
12643
13734
  * By default, the pool's memory will be accessible from the device it is allocated on.
12644
13735
  * In the case of pools created with ::CU_MEM_LOCATION_TYPE_HOST_NUMA, their default accessibility
12645
13736
  * will be from the host CPU.
12646
13737
  * Applications can control the maximum size of the pool by specifying a non-zero value for ::CUmemPoolProps::maxSize.
12647
13738
  * If set to 0, the maximum size of the pool will default to a system dependent value.
12648
13739
  *
12649
- * Applications can set ::CUmemPoolProps::handleTypes to ::CU_MEM_HANDLE_TYPE_FABRIC
12650
- * in order to create ::CUmemoryPool suitable for sharing within an IMEX domain.
12651
- * An IMEX domain is either an OS instance or a group of securely connected OS instances
12652
- * using the NVIDIA IMEX daemon. An IMEX channel is a global resource within the IMEX domain
12653
- * that represents a logical entity that aims to provide fine grained accessibility control
12654
- * for the participating processes. When exporter and importer CUDA processes have been
12655
- * granted access to the same IMEX channel, they can securely share memory.
12656
- * If the allocating process does not have access setup for an IMEX channel, attempting to export
12657
- * a ::CUmemoryPool with ::CU_MEM_HANDLE_TYPE_FABRIC will result in ::CUDA_ERROR_NOT_PERMITTED.
12658
- * The nvidia-modprobe CLI provides more information regarding setting up of IMEX channels.
13740
+ * Applications that intend to use ::CU_MEM_HANDLE_TYPE_FABRIC based memory sharing must ensure:
13741
+ * (1) `nvidia-caps-imex-channels` character device is created by the driver and is listed under /proc/devices
13742
+ * (2) at least one IMEX channel file is accessible by the user launching the application.
13743
+ *
13744
+ * When exporter and importer CUDA processes have been granted access to the same IMEX channel, they can securely
13745
+ * share memory.
13746
+ *
13747
+ * The IMEX channel security model works on a per-user basis, meaning all processes under a user can share
13748
+ * memory if the user has access to a valid IMEX channel. When multi-user isolation is desired, a separate IMEX
13749
+ * channel is required for each user.
13750
+ *
13751
+ * These channel files exist in /dev/nvidia-caps-imex-channels/channel* and can be created using standard OS
13752
+ * native calls like mknod on Linux. For example: To create channel0 with the major number from /proc/devices
13753
+ * users can execute the following command: `mknod /dev/nvidia-caps-imex-channels/channel0 c <major number> 0`
12659
13754
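The pool-based equivalent looks similar (sketch; `device` is a placeholder):

    CUmemPoolProps props = {0};
    props.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
    props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    props.location.id = device;
    props.handleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
    props.maxSize = 0;  /* 0: system-dependent default maximum */
    CUmemoryPool pool;
    CUresult rc = cuMemPoolCreate(&pool, &props);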
  *
12660
13755
  * \note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC.
12661
13756
  *
@@ -12962,8 +14057,8 @@ CUresult CUDAAPI cuMulticastAddDevice(CUmemGenericAllocationHandle mcHandle, CUd
12962
14057
  * returned by ::cuMulticastGetGranularity with the flag
12963
14058
  * ::CU_MULTICAST_GRANULARITY_RECOMMENDED.
12964
14059
  *
12965
- * The \p size + \p memOffset must be smaller than the size of the allocated
12966
- * memory. Similarly the \p size + \p mcOffset must be smaller than the size
14060
+ * The \p size + \p memOffset cannot be larger than the size of the allocated
14061
+ * memory. Similarly the \p size + \p mcOffset cannot be larger than the size
12967
14062
  * of the multicast object.
12968
14063
  * The memory allocation must have been created on one of the devices
12969
14064
  * that was added to the multicast team via ::cuMulticastAddDevice.
@@ -13010,8 +14105,8 @@ CUresult CUDAAPI cuMulticastBindMem(CUmemGenericAllocationHandle mcHandle, size_
13010
14105
  * aligned to the value returned by ::cuMulticastGetGranularity with the flag
13011
14106
  * ::CU_MULTICAST_GRANULARITY_RECOMMENDED.
13012
14107
  *
13013
- * The \p size must be smaller than the size of the allocated memory.
13014
- * Similarly the \p size + \p mcOffset must be smaller than the total size
14108
+ * The \p size cannot be larger than the size of the allocated memory.
14109
+ * Similarly the \p size + \p mcOffset cannot be larger than the total size
13015
14110
  * of the multicast object.
13016
14111
  * The memory allocation must have been created on one of the devices
13017
14112
  * that was added to the multicast team via ::cuMulticastAddDevice.
@@ -13052,7 +14147,7 @@ CUresult CUDAAPI cuMulticastBindAddr(CUmemGenericAllocationHandle mcHandle, size
13052
14147
  * The intended \p size of the unbind and the offset in the multicast range
13053
14148
  * ( \p mcOffset ) must be a multiple of the value returned by
13054
14149
  * ::cuMulticastGetGranularity with the flag ::CU_MULTICAST_GRANULARITY_MINIMUM.
13055
- * The \p size + \p mcOffset must be smaller than the total size of the
14150
+ * The \p size + \p mcOffset cannot be larger than the total size of the
13056
14151
  * multicast object.
13057
14152
  *
13058
14153
  * \note
@@ -13343,6 +14438,12 @@ CUresult CUDAAPI cuMulticastGetGranularity(size_t *granularity, const CUmulticas
13343
14438
  *
13344
14439
  * Returns in \p *data the handle to the mempool that the allocation was obtained from.
13345
14440
  *
14441
+ * - ::CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE:
14442
+ *
14443
+ * Returns in \p *data a boolean that indicates whether the pointer points
14444
+ * to memory that can be used for hardware-accelerated
14445
+ * decompression.
14446
+ *
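A query sketch for the new attribute (assuming the boolean is returned through an int, as with other boolean pointer attributes; `ptr` is a placeholder ::CUdeviceptr):

    int hwDecompressCapable = 0;
    cuPointerGetAttribute(&hwDecompressCapable,
                          CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE, ptr);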
13346
14447
  * \par
13347
14448
  *
13348
14449
  * Note that for most allocations in the unified virtual address space
@@ -13397,7 +14498,9 @@ CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute
13397
14498
  * base device pointer of the memory to be prefetched and \p dstDevice is the
13398
14499
  * destination device. \p count specifies the number of bytes to copy. \p hStream
13399
14500
  * is the stream in which the operation is enqueued. The memory range must refer
13400
- * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
14501
+ * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables,
14502
+ * or to system-allocated memory on systems with a non-zero
14503
+ * CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
13401
14504
  *
13402
14505
  * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If
13403
14506
  * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
@@ -13957,6 +15060,7 @@ CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute at
13957
15060
  * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE
13958
15061
  * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES
13959
15062
  * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE
15063
+ * - ::CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE
13960
15064
  *
13961
15065
  * \param numAttributes - Number of attributes to query
13962
15066
  * \param attributes - An array of attributes to query
@@ -14027,8 +15131,10 @@ CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_at
14027
15131
  *
14028
15132
  * \sa ::cuStreamDestroy,
14029
15133
  * ::cuStreamCreateWithPriority,
15134
+ * ::cuGreenCtxStreamCreate,
14030
15135
  * ::cuStreamGetPriority,
14031
15136
  * ::cuStreamGetFlags,
15137
+ * ::cuStreamGetDevice,
14032
15138
  * ::cuStreamWaitEvent,
14033
15139
  * ::cuStreamQuery,
14034
15140
  * ::cuStreamSynchronize,
@@ -14078,9 +15184,11 @@ CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags);
14078
15184
  *
14079
15185
  * \sa ::cuStreamDestroy,
14080
15186
  * ::cuStreamCreate,
15187
+ * ::cuGreenCtxStreamCreate,
14081
15188
  * ::cuStreamGetPriority,
14082
15189
  * ::cuCtxGetStreamPriorityRange,
14083
15190
  * ::cuStreamGetFlags,
15191
+ * ::cuStreamGetDevice,
14084
15192
  * ::cuStreamWaitEvent,
14085
15193
  * ::cuStreamQuery,
14086
15194
  * ::cuStreamSynchronize,
@@ -14093,7 +15201,7 @@ CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int fla
14093
15201
  /**
14094
15202
  * \brief Query the priority of a given stream
14095
15203
  *
14096
- * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority
15204
+ * Query the priority of a stream created using ::cuStreamCreate, ::cuStreamCreateWithPriority or ::cuGreenCtxStreamCreate
14097
15205
  * and return the priority in \p priority. Note that if the stream was created with a
14098
15206
  * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
14099
15207
  * this function returns the clamped priority.
@@ -14114,16 +15222,44 @@ CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int fla
14114
15222
  * \sa ::cuStreamDestroy,
14115
15223
  * ::cuStreamCreate,
14116
15224
  * ::cuStreamCreateWithPriority,
15225
+ * ::cuGreenCtxStreamCreate,
14117
15226
  * ::cuCtxGetStreamPriorityRange,
14118
15227
  * ::cuStreamGetFlags,
15228
+ * ::cuStreamGetDevice,
14119
15229
  * ::cudaStreamGetPriority
14120
15230
  */
14121
15231
  CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
14122
15232
 
15233
+ /**
15234
+ * \brief Returns the device handle of the stream
15235
+ *
15236
+ * Returns in \p *device the device handle of the stream.
15237
+ *
15238
+ * \param hStream - Handle to the stream to be queried
15239
+ * \param device - Returns the device to which a stream belongs
15240
+ *
15241
+ * \return
15242
+ * ::CUDA_SUCCESS,
15243
+ * ::CUDA_ERROR_DEINITIALIZED,
15244
+ * ::CUDA_ERROR_NOT_INITIALIZED,
15245
+ * ::CUDA_ERROR_INVALID_CONTEXT,
15246
+ * ::CUDA_ERROR_INVALID_VALUE,
15247
+ * ::CUDA_ERROR_INVALID_HANDLE,
15248
+ * ::CUDA_ERROR_OUT_OF_MEMORY
15249
+ * \notefnerr
15250
+ *
15251
+ * \sa
15252
+ * ::cuStreamDestroy,
15253
+ * ::cuStreamCreate,
15254
+ * ::cuGreenCtxStreamCreate,
15255
+ * ::cuStreamGetFlags
15256
+ */
15257
+ CUresult CUDAAPI cuStreamGetDevice(CUstream hStream, CUdevice *device);
15258
+
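Usage is a single call (sketch; `hStream` is assumed valid):

    CUdevice dev;
    CUresult rc = cuStreamGetDevice(hStream, &dev);
    /* dev now identifies the device the stream belongs to. */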
14123
15259
  /**
14124
15260
  * \brief Query the flags of a given stream
14125
15261
  *
14126
- * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority
15262
+ * Query the flags of a stream created using ::cuStreamCreate, ::cuStreamCreateWithPriority or ::cuGreenCtxStreamCreate
14127
15263
  * and return the flags in \p flags.
14128
15264
  *
14129
15265
  * \param hStream - Handle to the stream to be queried
@@ -14143,8 +15279,10 @@ CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
14143
15279
  *
14144
15280
  * \sa ::cuStreamDestroy,
14145
15281
  * ::cuStreamCreate,
15282
+ * ::cuGreenCtxStreamCreate,
14146
15283
  * ::cuStreamGetPriority,
14147
15284
  * ::cudaStreamGetFlags,
15285
+ * ::cuStreamGetDevice
14148
15286
  */
14149
15287
  CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags);
14150
15288
 
@@ -14186,6 +15324,10 @@ CUresult CUDAAPI cuStreamGetId(CUstream hStream, unsigned long long *streamId);
14186
15324
  *
14187
15325
  * Returns the CUDA context that the stream is associated with.
14188
15326
  *
15327
+ * Note there is a later version of this API, ::cuStreamGetCtx_v2. It will
15328
+ * supplant this version in CUDA 13.0. It is recommended to use ::cuStreamGetCtx_v2
15329
+ * until then, as this version will return ::CUDA_ERROR_NOT_SUPPORTED for streams created via ::cuGreenCtxStreamCreate.
15330
+ *
14189
15331
  * The stream handle \p hStream can refer to any of the following:
14190
15332
  * <ul>
14191
15333
  * <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
@@ -14210,21 +15352,82 @@ CUresult CUDAAPI cuStreamGetId(CUstream hStream, unsigned long long *streamId);
14210
15352
  * ::CUDA_ERROR_NOT_INITIALIZED,
14211
15353
  * ::CUDA_ERROR_INVALID_CONTEXT,
14212
15354
  * ::CUDA_ERROR_INVALID_HANDLE,
15355
+ * ::CUDA_ERROR_NOT_SUPPORTED
14213
15356
  * \notefnerr
14214
15357
  *
14215
15358
  * \sa ::cuStreamDestroy,
14216
15359
  * ::cuStreamCreateWithPriority,
14217
15360
  * ::cuStreamGetPriority,
14218
15361
  * ::cuStreamGetFlags,
15362
+ * ::cuStreamGetDevice,
14219
15363
  * ::cuStreamWaitEvent,
14220
15364
  * ::cuStreamQuery,
14221
15365
  * ::cuStreamSynchronize,
14222
15366
  * ::cuStreamAddCallback,
14223
15367
  * ::cudaStreamCreate,
15368
+ * ::cuStreamGetCtx_v2,
14224
15369
  * ::cudaStreamCreateWithFlags
14225
15370
  */
14226
15371
  CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
14227
15372
 
15373
+ /**
15374
+ * \brief Query the contexts associated with a stream
15375
+ *
15376
+ * Returns the contexts that the stream is associated with.
15377
+ *
15378
+ * If the stream is associated with a green context, the API returns the green context in \p pGreenCtx
15379
+ * and the primary context of the associated device in \p pCtx.
15380
+ *
15381
+ * If the stream is associated with a regular context, the API returns the regular context in \p pCtx
15382
+ * and NULL in \p pGreenCtx.
15383
+ *
15384
+ * The stream handle \p hStream can refer to any of the following:
15385
+ * <ul>
15386
+ * <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate,
15387
+ * ::cuStreamCreateWithPriority and ::cuGreenCtxStreamCreate, or their runtime API equivalents such as
15388
+ * ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
15389
+ * Passing an invalid handle will result in undefined behavior.</li>
15390
+ * <li>any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
15391
+ * ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
15392
+ * which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.
15393
+ * If any of the special handles are specified, the API will operate on the context current to the
15394
+ * calling thread. If a green context (that was converted via ::cuCtxFromGreenCtx() before setting it current)
15395
+ * is current to the calling thread, the API will return the green context in \p pGreenCtx
15396
+ * and the primary context of the associated device in \p pCtx. If a regular context is current,
15397
+ * the API returns the regular context in \p pCtx and NULL in \p pGreenCtx.
15398
+ * Note that specifying ::CU_STREAM_PER_THREAD or ::cudaStreamPerThread will return ::CUDA_ERROR_INVALID_HANDLE
15399
+ * if a green context is current to the calling thread.
15400
+ * If no context is current to the calling thread, ::CUDA_ERROR_INVALID_CONTEXT is returned.</li>
15401
+ * </ul>
15402
+ *
15403
+ * \param hStream - Handle to the stream to be queried
15404
+ * \param pCtx - Returned regular context associated with the stream
15405
+ * \param pGreenCtx - Returned green context if the stream is associated with a green context or NULL if not
15406
+ *
15407
+ * \return
15408
+ * ::CUDA_SUCCESS,
15409
+ * ::CUDA_ERROR_DEINITIALIZED,
15410
+ * ::CUDA_ERROR_NOT_INITIALIZED,
15411
+ * ::CUDA_ERROR_INVALID_CONTEXT,
15412
+ * ::CUDA_ERROR_INVALID_HANDLE
15413
+ * \notefnerr
15414
+ *
15415
+ * \sa ::cuStreamDestroy,
15416
+ * ::cuStreamCreate,
15417
+ * ::cuStreamCreateWithPriority,
15418
+ * ::cuGreenCtxStreamCreate,
15419
+ * ::cuStreamGetPriority,
15420
+ * ::cuStreamGetFlags,
15421
+ * ::cuStreamGetDevice,
15422
+ * ::cuStreamWaitEvent,
15423
+ * ::cuStreamQuery,
15424
+ * ::cuStreamSynchronize,
15425
+ * ::cuStreamAddCallback,
15426
+ * ::cudaStreamCreate,
15427
+ * ::cudaStreamCreateWithFlags
15428
+ */
15429
+ CUresult CUDAAPI cuStreamGetCtx_v2(CUstream hStream, CUcontext *pCtx, CUgreenCtx *pGreenCtx);
15430
+
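A sketch of telling green-context streams apart from regular ones with this query:

    CUcontext ctx = NULL;
    CUgreenCtx greenCtx = NULL;
    if (cuStreamGetCtx_v2(hStream, &ctx, &greenCtx) == CUDA_SUCCESS) {
        if (greenCtx != NULL) {
            /* Stream belongs to a green context; ctx is the primary
             * context of the associated device. */
        } else {
            /* Regular stream; ctx is its context. */
        }
    }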
14228
15431
  /**
14229
15432
  * \brief Make a compute stream wait on an event
14230
15433
  *
@@ -14545,6 +15748,7 @@ CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph);
14545
15748
  */
14546
15749
  CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus);
14547
15750
 
15751
+
14548
15752
  /**
14549
15753
  * \brief Query a stream's capture state
14550
15754
  *
@@ -15031,7 +16235,8 @@ CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags);
15031
16235
  * \brief Records an event
15032
16236
  *
15033
16237
  * Captures in \p hEvent the contents of \p hStream at the time of this call.
15034
- * \p hEvent and \p hStream must be from the same context.
16238
+ * \p hEvent and \p hStream must be from the same context otherwise
16239
+ * ::CUDA_ERROR_INVALID_HANDLE is returned.
15035
16240
  * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
15036
16241
  * examine or wait for completion of the work that was captured. Uses of
15037
16242
  * \p hStream after this call do not modify \p hEvent. See note on default
@@ -15073,7 +16278,8 @@ CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream);
15073
16278
  * \brief Records an event
15074
16279
  *
15075
16280
  * Captures in \p hEvent the contents of \p hStream at the time of this call.
15076
- * \p hEvent and \p hStream must be from the same context.
16281
+ * \p hEvent and \p hStream must be from the same context otherwise
16282
+ * ::CUDA_ERROR_INVALID_HANDLE is returned.
15077
16283
  * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
15078
16284
  * examine or wait for completion of the work that was captured. Uses of
15079
16285
  * \p hStream after this call do not modify \p hEvent. See note on default
@@ -15231,6 +16437,9 @@ CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
15231
16437
  * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with
15232
16438
  * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return
15233
16439
  * ::CUDA_ERROR_INVALID_HANDLE.
16440
+ *
16441
+ * Note there is a later version of this API, ::cuEventElapsedTime_v2. It will
16442
+ * supplant this version in CUDA 13.0; this version is retained only for minor version compatibility.
15234
16443
  *
15235
16444
  * \param pMilliseconds - Time between \p hStart and \p hEnd in ms
15236
16445
  * \param hStart - Starting event
@@ -15255,6 +16464,54 @@ CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
15255
16464
  */
15256
16465
  CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
15257
16466
 
16467
+ /**
16468
+ * \brief Computes the elapsed time between two events
16469
+ *
16470
+ * Computes the elapsed time between two events (in milliseconds with a
16471
+ * resolution of around 0.5 microseconds). Note this API is not guaranteed
16472
+ * to return the latest errors for pending work. As such, this API is intended to
16473
+ * serve as an elapsed-time calculation only; any polling for completion on the
16474
+ * events to be compared should be done with ::cuEventQuery instead.
16475
+ *
16476
+ * If either event was last recorded in a non-NULL stream, the resulting time
16477
+ * may be greater than expected (even if both used the same stream handle). This
16478
+ * happens because the ::cuEventRecord() operation takes place asynchronously
16479
+ * and there is no guarantee that the measured latency is actually just between
16480
+ * the two events. Any number of other different stream operations could execute
16481
+ * in between the two measured events, thus altering the timing in a significant
16482
+ * way.
16483
+ *
16484
+ * If ::cuEventRecord() has not been called on either event then
16485
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called
16486
+ * on both events but one or both of them has not yet been completed (that is,
16487
+ * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the
16488
+ * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with
16489
+ * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return
16490
+ * ::CUDA_ERROR_INVALID_HANDLE.
16491
+ *
16492
+ * \param pMilliseconds - Time between \p hStart and \p hEnd in ms
16493
+ * \param hStart - Starting event
16494
+ * \param hEnd - Ending event
16495
+ *
16496
+ * \return
16497
+ * ::CUDA_SUCCESS,
16498
+ * ::CUDA_ERROR_DEINITIALIZED,
16499
+ * ::CUDA_ERROR_NOT_INITIALIZED,
16500
+ * ::CUDA_ERROR_INVALID_CONTEXT,
16501
+ * ::CUDA_ERROR_INVALID_HANDLE,
16502
+ * ::CUDA_ERROR_NOT_READY,
16503
+ * ::CUDA_ERROR_UNKNOWN
16504
+ * \notefnerr
16505
+ *
16506
+ * \sa ::cuEventCreate,
16507
+ * ::cuEventRecord,
16508
+ * ::cuEventQuery,
16509
+ * ::cuEventSynchronize,
16510
+ * ::cuEventDestroy,
16511
+ * ::cudaEventElapsedTime
16512
+ */
16513
+ CUresult CUDAAPI cuEventElapsedTime_v2(float *pMilliseconds, CUevent hStart, CUevent hEnd);
16514
+
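A timing sketch using the _v2 entry point (work enqueue elided; per the note above, completion is polled with ::cuEventSynchronize / ::cuEventQuery, not with the elapsed-time call itself):

    CUevent start, stop;
    cuEventCreate(&start, CU_EVENT_DEFAULT);
    cuEventCreate(&stop, CU_EVENT_DEFAULT);
    cuEventRecord(start, hStream);
    /* ... enqueue the work to be timed on hStream ... */
    cuEventRecord(stop, hStream);
    cuEventSynchronize(stop);
    float ms = 0.0f;
    cuEventElapsedTime_v2(&ms, start, stop);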
15258
16515
  /** @} */ /* END CUDA_EVENT */
15259
16516
 
15260
16517
  /**
@@ -15308,7 +16565,7 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven
15308
16565
  CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5,
15309
16566
  CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6,
15310
16567
  CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7,
15311
- CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8
16568
+ CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8,
15312
16569
  } CUexternalMemoryHandleType;
15313
16570
  * \endcode
15314
16571
  *
@@ -15522,6 +16779,7 @@ CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternal
15522
16779
  * If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then
15523
16780
  * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1.
15524
16781
  *
16782
+ *
15525
16783
  * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy.
15526
16784
  *
15527
16785
  * \param mipmap - Returned CUDA mipmapped array
@@ -16280,6 +17538,9 @@ CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunc
16280
17538
  * positive. The validity of the cluster dimensions is checked at launch time.
16281
17539
  * If the value is set during compile time, it cannot be set at runtime.
16282
17540
  * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
17541
+ * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
17542
+ * the function can be launched with non-portable cluster size. 1 is allowed,
17543
+ * 0 is disallowed.
16283
17544
  * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
16284
17545
  * scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
16285
17546
  *
@@ -16679,6 +17940,7 @@ CUresult CUDAAPI cuLaunchKernel(CUfunction f,
16679
17940
  * CU_LAUNCH_ATTRIBUTE_PRIORITY = 8,
16680
17941
  * CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP = 9,
16681
17942
  * CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN = 10,
17943
+ * CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION = 11,
16682
17944
  * CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT = 12,
16683
17945
  * CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE = 13,
16684
17946
  * } CUlaunchAttributeID;
@@ -16706,6 +17968,11 @@ CUresult CUDAAPI cuLaunchKernel(CUfunction f,
16706
17968
  * CUlaunchMemSyncDomainMap memSyncDomainMap;
16707
17969
  * CUlaunchMemSyncDomain memSyncDomain;
16708
17970
  * struct {
17971
+ * unsigned int x;
17972
+ * unsigned int y;
17973
+ * unsigned int z;
17974
+ * } preferredClusterDim;
17975
+ * struct {
16709
17976
  * CUevent event;
16710
17977
  * int flags;
16711
17978
  * } launchCompletionEvent;
@@ -16776,6 +18043,36 @@ CUresult CUDAAPI cuLaunchKernel(CUfunction f,
16776
18043
  * opt out, and any attempt to set the attribute to 0 will result in an error. Graphs
16777
18044
  * containing one or more device-updatable node also do not allow multiple instantiation.
16778
18045
  *
18046
+ * ::CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION allows the kernel launch to
18047
+ * specify a preferred substitute cluster dimension. Blocks may be grouped
18048
+ * according to either the dimensions specified with this attribute (grouped
18049
+ * into a "preferred substitute cluster"), or the one specified with
18050
+ * ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION attribute (grouped into a "regular
18051
+ * cluster"). The cluster dimensions of a "preferred substitute cluster" shall
18052
+ * be an integer multiple greater than zero of the regular cluster dimensions.
18053
+ * The device will attempt - on a best-effort basis - to group thread blocks
18054
+ * into preferred clusters over grouping them into regular clusters. When it
18055
+ * deems necessary (primarily when the device temporarily runs out of physical
18056
+ * resources to launch the larger preferred clusters), the device may switch to
18057
+ * launch the regular clusters instead to attempt to utilize as much of the
18058
+ * physical device resources as possible.
18059
+ *
18060
+ * Each type of cluster will have its enumeration / coordinate setup as if the
18061
+ * grid consists solely of its type of cluster. For example, if the preferred
18062
+ * substitute cluster dimensions double the regular cluster dimensions, there
18063
+ * might be simultaneously a regular cluster indexed at (1,0,0), and a preferred
18064
+ * cluster indexed at (1,0,0). In this example, the preferred substitute cluster
18065
+ * (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their
18066
+ * blocks.
18067
+ *
18068
+ * This attribute will only take effect when a regular cluster dimension has
18069
+ * been specified. The preferred substitute cluster
18070
+ * dimension must be an integer multiple greater than zero of the regular
18071
+ * cluster dimension and must divide the grid. It must also be no more than
18072
+ * `maxBlocksPerCluster`, if that is set in the kernel's `__launch_bounds__`;
18073
+ * otherwise it must be less than the maximum value the driver can support.
18074
+ * Beyond these limits, setting this attribute to a value physically unable to fit on any
18075
+ * particular device is permitted.
16779
18076
  *
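A launch sketch combining the two cluster attributes (the kernel handle, dimensions, and stream are placeholders; the preferred dimension is an integer multiple of the regular one and divides the grid):

    CUlaunchAttribute attrs[2];
    attrs[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
    attrs[0].value.clusterDim.x = 2;
    attrs[0].value.clusterDim.y = 1;
    attrs[0].value.clusterDim.z = 1;
    attrs[1].id = CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION;
    attrs[1].value.preferredClusterDim.x = 4;  /* 2x the regular dimension */
    attrs[1].value.preferredClusterDim.y = 1;
    attrs[1].value.preferredClusterDim.z = 1;

    CUlaunchConfig cfg = {0};
    cfg.gridDimX = 16;   cfg.gridDimY = 1;  cfg.gridDimZ = 1;  /* divisible by 4 */
    cfg.blockDimX = 128; cfg.blockDimY = 1; cfg.blockDimZ = 1;
    cfg.hStream = hStream;
    cfg.attrs = attrs;
    cfg.numAttrs = 2;
    CUresult rc = cuLaunchKernelEx(&cfg, kernel, NULL, NULL);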
16780
18077
  * The effect of other attributes is consistent with their effect when set via
16781
18078
  * persistent APIs.
@@ -16844,12 +18141,6 @@ CUresult CUDAAPI cuLaunchKernelEx(const CUlaunchConfig *config,
16844
18141
  * grid of blocks. Each block contains \p blockDimX x \p blockDimY x
16845
18142
  * \p blockDimZ threads.
16846
18143
  *
16847
- * Note that the API can also be used to launch context-less kernel ::CUkernel
16848
- * by querying the handle using ::cuLibraryGetKernel() and then passing it
16849
- * to the API by casting to ::CUfunction. Here, the context to launch
16850
- * the kernel on will either be taken from the specified stream \p hStream
16851
- * or the current context in case of NULL stream.
16852
- *
16853
18144
  * \p sharedMemBytes sets the amount of dynamic shared memory that will be
16854
18145
  * available to each thread block.
16855
18146
  *
@@ -19826,18 +21117,22 @@ CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphN
19826
21117
  * contained \p memsetParams at instantiation. hNode must remain in the graph which was
19827
21118
  * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored.
19828
21119
  *
19829
- * The destination memory in \p memsetParams must be allocated from the same
19830
- * contexts as the original destination memory. Both the instantiation-time
19831
- * memory operand and the memory operand in \p memsetParams must be 1-dimensional.
19832
- * Zero-length operations are not supported.
21120
+ * Zero-sized operations are not supported.
21121
+ *
21122
+ * The new destination pointer in \p memsetParams must point to the same kind of allocation
21123
+ * as the original destination pointer and have the same context association and device mapping
21124
+ * as the original destination pointer.
21125
+ *
21126
+ * Both the value and pointer address may be updated.
21127
+ * Changing other aspects of the memset (width, height, element size or pitch) may cause the update to be rejected.
21128
+ * Specifically, for 2d memsets, all dimension changes are rejected.
21129
+ * For 1d memsets, changes in height are explicitly rejected, and other changes are opportunistically allowed
21130
+ * if the resulting work maps onto the work resources already allocated for the node.
19833
21131
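A sketch of the kind of update that remains valid (`nodeParams` is a copy of the parameters the node was created with; `newDst` and `ctx` are placeholders):

    CUDA_MEMSET_NODE_PARAMS p = nodeParams;
    p.dst   = newDst;   /* same allocation kind, context association, mapping */
    p.value = 0xFF;     /* value updates are always permitted */
    CUresult rc = cuGraphExecMemsetNodeSetParams(hGraphExec, hNode, &p, ctx);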
  *
19834
21132
  * The modifications only affect future launches of \p hGraphExec. Already enqueued
19835
21133
  * or running launches of \p hGraphExec are not affected by this call. hNode is also
19836
21134
  * not modified by this call.
19837
21135
  *
19838
- * Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or
19839
- * either the original or new memory operand are multidimensional.
19840
- *
19841
21136
  * \param hGraphExec - The executable graph in which to set the specified node
19842
21137
  * \param hNode - Memset node from the graph which was used to instantiate graphExec
19843
21138
  * \param memsetParams - The updated parameters to set
@@ -20319,7 +21614,9 @@ CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph);
20319
21614
  * - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
20320
21615
  * - The source/destination memory must be allocated from the same contexts as the original
20321
21616
  * source/destination memory.
20322
- * - Only 1D memsets can be changed.
21617
+ * - For 2d memsets, only the address and assigned value may be updated.
21618
+ * - For 1d memsets, updating dimensions is also allowed, but may fail if the resulting operation doesn't
21619
+ * map onto the work resources already allocated for the node.
20323
21620
  * - Additional memcpy node restrictions:
20324
21621
  * - Changing either the source or destination memory type(i.e. CU_MEMORYTYPE_DEVICE,
20325
21622
  * CU_MEMORYTYPE_ARRAY, etc.) is not supported.
@@ -20776,6 +22073,7 @@ CUresult CUDAAPI cuGraphExecNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hN
20776
22073
  * \param hGraph - Graph which will contain the conditional node using this handle.
20777
22074
  * \param ctx - Context for the handle and associated conditional node.
20778
22075
  * \param defaultLaunchValue - Optional initial value for the conditional variable.
22076
+ * Applied at the beginning of each graph execution if CU_GRAPH_COND_ASSIGN_DEFAULT is set in \p flags.
20779
22077
  * \param flags - Currently must be CU_GRAPH_COND_ASSIGN_DEFAULT or 0.
20780
22078
  *
20781
22079
  * \return
@@ -20810,6 +22108,11 @@ CUresult CUDAAPI cuGraphConditionalHandleCreate(CUgraphConditionalHandle *pHandl
20810
22108
  * Returns in \p *numBlocks the number of the maximum active blocks per
20811
22109
  * streaming multiprocessor.
20812
22110
  *
22111
+ * Note that the API can also be used with a context-less kernel ::CUkernel
22112
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
22113
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
22114
+ * will be the current context.
22115
+ *
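A sketch of the context-less path (assumes `library` was loaded via the library management APIs and exports a kernel named "myKernel"):

    CUkernel k;
    cuLibraryGetKernel(&k, library, "myKernel");
    int numBlocks = 0;
    cuOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, (CUfunction)k,
                                                256 /* blockSize */,
                                                0   /* dynamicSMemSize */);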
20813
22116
  * \param numBlocks - Returned occupancy
20814
22117
  * \param func - Kernel for which occupancy is calculated
20815
22118
  * \param blockSize - Block size the kernel is intended to be launched with
@@ -20851,6 +22154,11 @@ CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUf
20851
22154
  * can be found about this feature in the "Unified L1/Texture Cache"
20852
22155
  * section of the Maxwell tuning guide.
20853
22156
  *
22157
+ * Note that the API can also be used with a context-less kernel ::CUkernel
22158
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
22159
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
22160
+ * will be the current context.
22161
+ *
20854
22162
  * \param numBlocks - Returned occupancy
20855
22163
  * \param func - Kernel for which occupancy is calculated
20856
22164
  * \param blockSize - Block size the kernel is intended to be launched with
@@ -20902,6 +22210,11 @@ CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBl
20902
22210
  * size_t blockToSmem(int blockSize);
20903
22211
  * \endcode
20904
22212
  *
22213
+ * Note that the API can also be used with a context-less kernel ::CUkernel
22214
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
22215
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
22216
+ * will be the current context.
22217
+ *
20905
22218
  * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
20906
22219
  * \param blockSize - Returned maximum block size that can achieve the maximum occupancy
20907
22220
  * \param func - Kernel for which launch configuration is calculated
@@ -20947,6 +22260,11 @@ CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSi
20947
22260
  * can be found about this feature in the "Unified L1/Texture Cache"
20948
22261
  * section of the Maxwell tuning guide.
20949
22262
  *
22263
+ * Note that the API can also be used with a context-less kernel ::CUkernel
22264
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
22265
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
22266
+ * will be the current context.
22267
+ *
20950
22268
  * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
20951
22269
  * \param blockSize - Returned maximum block size that can achieve the maximum occupancy
20952
22270
  * \param func - Kernel for which launch configuration is calculated
@@ -20974,6 +22292,11 @@ CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int
20974
22292
  *
20975
22293
  * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM.
20976
22294
  *
22295
+ * Note that the API can also be used with a context-less kernel ::CUkernel
22296
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
22297
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
22298
+ * will be the current context.
22299
+ *
20977
22300
  * \param dynamicSmemSize - Returned maximum dynamic shared memory
20978
22301
  * \param func - Kernel function for which occupancy is calculated
20979
22302
  * \param numBlocks - Number of blocks to fit on SM
@@ -21004,6 +22327,12 @@ CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize
21004
22327
  *
21005
22328
  * This function will respect the compile time launch bounds.
21006
22329
  *
22330
+ * Note that the API can also be used with a context-less kernel ::CUkernel
22331
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
22332
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
22333
+ * will either be taken from the specified stream \p config->hStream
22334
+ * or the current context in case of NULL stream.
22335
+ *
21007
22336
  * \param clusterSize - Returned maximum cluster size that can be launched
21008
22337
  * for the given kernel function and launch configuration
21009
22338
  * \param func - Kernel function for which maximum cluster
@@ -21040,6 +22369,12 @@ CUresult CUDAAPI cuOccupancyMaxPotentialClusterSize(int *clusterSize, CUfunction
21040
22369
  * calculation. Runtime environment may affect how the hardware schedules
21041
22370
  * the clusters, so the calculated occupancy is not guaranteed to be achievable.
21042
22371
  *
22372
+ * Note that the API can also be used with a context-less kernel ::CUkernel
22373
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
22374
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
22375
+ * will either be taken from the specified stream \p config->hStream
22376
+ * or the current context in case of NULL stream.
22377
+ *
21043
22378
  * \param numClusters - Returned maximum number of clusters that
21044
22379
  * could co-exist on the target device
21045
22380
  * \param func - Kernel function for which maximum number
@@ -22004,7 +23339,8 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref
22004
23339
  * \p pResViewDesc is an optional argument that specifies an alternate format for
22005
23340
  * the data described by \p pResDesc, and also describes the subresource region
22006
23341
  * to restrict access to when texturing. \p pResViewDesc can only be specified if
22007
- * the type of resource is a CUDA array or a CUDA mipmapped array.
23342
+ * the type of resource is a CUDA array or a CUDA mipmapped array that is not in a block-
23343
+ * compressed format.
22008
23344
  *
22009
23345
  * Texture objects are only supported on devices of compute capability 3.0 or higher.
22010
23346
  * Additionally, a texture object is an opaque value, and, as such, should only be
@@ -22412,7 +23748,7 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
22412
23748
  *
22413
23749
  * Tensor map objects are only supported on devices of compute capability 9.0 or higher.
22414
23750
  * Additionally, a tensor map object is an opaque value, and, as such, should only be
22415
- * accessed through CUDA API calls.
23751
+ * accessed through CUDA APIs and PTX.
22416
23752
  *
22417
23753
  * The parameters passed are bound to the following requirements:
22418
23754
  *
@@ -22433,21 +23769,33 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
22433
23769
  CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, // 2 bytes
22434
23770
  CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ, // 4 bytes
22435
23771
  CU_TENSOR_MAP_DATA_TYPE_TFLOAT32, // 4 bytes
22436
- CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ // 4 bytes
23772
+ CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ, // 4 bytes
23773
+ CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, // 4 bits
23774
+ CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, // 4 bits
23775
+ CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B // 6 bits
22437
23776
  } CUtensorMapDataType;
22438
23777
  * \endcode
23778
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned to 8 bytes. There are no gaps between packed values.
23779
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned to 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values.
23780
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned to 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values.
22439
23781
  *
22440
23782
  * - \p tensorRank must be non-zero and less than or equal to the maximum supported dimensionality of 5. If \p interleave is not
22441
23783
  * ::CU_TENSOR_MAP_INTERLEAVE_NONE, then \p tensorRank must additionally be greater than or equal to 3.
22442
23784
  *
22443
- * - \p globalAddress, which specifies the starting address of the memory region described, must be 32 byte aligned when \p interleave is
22444
- * ::CU_TENSOR_MAP_INTERLEAVE_32B and 16 byte aligned otherwise.
23785
+ * - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met:
23786
+ * - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned.
23787
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned.
22445
23788
  *
22446
- * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
22447
- * equal to 2^32.
23789
+ * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
23790
+ * equal to 2^32. Additionally, the following requirements need to be met for the packed data types:
23791
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128.
23792
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2.
23793
+ * - The dimensions specified for the packed data types must reflect the number of individual U# values.
22448
23794
  *
22449
23795
  * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
22450
- * multiple of 16 and less than 2^40. Additionally, the stride must be a multiple of 32 when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B.
23796
+ * multiple of 16 and less than 2^40. Additionally, the following requirements need to be met:
23797
+ * - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32.
23798
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32.
22451
23799
  * Each following dimension specified includes previous dimension stride:
22452
23800
  * \code
22453
23801
  globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
@@ -22457,9 +23805,9 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
22457
23805
  * \endcode
22458
23806
  *
22459
23807
  * - \p boxDim array, which specifies number of elements to be traversed along each of the \p tensorRank dimensions, must be non-zero
22460
- * and less than or equal to 256.
22461
- * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, { \p boxDim[0] * elementSizeInBytes( \p tensorDataType ) } must be a multiple
22462
- * of 16 bytes.
23808
+ * and less than or equal to 256. Additionally, the following requirements need to be met:
23809
+ * - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, { \p boxDim[0] * elementSizeInBytes( \p tensorDataType ) } must be a multiple of 16 bytes.
23810
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, boxDim[0] must be 128.
22463
23811
  *
22464
23812
  * - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less
22465
23813
  * than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since
@@ -22480,17 +23828,21 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
22480
23828
  * uses 32 bytes.
22481
23829
  * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE and \p swizzle is not ::CU_TENSOR_MAP_SWIZZLE_NONE, the bounding box inner dimension
22482
23830
  * (computed as \p boxDim[0] multiplied by element size derived from \p tensorDataType) must be less than or equal to the swizzle size.
22483
- * - CU_TENSOR_MAP_SWIZZLE_32B implies the bounding box inner dimension will be <= 32.
22484
- * - CU_TENSOR_MAP_SWIZZLE_64B implies the bounding box inner dimension will be <= 64.
22485
- * - CU_TENSOR_MAP_SWIZZLE_128B implies the bounding box inner dimension will be <= 128.
23831
+ * - CU_TENSOR_MAP_SWIZZLE_32B requires the bounding box inner dimension to be <= 32.
23832
+ * - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64.
23833
+ * - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128.
23834
+ * Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE.
22486
23835
  *
22487
23836
  * - \p swizzle, which specifies the shared memory bank swizzling pattern, has to be of type ::CUtensorMapSwizzle which is defined as:
22488
23837
  * \code
22489
23838
  typedef enum CUtensorMapSwizzle_enum {
22490
23839
  CU_TENSOR_MAP_SWIZZLE_NONE = 0,
22491
- CU_TENSOR_MAP_SWIZZLE_32B,
22492
- CU_TENSOR_MAP_SWIZZLE_64B,
22493
- CU_TENSOR_MAP_SWIZZLE_128B
23840
+ CU_TENSOR_MAP_SWIZZLE_32B, // Swizzle 16B chunks within 32B span
23841
+ CU_TENSOR_MAP_SWIZZLE_64B, // Swizzle 16B chunks within 64B span
23842
+ CU_TENSOR_MAP_SWIZZLE_128B, // Swizzle 16B chunks within 128B span
23843
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B, // Swizzle 32B chunks within 128B span
23844
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B, // Swizzle 32B chunks within 128B span, additionally swap lower 8B with upper 8B within each 16B for every alternate row
23845
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B // Swizzle 64B chunks within 128B span
22494
23846
  } CUtensorMapSwizzle;
22495
23847
  * \endcode
22496
23848
  * Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
@@ -22498,6 +23850,15 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
22498
23850
  * problem, data can be loaded to shared memory with shuffling across shared memory banks.
22499
23851
  * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p swizzle must be ::CU_TENSOR_MAP_SWIZZLE_32B.
22500
23852
  * Other interleave modes can have any swizzling pattern.
23853
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported:
23854
+ * - CU_TENSOR_MAP_SWIZZLE_NONE (Load & Store)
23855
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
23856
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
23857
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B (Store only)
23858
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported:
23859
+ * - CU_TENSOR_MAP_SWIZZLE_NONE (Load only)
23860
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
23861
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
22501
23862
  *
22502
23863
  * - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
22503
23864
  * type ::CUtensorMapL2promotion, which is defined as:
@@ -22518,7 +23879,8 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
22518
23879
  CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
22519
23880
  } CUtensorMapFloatOOBfill;
22520
23881
  * \endcode
22521
- * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type.
23882
+ * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
23883
+ * and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, or ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.
22522
23884
  *
22523
23885
  * \param tensorMap - Tensor map object to create
22524
23886
  * \param tensorDataType - Tensor data type
@@ -22542,11 +23904,11 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
22542
23904
  *
22543
23905
  * \sa
22544
23906
  * ::cuTensorMapEncodeIm2col,
23907
+ * ::cuTensorMapEncodeIm2colWide,
22545
23908
  * ::cuTensorMapReplaceAddress
22546
23909
  */
22547
23910
  CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const cuuint32_t *boxDim, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
22548
23911
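A sketch encoding a 2D tiled descriptor for a row-major fp16 matrix (`base`, `rows`, and `cols` are placeholders; `base` is 16-byte aligned and the row pitch is a multiple of 16 bytes, per the rules above):

    CUtensorMap map;
    cuuint64_t globalDim[2]     = { cols, rows };
    cuuint64_t globalStrides[1] = { cols * sizeof(unsigned short) };
    cuuint32_t boxDim[2]        = { 64, 64 };  /* 64 * 2B = 128B <= swizzle span */
    cuuint32_t elemStrides[2]   = { 1, 1 };
    CUresult rc = cuTensorMapEncodeTiled(&map, CU_TENSOR_MAP_DATA_TYPE_FLOAT16,
            2, base, globalDim, globalStrides, boxDim, elemStrides,
            CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_128B,
            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);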
 
22549
-
22550
23912
  /**
22551
23913
  * \brief Create a tensor map descriptor object representing im2col memory region
22552
23914
  *
@@ -22555,7 +23917,7 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
22555
23917
  *
22556
23918
  * Tensor map objects are only supported on devices of compute capability 9.0 or higher.
22557
23919
  * Additionally, a tensor map object is an opaque value, and, as such, should only be
22558
- * accessed through CUDA API calls.
23920
+ * accessed through CUDA APIs and PTX.
22559
23921
  *
22560
23922
  * The parameters passed are bound to the following requirements:
22561
23923
  *
@@ -22577,19 +23939,31 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
22577
23939
  CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ, // 4 bytes
22578
23940
  CU_TENSOR_MAP_DATA_TYPE_TFLOAT32, // 4 bytes
22579
23941
  CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ, // 4 bytes
23942
+ CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, // 4 bits
23943
+ CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, // 4 bits
23944
+ CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B // 6 bits
22580
23945
  } CUtensorMapDataType;
22581
23946
  * \endcode
23947
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned to 8 bytes. There are no gaps between packed values.
23948
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned to 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values.
23949
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned to 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values.
22582
23950
  *
22583
23951
  * - \p tensorRank, which specifies the number of tensor dimensions, must be 3, 4, or 5.
22584
23952
  *
22585
- * - \p globalAddress, which specifies the starting address of the memory region described, must be 32 byte aligned when \p interleave is
22586
- * ::CU_TENSOR_MAP_INTERLEAVE_32B and 16 byte aligned otherwise.
23953
+ * - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met:
23954
+ * - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned.
23955
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned.
22587
23956
  *
22588
23957
  * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
22589
- * equal to 2^32.
23958
+ * equal to 2^32. Additionally, the following requirements need to be met for the packed data types:
23959
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128.
23960
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2.
23961
+ * - The dimensions specified for the packed data types must reflect the number of individual U# values.
22590
23962
  *
22591
23963
  * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
22592
- * multiple of 16 and less than 2^40. Additionally, the stride must be a multiple of 32 when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B.
23964
+ * multiple of 16 and less than 2^40. Additionally, the following requirements need to be met:
23965
+ * - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32.
23966
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32.
22593
23967
  * Each following dimension specified includes previous dimension stride:
22594
23968
  * \code
22595
23969
  globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
@@ -22612,6 +23986,7 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
22612
23986
  * The bounding box specified by \p pixelBoxLowerCorner and \p pixelBoxUpperCorner must have non-zero area.
22613
23987
  *
22614
23988
  * - \p channelsPerPixel, which specifies the number of elements which must be accessed along C dimension, must be less than or equal to 256.
23989
+ * Additionally, when \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p channelsPerPixel must be 128.
22615
23990
  *
22616
23991
  * - \p pixelsPerColumn, which specifies the number of elements that must be accessed along the {N, D, H, W} dimensions, must be less than or
22617
23992
  * equal to 1024.
@@ -22634,18 +24009,22 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
22634
24009
  * TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
22635
24010
  * uses 32 bytes.
22636
24011
  * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE and \p swizzle is not ::CU_TENSOR_MAP_SWIZZLE_NONE, the bounding box inner dimension
22637
- * (computed as \p boxDim[0] multiplied by element size derived from \p tensorDataType) must be less than or equal to the swizzle size.
22638
- * - CU_TENSOR_MAP_SWIZZLE_32B implies the bounding box inner dimension will be <= 32.
22639
- * - CU_TENSOR_MAP_SWIZZLE_64B implies the bounding box inner dimension will be <= 64.
22640
- * - CU_TENSOR_MAP_SWIZZLE_128B implies the bounding box inner dimension will be <= 128.
24012
+ * (computed as \p channelsPerPixel multiplied by element size in bytes derived from \p tensorDataType) must be less than or equal to the swizzle size.
24013
+ * - CU_TENSOR_MAP_SWIZZLE_32B requires the bounding box inner dimension to be <= 32.
24014
+ * - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64.
24015
+ * - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128.
24016
+ * Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE.
22641
24017
  *
22642
24018
  * - \p swizzle, which specifies the shared memory bank swizzling pattern, has to be of type ::CUtensorMapSwizzle which is defined as:
22643
24019
  * \code
22644
24020
  typedef enum CUtensorMapSwizzle_enum {
22645
24021
  CU_TENSOR_MAP_SWIZZLE_NONE = 0,
22646
- CU_TENSOR_MAP_SWIZZLE_32B,
22647
- CU_TENSOR_MAP_SWIZZLE_64B,
22648
- CU_TENSOR_MAP_SWIZZLE_128B
24022
+ CU_TENSOR_MAP_SWIZZLE_32B, // Swizzle 16B chunks within 32B span
24023
+ CU_TENSOR_MAP_SWIZZLE_64B, // Swizzle 16B chunks within 64B span
24024
+ CU_TENSOR_MAP_SWIZZLE_128B, // Swizzle 16B chunks within 128B span
24025
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B, // Swizzle 32B chunks within 128B span
24026
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B, // Swizzle 32B chunks within 128B span, additionally swap lower 8B with upper 8B within each 16B for every alternate row
24027
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B // Swizzle 64B chunks within 128B span
22649
24028
  } CUtensorMapSwizzle;
22650
24029
  * \endcode
22651
24030
  * Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
@@ -22653,6 +24032,15 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
22653
24032
  * problem, data can be loaded to shared memory with shuffling across shared memory banks.
22654
24033
  * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p swizzle must be ::CU_TENSOR_MAP_SWIZZLE_32B.
22655
24034
  * Other interleave modes can have any swizzling pattern.
24035
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported:
24036
+ * - CU_TENSOR_MAP_SWIZZLE_NONE (Load & Store)
24037
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
24038
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
24039
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B (Store only)
24040
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported:
24041
+ * - CU_TENSOR_MAP_SWIZZLE_NONE (Load only)
24042
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
24043
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
22656
24044
  *
22657
24045
  * - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
22658
24046
  * type ::CUtensorMapL2promotion, which is defined as:
@@ -22673,7 +24061,8 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
22673
24061
  CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
22674
24062
  } CUtensorMapFloatOOBfill;
22675
24063
  * \endcode
22676
- * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type.
24064
+ * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
24065
+ * and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, or ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.
22677
24066
  *
22678
24067
  * \param tensorMap - Tensor map object to create
22679
24068
  * \param tensorDataType - Tensor data type
@@ -22700,12 +24089,197 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
22700
24089
  *
22701
24090
  * \sa
22702
24091
  * ::cuTensorMapEncodeTiled,
24092
+ * ::cuTensorMapEncodeIm2colWide,
22703
24093
  * ::cuTensorMapReplaceAddress
22704
24094
  */
22705
24095
  CUresult CUDAAPI cuTensorMapEncodeIm2col(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const int *pixelBoxLowerCorner, const int *pixelBoxUpperCorner, cuuint32_t channelsPerPixel, cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
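As a quick sanity check of the inner-dimension rule above, a minimal sketch (not part of the header; the helper name and element-size argument are illustrative) that validates the bounding box inner dimension against the chosen swizzle before encoding:

static int inner_dim_fits_swizzle(unsigned innerElems, unsigned elemSizeBytes,
                                  CUtensorMapSwizzle swizzle) {
    /* bounding box inner dimension = element count * element size in bytes */
    unsigned innerBytes = innerElems * elemSizeBytes;
    switch (swizzle) {
    case CU_TENSOR_MAP_SWIZZLE_NONE: return 1;                /* no swizzle-size bound */
    case CU_TENSOR_MAP_SWIZZLE_32B:  return innerBytes <= 32;
    case CU_TENSOR_MAP_SWIZZLE_64B:  return innerBytes <= 64;
    default:                         return innerBytes <= 128; /* all 128B variants */
    }
}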
22706
24096
 
22707
24097
  /**
22708
- * \brief Modify an existing tensor map descriptor with an updated global address
24098
+ * \brief Create a tensor map descriptor object representing an im2col memory region, but where
24099
+ * the elements are exclusively loaded along the W dimension.
24100
+ *
24101
+ * Creates a descriptor for a Tensor Memory Access (TMA) object specified by the parameters
24102
+ * describing an im2col memory layout where the row is always loaded along the W dimension,
24103
+ * and returns it in \p tensorMap. This assumes the tensor layout in memory is either NDHWC,
24104
+ * NHWC, or NWC.
24105
+ *
24106
+ * This API is only supported on devices of compute capability 10.0 or higher.
24107
+ * Additionally, a tensor map object is an opaque value, and, as such, should only be
24108
+ * accessed through CUDA APIs and PTX.
24109
+ *
24110
+ * The parameters passed are bound to the following requirements:
24111
+ *
24112
+ * - \p tensorMap address must be aligned to 64 bytes.
24113
+ *
24114
+ * - \p tensorDataType has to be an enum from ::CUtensorMapDataType which is defined as:
24115
+ * \code
24116
+ typedef enum CUtensorMapDataType_enum {
24117
+ CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0, // 1 byte
24118
+ CU_TENSOR_MAP_DATA_TYPE_UINT16, // 2 bytes
24119
+ CU_TENSOR_MAP_DATA_TYPE_UINT32, // 4 bytes
24120
+ CU_TENSOR_MAP_DATA_TYPE_INT32, // 4 bytes
24121
+ CU_TENSOR_MAP_DATA_TYPE_UINT64, // 8 bytes
24122
+ CU_TENSOR_MAP_DATA_TYPE_INT64, // 8 bytes
24123
+ CU_TENSOR_MAP_DATA_TYPE_FLOAT16, // 2 bytes
24124
+ CU_TENSOR_MAP_DATA_TYPE_FLOAT32, // 4 bytes
24125
+ CU_TENSOR_MAP_DATA_TYPE_FLOAT64, // 8 bytes
24126
+ CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, // 2 bytes
24127
+ CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ, // 4 bytes
24128
+ CU_TENSOR_MAP_DATA_TYPE_TFLOAT32, // 4 bytes
24129
+ CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ, // 4 bytes
24130
+ CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, // 4 bits
24131
+ CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, // 4 bits
24132
+ CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B // 6 bits
24133
+ } CUtensorMapDataType;
24134
+ * \endcode
24135
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned as 8 bytes. There are no gaps between packed values.
24136
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned as 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values.
24137
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned as 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values.
24138
+ *
24139
+ * - \p tensorRank, which specifies the number of tensor dimensions, must be 3, 4, or 5.
24140
+ *
24141
+ * - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met:
24142
+ * - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned.
24143
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned.
24144
+ *
24145
+ * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
24146
+ * equal to 2^32. Additionally, the following requirements need to be met for the packed data types:
24147
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128.
24148
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2.
24149
+ * - Dimension for the packed data types must reflect the number of individual U# values.
24150
+ *
24151
+ * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
24152
+ * multiple of 16 and less than 2^40. Additionally, the following requirements need to be met:
24153
+ * - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32.
24154
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32.
24155
+ * Each subsequent dimension's stride builds on the previous dimension's stride:
24156
+ * \code
24157
+ globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
24158
+ for (i = 1; i < tensorRank - 1; i++)
24159
+ globalStrides[i] = globalStrides[i - 1] * (globalDim[i] + padding[i]);
24160
+ assert(globalStrides[i] >= globalDim[i]);
24161
+ * \endcode
24162
+ *
24163
+ * - \p pixelBoxLowerCornerWidth specifies the coordinate offset W of the bounding box from left corner. The offset must be
24164
+ * within range [-32768, 32767].
24165
+ *
24166
+ * - \p pixelBoxUpperCornerWidth specifies the coordinate offset W of the bounding box from right corner. The offset must be
24167
+ * within range [-32768, 32767].
24168
+ *
24169
+ * The bounding box specified by \p pixelBoxLowerCornerWidth and \p pixelBoxUpperCornerWidth must have non-zero area. Note
24170
+ * that the size of the box along D and H dimensions is always equal to one.
24171
+ *
24172
+ * - \p channelsPerPixel, which specifies the number of elements which must be accessed along C dimension, must be less than or equal to 256.
24173
+ * Additionally, when \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p channelsPerPixel must be 128.
24174
+ *
24175
+ * - \p pixelsPerColumn, which specifies the number of elements that must be accessed along the W dimension, must be less than or
24176
+ * equal to 1024. This field is ignored when \p mode is ::CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128.
24177
+ *
24178
+ * - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less
24179
+ * than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since
24180
+ * TMA doesn’t support the stride for dimension zero.
24181
+ * When all elements of the \p elementStrides array are one, \p boxDim specifies the number of elements to load. However, if \p elementStrides[i]
24182
+ * is not equal to one for some \p i, then TMA loads ceil( \p boxDim[i] / \p elementStrides[i]) number of elements along i-th dimension.
24183
+ * To load N elements along i-th dimension, \p boxDim[i] must be set to N * \p elementStrides[i].
24184
+ *
24185
+ * - \p interleave specifies the interleaved layout of type ::CUtensorMapInterleave, which is defined as:
24186
+ * \code
24187
+ typedef enum CUtensorMapInterleave_enum {
24188
+ CU_TENSOR_MAP_INTERLEAVE_NONE = 0,
24189
+ CU_TENSOR_MAP_INTERLEAVE_16B,
24190
+ CU_TENSOR_MAP_INTERLEAVE_32B
24191
+ } CUtensorMapInterleave;
24192
+ * \endcode
24193
+ * TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
24194
+ * uses 32 bytes.
24195
+ * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the bounding box inner dimension (computed as \p channelsPerPixel multiplied by
24196
+ * element size in bytes derived from \p tensorDataType) must be less than or equal to the swizzle size.
24197
+ * - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64.
24198
+ * - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128.
24199
+ * Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE.
24200
+ *
24201
+ * - \p mode, which describes how elements are loaded along the W dimension, has to be one of the following ::CUtensorMapIm2ColWideMode types:
24202
+ * \code
24203
+ * CU_TENSOR_MAP_IM2COL_WIDE_MODE_W,
24204
+ * CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128
24205
+ * \endcode
24206
+ * ::CU_TENSOR_MAP_IM2COL_WIDE_MODE_W allows the number of elements loaded along the W dimension to be specified
24207
+ * via the \p pixelsPerColumn field.
24208
+ *
24209
+ * - \p swizzle, which specifies the shared memory bank swizzling pattern, must be one of the following
24210
+ * ::CUtensorMapSwizzle modes (other swizzle modes are not supported):
24211
+ * \code
24212
+ typedef enum CUtensorMapSwizzle_enum {
24213
+ CU_TENSOR_MAP_SWIZZLE_64B, // Swizzle 16B chunks within 64B span
24214
+ CU_TENSOR_MAP_SWIZZLE_128B, // Swizzle 16B chunks within 128B span
24215
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B, // Swizzle 32B chunks within 128B span
24216
+ } CUtensorMapSwizzle;
24217
+ * \endcode
24218
+ * Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
24219
+ * in shared memory. This difference in data organization may cause bank conflicts when shared memory is accessed. In order to avoid this
24220
+ * problem, data can be loaded to shared memory with shuffling across shared memory banks.
24221
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported:
24222
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
24223
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
24224
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported:
24225
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
24226
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
24227
+ *
24228
+ * - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
24229
+ * type ::CUtensorMapL2promotion, which is defined as:
24230
+ * \code
24231
+ typedef enum CUtensorMapL2promotion_enum {
24232
+ CU_TENSOR_MAP_L2_PROMOTION_NONE = 0,
24233
+ CU_TENSOR_MAP_L2_PROMOTION_L2_64B,
24234
+ CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
24235
+ CU_TENSOR_MAP_L2_PROMOTION_L2_256B
24236
+ } CUtensorMapL2promotion;
24237
+ * \endcode
24238
+ *
24239
+ * - \p oobFill, which indicates whether zero or a special NaN constant should be used to fill out-of-bound elements, must be of type
24240
+ * ::CUtensorMapFloatOOBfill which is defined as:
24241
+ * \code
24242
+ typedef enum CUtensorMapFloatOOBfill_enum {
24243
+ CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0,
24244
+ CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
24245
+ } CUtensorMapFloatOOBfill;
24246
+ * \endcode
24247
+ * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
24248
+ * and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, or ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.
24249
+ *
24250
+ * \param tensorMap - Tensor map object to create
24251
+ * \param tensorDataType - Tensor data type
24252
+ * \param tensorRank - Dimensionality of tensor; must be at least 3
24253
+ * \param globalAddress - Starting address of memory region described by tensor
24254
+ * \param globalDim - Array containing tensor size (number of elements) along each of the \p tensorRank dimensions
24255
+ * \param globalStrides - Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions
24256
+ * \param pixelBoxLowerCornerWidth - Width offset of left box corner
24257
+ * \param pixelBoxUpperCornerWidth - Width offset of right box corner
24258
+ * \param channelsPerPixel - Number of channels per pixel
24259
+ * \param pixelsPerColumn - Number of pixels per column
24260
+ * \param elementStrides - Array containing traversal stride in each of the \p tensorRank dimensions
24261
+ * \param interleave - Type of interleaved layout the tensor addresses
24262
+ * \param mode - W or W128 mode
24263
+ * \param swizzle - Bank swizzling pattern inside shared memory
24264
+ * \param l2Promotion - L2 promotion size
24265
+ * \param oobFill - Indicate whether zero or special NaN constant will be used to fill out-of-bound elements
24266
+ *
24267
+ * \return
24268
+ * ::CUDA_SUCCESS,
24269
+ * ::CUDA_ERROR_DEINITIALIZED,
24270
+ * ::CUDA_ERROR_NOT_INITIALIZED,
24271
+ * ::CUDA_ERROR_INVALID_CONTEXT,
24272
+ * ::CUDA_ERROR_INVALID_VALUE
24273
+ *
24274
+ * \sa
24275
+ * ::cuTensorMapEncodeTiled,
24276
+ * ::cuTensorMapEncodeIm2col,
24277
+ * ::cuTensorMapReplaceAddress
24278
+ */
24279
+ CUresult CUDAAPI cuTensorMapEncodeIm2colWide(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, int pixelBoxLowerCornerWidth, int pixelBoxUpperCornerWidth, cuuint32_t channelsPerPixel, cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapIm2ColWideMode mode, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
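A hedged sketch of one parameter combination that satisfies the constraints above: a rank-3 NWC float16 tensor in W mode, where devPtr and all extents are illustrative assumptions rather than values taken from this header. Note the inner dimension (64 channels x 2 bytes = 128) stays within the 128B swizzle:

CUtensorMap map;
cuuint64_t dim[3]        = {64, 1024, 16};       /* C, W, N extents */
cuuint64_t strides[2]    = {128, 128 * 1024};    /* bytes; multiples of 16 */
cuuint32_t elemStride[3] = {1, 1, 1};
CUresult rc = cuTensorMapEncodeIm2colWide(
    &map, CU_TENSOR_MAP_DATA_TYPE_FLOAT16, 3, devPtr, dim, strides,
    /*pixelBoxLowerCornerWidth=*/0, /*pixelBoxUpperCornerWidth=*/-2,
    /*channelsPerPixel=*/64, /*pixelsPerColumn=*/128, elemStride,
    CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_IM2COL_WIDE_MODE_W,
    CU_TENSOR_MAP_SWIZZLE_128B, CU_TENSOR_MAP_L2_PROMOTION_NONE,
    CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);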
24280
+
24281
+ /**
24282
+ * \brief Modify an existing tensor map descriptor with an updated global address
22709
24283
  *
22710
24284
  * Modifies the descriptor for Tensor Memory Access (TMA) object passed in \p tensorMap with
22711
24285
  * an updated \p globalAddress.
@@ -22727,6 +24301,7 @@ CUresult CUDAAPI cuTensorMapEncodeIm2col(CUtensorMap *tensorMap, CUtensorMapData
22727
24301
  * \sa
22728
24302
  * ::cuTensorMapEncodeTiled,
22729
24303
  * ::cuTensorMapEncodeIm2col
24304
+ * ::cuTensorMapEncodeIm2colWide
22730
24305
  */
22731
24306
  CUresult CUDAAPI cuTensorMapReplaceAddress(CUtensorMap *tensorMap, void *globalAddress);
22732
24307
 
@@ -23261,9 +24836,29 @@ typedef enum CUcoredumpSettings_enum {
23261
24836
  CU_COREDUMP_ENABLE_USER_TRIGGER,
23262
24837
  CU_COREDUMP_FILE,
23263
24838
  CU_COREDUMP_PIPE,
24839
+ CU_COREDUMP_GENERATION_FLAGS,
23264
24840
  CU_COREDUMP_MAX
23265
24841
  } CUcoredumpSettings;
23266
24842
 
24843
+ /**
24844
+ * Flags for controlling coredump contents
24845
+ */
24846
+ typedef enum CUCoredumpGenerationFlags {
24847
+ CU_COREDUMP_DEFAULT_FLAGS = 0,
24848
+ CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES = (1 << 0),
24849
+ CU_COREDUMP_SKIP_GLOBAL_MEMORY = (1 << 1),
24850
+ CU_COREDUMP_SKIP_SHARED_MEMORY = (1 << 2),
24851
+ CU_COREDUMP_SKIP_LOCAL_MEMORY = (1 << 3),
24852
+ CU_COREDUMP_SKIP_ABORT = (1 << 4),
24853
+ CU_COREDUMP_SKIP_CONSTBANK_MEMORY = (1 << 5),
24854
+
24855
+ CU_COREDUMP_LIGHTWEIGHT_FLAGS = CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES
24856
+ | CU_COREDUMP_SKIP_GLOBAL_MEMORY
24857
+ | CU_COREDUMP_SKIP_SHARED_MEMORY
24858
+ | CU_COREDUMP_SKIP_LOCAL_MEMORY
24859
+ | CU_COREDUMP_SKIP_CONSTBANK_MEMORY
24860
+ } CUCoredumpGenerationFlags;
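As a sketch of how these flags plug into the existing attribute setters (the combination of values here is illustrative; error checking elided):

int enable = 1;
size_t sz = sizeof(enable);
cuCoredumpSetAttribute(CU_COREDUMP_ENABLE_ON_EXCEPTION, &enable, &sz);

/* skip device-global memory in the dump and keep the host process alive */
int genFlags = CU_COREDUMP_SKIP_GLOBAL_MEMORY | CU_COREDUMP_SKIP_ABORT;
sz = sizeof(genFlags);
cuCoredumpSetAttribute(CU_COREDUMP_GENERATION_FLAGS, &genFlags, &sz);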
24861
+
23267
24862
  /**
23268
24863
  * \brief Allows caller to fetch a coredump attribute value for the current context
23269
24864
  *
@@ -23280,10 +24875,12 @@ typedef enum CUcoredumpSettings_enum {
23280
24875
  * CU_CTX_USER_COREDUMP_ENABLE flag was set during context creation.
23281
24876
  * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
23282
24877
  * also create a coredump. The default value is ::true unless set to ::false globally or
23283
- * or locally.
24878
+ * locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
24879
+ * flag to disable the host-side abort() if needed.
23284
24880
  * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
23285
24881
  * will not have a dump of GPU memory or non-reloc ELF images. The default value is
23286
- * ::false unless set to ::true globally or locally.
24882
+ * ::false unless set to ::true globally or locally. This attribute is deprecated as
24883
+ * of CUDA 12.5; please use ::CU_COREDUMP_GENERATION_FLAGS instead.
23287
24884
  * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
23288
24885
  * created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
23289
24886
  * value is ::false unless set to ::true globally or locally.
@@ -23295,6 +24892,22 @@ typedef enum CUcoredumpSettings_enum {
23295
24892
  * that will be monitored if user-triggered coredumps are enabled. The default value is
23296
24893
  * ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
23297
24894
  * the CUDA application and ::PID is the process ID of the CUDA application.
24895
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
24896
+ * contained in a coredump, specified as a bitwise OR combination of the following values:
24897
+ * + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
24898
+ * default settings of including all memory regions that it is able to access
24899
+ * + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
24900
+ * CUDA source modules that are not relocated at runtime.
24901
+ * + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
24902
+ * that does not belong to any context.
24903
+ * + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
24904
+ * for the warp that the dumped kernel belonged to.
24905
+ * + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
24906
+ * + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
24907
+ * the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
24908
+ * + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
24909
+ * process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
24910
+ * behavior.
23298
24911
  *
23299
24912
  * \param attrib - The enum defining which value to fetch.
23300
24913
  * \param value - void* containing the requested data.
@@ -23330,10 +24943,13 @@ CUresult CUDAAPI cuCoredumpGetAttribute(CUcoredumpSettings attrib, void* value,
23330
24943
  * this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
23331
24944
  * The default value is ::false.
23332
24945
  * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
23333
- * also create a coredump. The default value is ::true.
24946
+ * also create a coredump. The default value is ::true unless set to ::false globally
24947
+ * or locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
24948
+ * flag to disable the host-side abort() if needed.
23334
24949
  * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
23335
24950
  * will not have a dump of GPU memory or non-reloc ELF images. The default value is
23336
- * ::false.
24951
+ * ::false. This attribute is deprecated as of CUDA 12.5; please use ::CU_COREDUMP_GENERATION_FLAGS
24952
+ * instead.
23337
24953
  * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
23338
24954
  * created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
23339
24955
  * value is ::false.
@@ -23345,6 +24961,22 @@ CUresult CUDAAPI cuCoredumpGetAttribute(CUcoredumpSettings attrib, void* value,
23345
24961
  * that will be monitored if user-triggered coredumps are enabled. The default value is
23346
24962
  * ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
23347
24963
  * the CUDA application and ::PID is the process ID of the CUDA application.
24964
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
24965
+ * contained in a coredump, specified as a bitwise OR combination of the following values:
24966
+ * + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
24967
+ * default settings of including all memory regions that it is able to access
24968
+ * + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
24969
+ * CUDA source modules that are not relocated at runtime.
24970
+ * + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
24971
+ * that does not belong to any context.
24972
+ * + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
24973
+ * for the warp that the dumped kernel belonged to.
24974
+ * + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
24975
+ * + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
24976
+ * the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
24977
+ * + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
24978
+ * process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
24979
+ * behavior.
23348
24980
  *
23349
24981
  * \param attrib - The enum defining which value to fetch.
23350
24982
  * \param value - void* containing the requested data.
@@ -23369,7 +25001,7 @@ CUresult CUDAAPI cuCoredumpGetAttributeGlobal(CUcoredumpSettings attrib, void *v
23369
25001
  *
23370
25002
  * An important design decision to note is that any coredump environment variable values
23371
25003
  * set before CUDA initializes will take permanent precedence over any values set with this
23372
- * this function. This decision was made to ensure no change in behavior for any users that
25004
+ * function. This decision was made to ensure no change in behavior for any users that
23373
25005
  * may be currently using these variables to get coredumps.
23374
25006
  *
23375
25007
  * \p *value shall contain the requested value specified by \p set. It is up to the caller
@@ -23389,14 +25021,33 @@ CUresult CUDAAPI cuCoredumpGetAttributeGlobal(CUcoredumpSettings attrib, void *v
23389
25021
  * this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
23390
25022
  * The default value is ::false.
23391
25023
  * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
23392
- * also create a coredump. The default value is ::true.
25024
+ * also create a coredump. The default value is ::true unless set to ::false globally
25025
+ * or locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
25026
+ * flag to disable the host-side abort() if needed.
23393
25027
  * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
23394
25028
  * will not have a dump of GPU memory or non-reloc ELF images. The default value is
23395
- * ::false.
25029
+ * ::false. This attribute is deprecated as of CUDA 12.5; please use ::CU_COREDUMP_GENERATION_FLAGS
25030
+ * instead.
23396
25031
  * - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where
23397
25032
  * any coredumps generated by this context will be written. The default value is
23398
25033
  * ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
23399
25034
  * the CUDA applications and ::PID is the process ID of the CUDA application.
25035
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
25036
+ * contained in a coredump, specified as a bitwise OR combination of the following values:
25037
+ * + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
25038
+ * default settings of including all memory regions that it is able to access
25039
+ * + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
25040
+ * CUDA source modules that are not relocated at runtime.
25041
+ * + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
25042
+ * that does not belong to any context.
25043
+ * + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
25044
+ * for the warp that the dumped kernel belonged to.
25045
+ * + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
25046
+ * + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
25047
+ * the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
25048
+ * + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
25049
+ * process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
25050
+ * behavior.
23400
25051
  *
23401
25052
  * \param attrib - The enum defining which value to set.
23402
25053
  * \param value - void* containing the requested data.
@@ -23427,7 +25078,7 @@ CUresult CUDAAPI cuCoredumpSetAttribute(CUcoredumpSettings attrib, void* value,
23427
25078
  *
23428
25079
  * An important design decision to note is that any coredump environment variable values
23429
25080
  * set before CUDA initializes will take permanent precedence over any values set with this
23430
- * this function. This decision was made to ensure no change in behavior for any users that
25081
+ * function. This decision was made to ensure no change in behavior for any users that
23431
25082
  * may be currently using these variables to get coredumps.
23432
25083
  *
23433
25084
  * \p *value shall contain the requested value specified by \p set. It is up to the caller
@@ -23441,10 +25092,13 @@ CUresult CUDAAPI cuCoredumpSetAttribute(CUcoredumpSettings attrib, void* value,
23441
25092
  * this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
23442
25093
  * The default value is ::false.
23443
25094
  * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
23444
- * also create a coredump. The default value is ::true.
25095
+ * also create a coredump. The default value is ::true unless set to ::false globally
25096
+ * or locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
25097
+ * flag to disable the host-side abort() if needed.
23445
25098
  * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
23446
25099
  * will not have a dump of GPU memory or non-reloc ELF images. The default value is
23447
- * ::false.
25100
+ * ::false. This attribute is deprecated as of CUDA 12.5; please use ::CU_COREDUMP_GENERATION_FLAGS
25101
+ * instead.
23448
25102
  * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
23449
25103
  * created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
23450
25104
  * value is ::false.
@@ -23457,6 +25111,22 @@ CUresult CUDAAPI cuCoredumpSetAttribute(CUcoredumpSettings attrib, void* value,
23457
25111
  * changed after ::CU_COREDUMP_ENABLE_USER_TRIGGER is set to ::true. The default
23458
25112
  * value is ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine
23459
25113
  * running the CUDA application and ::PID is the process ID of the CUDA application.
25114
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
25115
+ * contained in a coredump, specified as a bitwise OR combination of the following values:
25116
+ * + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
25117
+ * default settings of including all memory regions that it is able to access
25118
+ * + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
25119
+ * CUDA source modules that are not relocated at runtime.
25120
+ * + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
25121
+ * that does not belong to any context.
25122
+ * + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
25123
+ * for the warp that the dumped kernel belonged to.
25124
+ * + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
25125
+ * + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
25126
+ * the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
25127
+ * + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
25128
+ * process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
25129
+ * behavior.
23460
25130
  *
23461
25131
  * \param attrib - The enum defining which value to set.
23462
25132
  * \param value - void* containing the requested data.
@@ -23523,13 +25193,6 @@ CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExp
23523
25193
  * @{
23524
25194
  */
23525
25195
 
23526
- /*!
23527
- * \typedef typedef struct CUgreenCtx_st* CUgreenCtx
23528
- * A green context handle. This handle can be used safely from only one CPU thread at a time.
23529
- * Created via ::cuGreenCtxCreate
23530
- */
23531
- typedef struct CUgreenCtx_st *CUgreenCtx;
23532
-
23533
25196
  /*!
23534
25197
  * \typedef struct CUdevResourceDesc_st* CUdevResourceDesc;
23535
25198
  * An opaque descriptor handle. The descriptor encapsulates multiple created and configured resources.
@@ -23541,6 +25204,11 @@ typedef enum {
23541
25204
  CU_GREEN_CTX_DEFAULT_STREAM = 0x1, /**< Required. Creates a default stream to use inside the green context */
23542
25205
  } CUgreenCtxCreate_flags;
23543
25206
 
25207
+ typedef enum {
25208
+ CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING = 0x1,
25209
+ CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE = 0x2,
25210
+ } CUdevSmResourceSplit_flags;
25211
+
23544
25212
  #define RESOURCE_ABI_VERSION 1
23545
25213
  #define RESOURCE_ABI_EXTERNAL_BYTES 48
23546
25214
 
@@ -23554,7 +25222,7 @@ typedef enum {
23554
25222
  typedef enum {
23555
25223
  CU_DEV_RESOURCE_TYPE_INVALID = 0,
23556
25224
  CU_DEV_RESOURCE_TYPE_SM = 1, /**< Streaming multiprocessors related information */
23557
- #ifdef __CUDA_API_VERSION_INTERNAL
25225
+ #if defined(__CUDA_API_VERSION_INTERNAL) && !defined(__CUDA_API_VERSION_INTERNAL_ODR)
23558
25226
  CU_DEV_RESOURCE_TYPE_MAX,
23559
25227
  #endif
23560
25228
  } CUdevResourceType;
@@ -23777,18 +25445,24 @@ CUresult CUDAAPI cuGreenCtxGetDevResource(CUgreenCtx hCtx, CUdevResource* resour
23777
25445
  * first creating a descriptor and a green context with that descriptor.
23778
25446
  *
23779
25447
  * When creating the groups, the API will take into account the performance and functional characteristics of the
23780
- * input resource, and guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to less groups created
25448
+ * input resource, and guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to fewer groups created
23781
25449
  * than purely dividing the total SM count by the \p minCount due to cluster requirements or
23782
25450
  * alignment and granularity requirements for the minCount.
23783
25451
  *
23784
- * The \p remainder set, might not have the same functional or performance guarantees as the groups in \p result.
25452
+ * The \p remainder set does not have the same functional or performance guarantees as the groups in \p result.
23785
25453
  * Its use should be carefully planned and future partitions of the \p remainder set are discouraged.
23786
25454
  *
25455
+ * The following flags are supported:
25456
+ * - \p CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING : Lower the minimum SM count and alignment, and treat each SM independently of its hierarchy.
25457
+ * This allows more fine-grained partitions, but at the cost of advanced features (such as large clusters on compute capability 9.0+).
25458
+ * - \p CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE : Compute Capability 9.0+ only. Attempt to create groups that may allow
25459
+ * for maximally sized thread clusters. This can be queried post green context creation using ::cuOccupancyMaxPotentialClusterSize.
25460
+ *
23787
25461
  * A successful API call must either have:
23788
- * - A valid array of \p result pointers of size passed in \p nbGroups, with \p Input of type \p CU_DEV_RESOURCE_TYPE_SM.
23789
- * Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining and \p useFlags are optional.
23790
- * - NULL passed in for \p result, with a valid integer pointer in \p nbGroups and \p Input of type \p CU_DEV_RESOURCE_TYPE_SM.
23791
- * Value of \p minCount must be between 0 and the SM count specified in \p input.
25462
+ * - A valid array of \p result pointers of size passed in \p nbGroups, with \p input of type \p CU_DEV_RESOURCE_TYPE_SM.
25463
+ * Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining may be NULL.
25464
+ * - NULL passed in for \p result, with a valid integer pointer in \p nbGroups and \p input of type \p CU_DEV_RESOURCE_TYPE_SM.
25465
+ * Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining may be NULL.
23792
25466
  * This queries the number of groups that would be created by the API.
23793
25467
  *
23794
25468
  * Note: The API is not supported on 32-bit platforms.
@@ -23798,7 +25472,7 @@ CUresult CUDAAPI cuGreenCtxGetDevResource(CUgreenCtx hCtx, CUdevResource* resour
23798
25472
  * \param input - Input SM resource to be split. Must be a valid \p CU_DEV_RESOURCE_TYPE_SM resource.
23799
25473
  * \param remaining - If the input resource cannot be cleanly split among \p nbGroups, the remainder is placed here.
23800
25474
  * Can be omitted (NULL) if the user does not need the remaining set.
23801
- * \param useFlags - Flags specifying how these partitions are used or which constraints to abide by when splitting the input.
25475
+ * \param useFlags - Flags specifying how these partitions are used or which constraints to abide by when splitting the input. Zero is valid for default behavior.
23802
25476
  * \param minCount - Minimum number of SMs required
23803
25477
  *
23804
25478
  * \return
@@ -23821,10 +25495,18 @@ CUresult CUDAAPI cuDevSmResourceSplitByCount(
23821
25495
  /**
23822
25496
  * \brief Generate a resource descriptor
23823
25497
  *
23824
- * Generates a resource descriptor with the set of resources specified in \p resources.
25498
+ * Generates a single resource descriptor with the set of resources specified in \p resources.
23825
25499
  * The generated resource descriptor is necessary for the creation of green contexts via the ::cuGreenCtxCreate API.
23826
- * The API expects \p nbResources == 1, as there is only one type of resource and merging the same
23827
- * types of resource is currently not supported.
25500
+ * Resources of the same type can be passed in, provided they meet the requirements as noted below.
25501
+ *
25502
+ * A successful API call must have:
25503
+ * - A valid output pointer for the \p phDesc descriptor as well as a valid array of \p resources pointers,
25504
+ * with the array size passed in \p nbResources.
25505
+ * If multiple resources are provided in \p resources, the device they came from must be the same,
25506
+ * otherwise CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned.
25507
+ * If multiple resources are provided in \p resources and they are of type ::CU_DEV_RESOURCE_TYPE_SM,
25508
+ * they must be outputs (whether \p result or \p remaining) from the same split API instance,
25509
+ * otherwise CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned.
23828
25510
  *
23829
25511
  * Note: The API is not supported on 32-bit platforms.
23830
25512
  *
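Putting the split and descriptor APIs together, a hedged sketch of the documented flow (assumes <cuda.h>, <stdlib.h>, an initialized driver, and a valid CUdevice dev; error checks elided):

CUdevResource input, remaining;
cuDeviceGetDevResource(dev, &input, CU_DEV_RESOURCE_TYPE_SM);

unsigned int nbGroups = 0;                /* NULL result: query group count only */
cuDevSmResourceSplitByCount(NULL, &nbGroups, &input, NULL, 0, /*minCount=*/8);

CUdevResource *groups = (CUdevResource *)malloc(nbGroups * sizeof(*groups));
cuDevSmResourceSplitByCount(groups, &nbGroups, &input, &remaining, 0, 8);

CUdevResourceDesc desc;                   /* one group -> one descriptor */
cuDevResourceGenerateDesc(&desc, &groups[0], 1);

CUgreenCtx gctx;
cuGreenCtxCreate(&gctx, desc, dev, CU_GREEN_CTX_DEFAULT_STREAM);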
@@ -23848,15 +25530,16 @@ CUresult CUDAAPI cuDevResourceGenerateDesc(CUdevResourceDesc *phDesc, CUdevResou
23848
25530
  /**
23849
25531
  * \brief Records an event.
23850
25532
  *
23851
- * Captures in \phEvent all the activities of the green context of \phCtx
23852
- * at the time of this call. \phEvent and \phCtx must be from the same
23853
- * CUDA context. Calls such as ::cuEventQuery() or ::cuGreenCtxWaitEvent() will
25533
+ * Captures in \p hEvent all the activities of the green context of \p hCtx
25534
+ * at the time of this call. \p hEvent and \p hCtx must be from the same
25535
+ * primary context otherwise ::CUDA_ERROR_INVALID_HANDLE is returned.
25536
+ * Calls such as ::cuEventQuery() or ::cuGreenCtxWaitEvent() will
23854
25537
  * then examine or wait for completion of the work that was captured. Uses of
23855
25538
  * \p hCtx after this call do not modify \p hEvent.
23856
25539
  *
23857
- * \note The API will return an error if the specified green context \p hCtx
23858
- * has a stream in the capture mode. In such a case, the call will invalidate
23859
- * all the conflicting captures.
25540
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED if the
25541
+ * specified green context \p hCtx has a stream in the capture mode. In such
25542
+ * a case, the call will invalidate all the conflicting captures.
23860
25543
  *
23861
25544
  * \param hCtx - Green context to record event for
23862
25545
  * \param hEvent - Event to record
@@ -23866,39 +25549,49 @@ CUresult CUDAAPI cuDevResourceGenerateDesc(CUdevResourceDesc *phDesc, CUdevResou
23866
25549
  * ::CUDA_ERROR_DEINITIALIZED,
23867
25550
  * ::CUDA_ERROR_NOT_INITIALIZED,
23868
25551
  * ::CUDA_ERROR_INVALID_CONTEXT,
23869
- * ::CUDA_ERROR_INVALID_HANDLE
25552
+ * ::CUDA_ERROR_INVALID_HANDLE,
25553
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
23870
25554
  *
23871
25555
  * \sa
23872
25556
  * ::cuGreenCtxWaitEvent,
23873
- * ::cuEventRecord
25557
+ * ::cuEventRecord,
25558
+ * ::cuCtxRecordEvent,
25559
+ * ::cuCtxWaitEvent
23874
25560
  */
23875
25561
  CUresult CUDAAPI cuGreenCtxRecordEvent(CUgreenCtx hCtx, CUevent hEvent);
23876
25562
 
23877
25563
  /**
23878
25564
  * \brief Make a green context wait on an event
23879
25565
  *
23880
- * Makes all future work submitted to green context \phCtx wait for all work
23881
- * captured in \phEvent. The synchronization will be performed on the device
25566
+ * Makes all future work submitted to green context \p hCtx wait for all work
25567
+ * captured in \p hEvent. The synchronization will be performed on the device
23882
25568
  * and will not block the calling CPU thread. See ::cuGreenCtxRecordEvent()
23883
- * for details on what is captured by an event.
25569
+ * or ::cuEventRecord(), for details on what is captured by an event.
25570
+ *
25571
+ * \note \p hEvent may be from a different context or device than \p hCtx.
25572
+ *
25573
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED and
25574
+ * invalidate the capture if the specified event \p hEvent is part of an
25575
+ * ongoing capture sequence or if the specified green context \p hCtx has
25576
+ * a stream in the capture mode.
23884
25577
  *
23885
- * \note The API will return an error and invalidate the capture if the specified
23886
- * event \p hEvent is part of an ongoing capture sequence.
23887
- *
23888
25578
  * \param hCtx - Green context to wait
23889
- * \param hEvent - Event to wait on (may not be NULL)
25579
+ * \param hEvent - Event to wait on
23890
25580
  *
23891
25581
  * \return
23892
25582
  * ::CUDA_SUCCESS,
23893
25583
  * ::CUDA_ERROR_DEINITIALIZED,
23894
25584
  * ::CUDA_ERROR_NOT_INITIALIZED,
23895
25585
  * ::CUDA_ERROR_INVALID_CONTEXT,
23896
- * ::CUDA_ERROR_INVALID_HANDLE
25586
+ * ::CUDA_ERROR_INVALID_HANDLE,
25587
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
23897
25588
  *
23898
25589
  * \sa
23899
25590
  * ::cuGreenCtxRecordEvent,
23900
25591
  * ::cuStreamWaitEvent,
23901
- */
25592
+ * ::cuCtxRecordEvent,
25593
+ * ::cuCtxWaitEvent
25594
+ */
23902
25595
  CUresult CUDAAPI cuGreenCtxWaitEvent(CUgreenCtx hCtx, CUevent hEvent);
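A brief sketch of the record/wait pairing described above, where gctxA and gctxB are assumed, previously created green contexts:

CUevent ev;
cuEventCreate(&ev, CU_EVENT_DISABLE_TIMING);
cuGreenCtxRecordEvent(gctxA, ev);   /* capture gctxA's activities so far */
cuGreenCtxWaitEvent(gctxB, ev);     /* future work in gctxB waits on them */
cuEventDestroy(ev);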
23903
25596
 
23904
25597
  /**
@@ -23910,7 +25603,9 @@ CUresult CUDAAPI cuGreenCtxWaitEvent(CUgreenCtx hCtx, CUevent hEvent);
23910
25603
  * The stream handle \p hStream can refer to any of the following:
23911
25604
  * <ul>
23912
25605
  * <li>
23913
- * a stream created via any of the CUDA driver APIs such as ::cuStreamCreate.
25606
+ * a stream created via any of the CUDA driver APIs such as ::cuStreamCreate, ::cuStreamCreateWithPriority
25607
+ * and ::cuGreenCtxStreamCreate, or their runtime API equivalents such as
25608
+ * ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
23914
25609
  * If during stream creation the context that was active in the calling thread was obtained
23915
25610
  * with cuCtxFromGreenCtx, that green context is returned in \p phCtx.
23916
25611
  * Otherwise, \p *phCtx is set to NULL instead.
@@ -23936,9 +25631,13 @@ CUresult CUDAAPI cuGreenCtxWaitEvent(CUgreenCtx hCtx, CUevent hEvent);
23936
25631
  * \notefnerr
23937
25632
  *
23938
25633
  * \sa ::cuStreamDestroy,
25634
+ * ::cuStreamCreate,
23939
25635
  * ::cuStreamCreateWithPriority,
25636
+ * ::cuStreamGetCtx_v2,
25637
+ * ::cuGreenCtxStreamCreate,
23940
25638
  * ::cuStreamGetPriority,
23941
25639
  * ::cuStreamGetFlags,
25640
+ * ::cuStreamGetDevice
23942
25641
  * ::cuStreamWaitEvent,
23943
25642
  * ::cuStreamQuery,
23944
25643
  * ::cuStreamSynchronize,
@@ -23948,6 +25647,62 @@ CUresult CUDAAPI cuGreenCtxWaitEvent(CUgreenCtx hCtx, CUevent hEvent);
23948
25647
  */
23949
25648
  CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
23950
25649
 
25650
+ /**
25651
+ * \brief Create a stream for use in the green context
25652
+ *
25653
+ * Creates a stream for use in the specified green context \p greenCtx and returns a handle in \p phStream.
25654
+ * The stream can be destroyed by calling ::cuStreamDestroy(). Note that the API ignores the context that
25655
+ * is current to the calling thread and creates a stream in the specified green context \p greenCtx.
25656
+ *
25657
+ * The supported values for \p flags are:
25658
+ * - ::CU_STREAM_NON_BLOCKING: This must be specified. It indicates that work running in the created
25659
+ * stream may run concurrently with work in the default stream, and that
25660
+ * the created stream should perform no implicit synchronization with the default stream.
25661
+ *
25662
+ * Specifying \p priority affects the scheduling priority of work in the stream. Priorities provide a
25663
+ * hint to preferentially run work with higher priority when possible, but do not preempt
25664
+ * already-running work or provide any other functional guarantee on execution order.
25665
+ * \p priority follows a convention where lower numbers represent higher priorities.
25666
+ * '0' represents default priority. The range of meaningful numerical priorities can
25667
+ * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is
25668
+ * outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
25669
+ * it will automatically be clamped to the lowest or the highest number in the range.
25670
+ *
25671
+ * \param phStream - Returned newly created stream
25672
+ * \param greenCtx - Green context in which to create the stream
25673
+ * \param flags - Flags for stream creation. \p CU_STREAM_NON_BLOCKING must be specified.
25674
+ * \param priority - Stream priority. Lower numbers represent higher priorities.
25675
+ * See ::cuCtxGetStreamPriorityRange for more information about
25676
+ * meaningful stream priorities that can be passed.
25677
+ *
25678
+ * \return
25679
+ * ::CUDA_SUCCESS,
25680
+ * ::CUDA_ERROR_DEINITIALIZED,
25681
+ * ::CUDA_ERROR_NOT_INITIALIZED,
25682
+ * ::CUDA_ERROR_INVALID_CONTEXT,
25683
+ * ::CUDA_ERROR_INVALID_VALUE,
25684
+ * ::CUDA_ERROR_OUT_OF_MEMORY
25685
+ * \notefnerr
25686
+ *
25687
+ * \note In the current implementation, only compute kernels launched in
25688
+ * priority streams are affected by the stream's priority. Stream priorities have
25689
+ * no effect on host-to-device and device-to-host memory operations.
25690
+ *
25691
+ * \sa ::cuStreamDestroy,
25692
+ * ::cuGreenCtxCreate
25693
+ * ::cuStreamCreate,
25694
+ * ::cuStreamGetPriority,
25695
+ * ::cuCtxGetStreamPriorityRange,
25696
+ * ::cuStreamGetFlags,
25697
+ * ::cuStreamGetDevice
25698
+ * ::cuStreamWaitEvent,
25699
+ * ::cuStreamQuery,
25700
+ * ::cuStreamSynchronize,
25701
+ * ::cuStreamAddCallback,
25702
+ * ::cudaStreamCreateWithPriority
25703
+ */
25704
+ CUresult CUDAAPI cuGreenCtxStreamCreate(CUstream* phStream, CUgreenCtx greenCtx, unsigned int flags, int priority);
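A minimal sketch of creating and querying such a stream (gctx is an assumed, previously created green context):

CUstream s;
cuGreenCtxStreamCreate(&s, gctx, CU_STREAM_NON_BLOCKING, /*priority=*/0);
CUgreenCtx owner;
cuStreamGetGreenCtx(s, &owner);     /* reports the green context bound to s */
cuStreamDestroy(s);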
25705
+
23951
25706
  /** @} */
23952
25707
 
23953
25708
  /*
@@ -23991,6 +25746,8 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
23991
25746
  #undef cuMemcpyDtoDAsync
23992
25747
  #undef cuMemcpy2DAsync
23993
25748
  #undef cuMemcpy3DAsync
25749
+ #undef cuMemcpyBatchAsync
25750
+ #undef cuMemcpy3DBatchAsync
23994
25751
  #undef cuMemsetD8
23995
25752
  #undef cuMemsetD16
23996
25753
  #undef cuMemsetD32
@@ -24025,6 +25782,7 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
24025
25782
  #undef cuStreamGetPriority
24026
25783
  #undef cuStreamGetId
24027
25784
  #undef cuStreamGetFlags
25785
+ #undef cuStreamGetDevice
24028
25786
  #undef cuStreamGetCtx
24029
25787
  #undef cuStreamWaitEvent
24030
25788
  #undef cuStreamAddCallback
@@ -24083,6 +25841,8 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
24083
25841
  #undef cuStreamUpdateCaptureDependencies
24084
25842
  #undef cuStreamUpdateCaptureDependencies_v2
24085
25843
  #undef cuGetProcAddress
25844
+ #undef cuStreamGetCtx_v2
25845
+ #undef cuMemBatchDecompressAsync
24086
25846
 
24087
25847
  CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
24088
25848
  CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
@@ -24250,7 +26010,11 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
24250
26010
  CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
24251
26011
  CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);
24252
26012
  CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
24253
-
26013
+ CUresult CUDAAPI cuMemcpyBatchAsync(CUdeviceptr *dsts, CUdeviceptr *srcs, size_t *sizes, size_t count,
26014
+ CUmemcpyAttributes *attrs, size_t *attrsIdxs, size_t numAttrs,
26015
+ size_t *failIdx, CUstream hStream);
26016
+ CUresult CUDAAPI cuMemcpy3DBatchAsync(size_t numOps, CUDA_MEMCPY3D_BATCH_OP *opList,
26017
+ size_t *failIdx, unsigned long long flags, CUstream hStream);
24254
26018
  CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
24255
26019
  CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream);
24256
26020
  CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
@@ -24261,7 +26025,9 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
24261
26025
  CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
24262
26026
  CUresult CUDAAPI cuStreamGetId(CUstream hStream, unsigned long long *streamId);
24263
26027
  CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags);
26028
+ CUresult CUDAAPI cuStreamGetDevice(CUstream hStream, CUdevice *device);
24264
26029
  CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
26030
+ CUresult CUDAAPI cuStreamGetCtx_v2(CUstream hStream, CUcontext *pCtx, CUgreenCtx *pGreenCtx);
24265
26031
  CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
24266
26032
  CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
24267
26033
  CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags);
@@ -24330,6 +26096,15 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
24330
26096
 
24331
26097
  CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
24332
26098
  CUresult CUDAAPI cuStreamUpdateCaptureDependencies_v2(CUstream hStream, CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags);
26099
+
26100
+ CUresult CUDAAPI cuMemBatchDecompressAsync(
26101
+ CUmemDecompressParams *paramsArray,
26102
+ size_t count,
26103
+ unsigned int flags,
26104
+ size_t *errorIndex,
26105
+ CUstream stream
26106
+ );
26107
+
24333
26108
  CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags);
24334
26109
 
24335
26110
  #elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
@@ -24344,6 +26119,152 @@ static inline CUresult cuGetProcAddress_v2_ptsz(const char *symbol, void **funcP
24344
26119
  #define cuGetProcAddress_v2 cuGetProcAddress_v2_ptsz
24345
26120
  #endif
24346
26121
 
26122
+ /**
26123
+ * \defgroup CUDA_CHECKPOINT CUDA Checkpointing
26124
+ *
26125
+ * ___MANBRIEF___ CUDA checkpoint and restore functionality of the low-level
26126
+ * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
26127
+ *
26128
+ * This section describes the checkpoint and restore functions of the low-level
26129
+ * CUDA driver application programming interface.
26130
+ *
26131
+ * The CUDA checkpoint and restore APIs provide a way to save and restore GPU
26132
+ * state for full process checkpoints when used with CPU side process
26133
+ * checkpointing solutions. They can also be used to pause GPU work and suspend
26134
+ * a CUDA process to allow other applications to make use of GPU resources.
26135
+ *
26136
+ * Checkpoint and restore capabilities are currently restricted to Linux.
26137
+ *
26138
+ * @{
26139
+ */
26140
+
26141
+ /**
26142
+ * \brief Returns the restore thread ID for a CUDA process
26143
+ *
26144
+ * Returns in \p *tid the thread ID of the CUDA restore thread for the process
26145
+ * specified by \p pid.
26146
+ *
26147
+ * \param pid - The process ID of the CUDA process
26148
+ * \param tid - Returned restore thread ID
26149
+ *
26150
+ * \return
26151
+ * ::CUDA_SUCCESS
26152
+ * ::CUDA_ERROR_INVALID_VALUE
26153
+ * ::CUDA_ERROR_NOT_INITIALIZED
26154
+ * ::CUDA_ERROR_NOT_SUPPORTED
26155
+ */
26156
+ CUresult CUDAAPI cuCheckpointProcessGetRestoreThreadId(int pid, int *tid);
26157
+
26158
+ /**
26159
+ * \brief Returns the process state of a CUDA process
26160
+ *
26161
+ * Returns in \p *state the current state of the CUDA process specified by \p pid.
26162
+ *
26163
+ * \param pid - The process ID of the CUDA process
26164
+ * \param state - Returned CUDA process state
26165
+ *
26166
+ * \return
26167
+ * ::CUDA_SUCCESS
26168
+ * ::CUDA_ERROR_INVALID_VALUE
26169
+ * ::CUDA_ERROR_NOT_INITIALIZED
26170
+ * ::CUDA_ERROR_NOT_SUPPORTED
26171
+ */
26172
+ CUresult CUDAAPI cuCheckpointProcessGetState(int pid, CUprocessState *state);
26173
+
26174
+ /**
26175
+ * \brief Lock a running CUDA process
26176
+ *
26177
+ * Lock the CUDA process specified by \p pid which will block further CUDA API
26178
+ * calls. Process must be in the RUNNING state in order to lock.
26179
+ *
26180
+ * Upon successful return the process will be in the LOCKED state.
26181
+ *
26182
+ * If timeoutMs is specified and the timeout is reached, the process will be left
26183
+ * in the RUNNING state upon return.
26184
+ *
26185
+ * \param pid - The process ID of the CUDA process
26186
+ * \param args - Optional lock operation arguments
26187
+ *
26188
+ * \return
26189
+ * ::CUDA_SUCCESS
26190
+ * ::CUDA_ERROR_INVALID_VALUE
26191
+ * ::CUDA_ERROR_NOT_INITIALIZED
26192
+ * ::CUDA_ERROR_ILLEGAL_STATE
26193
+ * ::CUDA_ERROR_NOT_SUPPORTED
26194
+ * ::CUDA_ERROR_NOT_READY
26195
+ */
26196
+ CUresult CUDAAPI cuCheckpointProcessLock(int pid, CUcheckpointLockArgs *args);
26197
+
26198
+ /**
26199
+ * \brief Checkpoint a CUDA process's GPU memory contents
26200
+ *
26201
+ * Checkpoints a CUDA process specified by \p pid that is in the LOCKED
26202
+ * state. The GPU memory contents will be brought into host memory and all
26203
+ * underlying references will be released. Process must be in the LOCKED state
26204
+ * to checkpoint.
26205
+ *
26206
+ * Upon successful return the process will be in the CHECKPOINTED state.
26207
+ *
26208
+ * \param pid - The process ID of the CUDA process
26209
+ * \param args - Optional checkpoint operation arguments
26210
+ *
26211
+ * \return
26212
+ * ::CUDA_SUCCESS
26213
+ * ::CUDA_ERROR_INVALID_VALUE
26214
+ * ::CUDA_ERROR_NOT_INITIALIZED
26215
+ * ::CUDA_ERROR_ILLEGAL_STATE
26216
+ * ::CUDA_ERROR_NOT_SUPPORTED
26217
+ */
26218
+ CUresult CUDAAPI cuCheckpointProcessCheckpoint(int pid, CUcheckpointCheckpointArgs *args);
26219
+
26220
+ /**
26221
+ * \brief Restore a CUDA process's GPU memory contents from its last checkpoint
26222
+ *
26223
+ * Restores a CUDA process specified by \p pid from its last checkpoint. Process
26224
+ * must be in the CHECKPOINTED state to restore.
26225
+ *
26226
+ * Upon successful return the process will be in the LOCKED state.
26227
+ *
26228
+ * CUDA process restore requires persistence mode to be enabled or ::cuInit to
26229
+ * have been called before execution.
26230
+ *
26231
+ * \param pid - The process ID of the CUDA process
26232
+ * \param args - Optional restore operation arguments
26233
+ *
26234
+ * \return
26235
+ * ::CUDA_SUCCESS
26236
+ * ::CUDA_ERROR_INVALID_VALUE
26237
+ * ::CUDA_ERROR_NOT_INITIALIZED
26238
+ * ::CUDA_ERROR_ILLEGAL_STATE
26239
+ * ::CUDA_ERROR_NOT_SUPPORTED
26240
+ *
26241
+ * \sa
26242
+ * ::cuInit
26243
+ */
26244
+ CUresult CUDAAPI cuCheckpointProcessRestore(int pid, CUcheckpointRestoreArgs *args);
26245
+
26246
+ /**
26247
+ * \brief Unlock a CUDA process to allow CUDA API calls
26248
+ *
26249
+ * Unlocks a process specified by \p pid allowing it to resume making CUDA API
26250
+ * calls. Process must be in the LOCKED state.
26251
+ *
26252
+ * Upon successful return the process will be in the RUNNING state.
26253
+ *
26254
+ * \param pid - The process ID of the CUDA process
26255
+ * \param args - Optional unlock operation arguments
26256
+ *
26257
+ * \return
26258
+ * ::CUDA_SUCCESS
26259
+ * ::CUDA_ERROR_INVALID_VALUE
26260
+ * ::CUDA_ERROR_NOT_INITIALIZED
26261
+ * ::CUDA_ERROR_ILLEGAL_STATE
26262
+ * ::CUDA_ERROR_NOT_SUPPORTED
26263
+ */
26264
+ CUresult CUDAAPI cuCheckpointProcessUnlock(int pid, CUcheckpointUnlockArgs *args);
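Taken together, the documented state machine suggests a flow like the following sketch; pid is the target CUDA process, and passing NULL for the optional args structs is an assumption based on the "optional" wording above:

cuCheckpointProcessLock(pid, NULL);        /* RUNNING -> LOCKED */
cuCheckpointProcessCheckpoint(pid, NULL);  /* LOCKED -> CHECKPOINTED; GPU state to host */
/* ... CPU-side checkpoint/restore tooling runs here ... */
cuCheckpointProcessRestore(pid, NULL);     /* CHECKPOINTED -> LOCKED */
cuCheckpointProcessUnlock(pid, NULL);      /* LOCKED -> RUNNING */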
26265
+
26266
+ /** @} */ /* End CUDA_CHECKPOINT */
26267
+
24347
26268
  #ifdef __cplusplus
24348
26269
  }
24349
26270
  #endif