triton-windows 3.2.0.post11__cp312-cp312-win_amd64.whl → 3.3.0a0.post11__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (68)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +3 -3
  3. triton/_internal_testing.py +59 -4
  4. triton/_utils.py +35 -0
  5. triton/backends/amd/compiler.py +121 -74
  6. triton/backends/amd/driver.py +77 -43
  7. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +28 -49
  8. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +35 -9
  9. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +761 -284
  10. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +9 -3
  11. triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +1391 -0
  12. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +3 -3
  13. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +44 -0
  14. triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +288 -0
  15. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +110 -14
  16. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +504 -103
  17. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +2 -1
  18. triton/backends/amd/include/hip/amd_detail/host_defines.h +4 -0
  19. triton/backends/amd/include/hip/hip_ext.h +4 -2
  20. triton/backends/amd/include/hip/hip_fp8.h +33 -0
  21. triton/backends/amd/include/hip/hip_runtime_api.h +375 -33
  22. triton/backends/amd/include/hip/hip_version.h +3 -3
  23. triton/backends/amd/include/hip/hiprtc.h +25 -25
  24. triton/backends/amd/include/hsa/amd_hsa_elf.h +40 -14
  25. triton/backends/amd/include/hsa/hsa.h +11 -2
  26. triton/backends/amd/include/hsa/hsa_api_trace.h +30 -17
  27. triton/backends/amd/include/hsa/hsa_api_trace_version.h +68 -0
  28. triton/backends/amd/include/hsa/hsa_ext_amd.h +83 -27
  29. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +46 -46
  30. triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +416 -0
  31. triton/backends/amd/include/roctracer/hip_ostream_ops.h +84 -4
  32. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +260 -0
  33. triton/backends/amd/include/roctracer/hsa_prof_str.h +51 -19
  34. triton/backends/amd/lib/asanrtl.bc +0 -0
  35. triton/backends/compiler.py +25 -225
  36. triton/backends/driver.py +7 -2
  37. triton/backends/nvidia/bin/ptxas.exe +0 -0
  38. triton/backends/nvidia/compiler.py +135 -90
  39. triton/backends/nvidia/driver.c +0 -1
  40. triton/backends/nvidia/driver.py +135 -49
  41. triton/backends/nvidia/include/cuda.h +2162 -241
  42. triton/backends/nvidia/lib/x64/cuda.lib +0 -0
  43. triton/compiler/__init__.py +2 -2
  44. triton/compiler/code_generator.py +334 -231
  45. triton/compiler/compiler.py +77 -66
  46. triton/language/__init__.py +22 -5
  47. triton/language/core.py +448 -74
  48. triton/language/extra/cuda/_experimental_tma.py +3 -5
  49. triton/language/math.py +1 -1
  50. triton/language/random.py +2 -1
  51. triton/language/semantic.py +206 -52
  52. triton/language/standard.py +35 -18
  53. triton/runtime/_allocation.py +32 -0
  54. triton/runtime/autotuner.py +27 -32
  55. triton/runtime/build.py +1 -48
  56. triton/runtime/cache.py +6 -6
  57. triton/runtime/errors.py +10 -0
  58. triton/runtime/interpreter.py +179 -45
  59. triton/runtime/jit.py +149 -190
  60. triton/testing.py +39 -11
  61. triton/tools/compile.py +27 -20
  62. triton/tools/{compile.c → extra/cuda/compile.c} +1 -0
  63. triton/tools/mxfp.py +301 -0
  64. {triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/METADATA +5 -2
  65. {triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/RECORD +68 -59
  66. {triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/top_level.txt +2 -0
  67. /triton/tools/{compile.h → extra/cuda/compile.h} +0 -0
  68. {triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/WHEEL +0 -0
@@ -89,6 +89,7 @@ typedef uint64_t cuuint64_t;
  #define cuDeviceTotalMem cuDeviceTotalMem_v2
  #define cuCtxCreate cuCtxCreate_v2
  #define cuCtxCreate_v3 cuCtxCreate_v3
+ #define cuCtxCreate_v4 cuCtxCreate_v4
  #define cuModuleGetGlobal cuModuleGetGlobal_v2
  #define cuMemGetInfo cuMemGetInfo_v2
  #define cuMemAlloc cuMemAlloc_v2
@@ -115,6 +116,8 @@ typedef uint64_t cuuint64_t;
  #define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2)
  #define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2)
  #define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2)
+ #define cuMemcpyBatchAsync __CUDA_API_PTSZ(cuMemcpyBatchAsync)
+ #define cuMemcpy3DBatchAsync __CUDA_API_PTSZ(cuMemcpy3DBatchAsync)
  #define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2)
  #define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2)
  #define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2)
@@ -183,7 +186,9 @@ typedef uint64_t cuuint64_t;
  #define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority)
  #define cuStreamGetId __CUDA_API_PTSZ(cuStreamGetId)
  #define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags)
+ #define cuStreamGetDevice __CUDA_API_PTSZ(cuStreamGetDevice)
  #define cuStreamGetCtx __CUDA_API_PTSZ(cuStreamGetCtx)
+ #define cuStreamGetCtx_v2 __CUDA_API_PTSZ(cuStreamGetCtx_v2)
  #define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent)
  #define cuStreamEndCapture __CUDA_API_PTSZ(cuStreamEndCapture)
  #define cuStreamIsCapturing __CUDA_API_PTSZ(cuStreamIsCapturing)
@@ -202,6 +207,7 @@ typedef uint64_t cuuint64_t;
  #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources)
  #define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources)
 
+
  #define cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel)
 
  #define cuSignalExternalSemaphoresAsync __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync)
@@ -223,6 +229,8 @@ typedef uint64_t cuuint64_t;
 
  #endif
 
+ #define cuMemBatchDecompressAsync __CUDA_API_PTSZ(cuMemBatchDecompressAsync)
+
  /**
  * \file cuda.h
  * \brief Header file for the CUDA Toolkit application programming interface.
@@ -244,7 +252,7 @@ typedef uint64_t cuuint64_t;
  /**
  * CUDA API version number
  */
- #define CUDA_VERSION 12040
+ #define CUDA_VERSION 12080
 
  #ifdef __cplusplus
  extern "C" {
@@ -263,7 +271,7 @@ typedef CUdeviceptr_v2 CUdeviceptr; /**< CUDA device po
 
  typedef int CUdevice_v1; /**< CUDA device */
  typedef CUdevice_v1 CUdevice; /**< CUDA device */
- typedef struct CUctx_st *CUcontext; /**< CUDA context */
+ typedef struct CUctx_st *CUcontext; /**< A regular context handle */
  typedef struct CUmod_st *CUmodule; /**< CUDA module */
  typedef struct CUfunc_st *CUfunction; /**< CUDA function */
  typedef struct CUlib_st *CUlibrary; /**< CUDA library */
@@ -289,6 +297,12 @@ typedef struct CUuserObject_st *CUuserObject; /**< CUDA user obje
  typedef cuuint64_t CUgraphConditionalHandle; /**< CUDA graph conditional handle */
  typedef struct CUgraphDeviceUpdatableNode_st *CUgraphDeviceNode; /**< CUDA graph device node handle */
  typedef struct CUasyncCallbackEntry_st *CUasyncCallbackHandle; /**< CUDA async notification callback handle */
+ /*!
+ * \typedef typedef struct CUgreenCtx_st* CUgreenCtx
+ * A green context handle. This handle can be used safely from only one CPU thread at a time.
+ * Created via ::cuGreenCtxCreate
+ */
+ typedef struct CUgreenCtx_st *CUgreenCtx;
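The new CUgreenCtx handle is consumed by the green-context entry points, which are not part of this hunk. A minimal sketch of carving a green context out of a device's SM resources, assuming the CUDA 12.4+ helpers (cuDeviceGetDevResource, cuDevSmResourceSplitByCount, cuDevResourceGenerateDesc, cuGreenCtxCreate); error checking elided:

#include <cuda.h>

/* Sketch: build a green context over a 16-SM partition of `dev`. */
static CUresult makeGreenCtx(CUdevice dev, CUgreenCtx *phCtx) {
    CUdevResource sm, split, remaining;
    CUdevResourceDesc desc;
    unsigned int nbGroups = 1;
    cuDeviceGetDevResource(dev, &sm, CU_DEV_RESOURCE_TYPE_SM);
    /* Ask for one group of at least 16 SMs; the rest lands in `remaining`. */
    cuDevSmResourceSplitByCount(&split, &nbGroups, &sm, &remaining, 0, 16);
    cuDevResourceGenerateDesc(&desc, &split, 1);
    /* The resulting handle must be used from one CPU thread at a time. */
    return cuGreenCtxCreate(phCtx, desc, dev, CU_GREEN_CTX_DEFAULT_STREAM);
}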
 
  #ifndef CU_UUID_HAS_BEEN_DEFINED
  #define CU_UUID_HAS_BEEN_DEFINED
@@ -617,41 +631,58 @@ typedef void (*CUasyncCallback)(CUasyncNotificationInfo *info, void *userData, C
  * Array formats
  */
  typedef enum CUarray_format_enum {
- CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */
- CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
- CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
- CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */
- CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */
- CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */
- CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */
- CU_AD_FORMAT_FLOAT = 0x20, /**< 32-bit floating point */
- CU_AD_FORMAT_NV12 = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */
- CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */
- CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */
- CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */
- CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */
- CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */
- CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */
- CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, /**< 1 channel signed 8-bit normalized integer */
- CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, /**< 2 channel signed 8-bit normalized integer */
- CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, /**< 4 channel signed 8-bit normalized integer */
- CU_AD_FORMAT_SNORM_INT16X1 = 0xc9, /**< 1 channel signed 16-bit normalized integer */
- CU_AD_FORMAT_SNORM_INT16X2 = 0xca, /**< 2 channel signed 16-bit normalized integer */
- CU_AD_FORMAT_SNORM_INT16X4 = 0xcb, /**< 4 channel signed 16-bit normalized integer */
- CU_AD_FORMAT_BC1_UNORM = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */
- CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/
- CU_AD_FORMAT_BC2_UNORM = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */
- CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding*/
- CU_AD_FORMAT_BC3_UNORM = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */
- CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding*/
- CU_AD_FORMAT_BC4_UNORM = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */
- CU_AD_FORMAT_BC4_SNORM = 0x98, /**< 1 channel signed normalized block-compressed (BC4 compression) format */
- CU_AD_FORMAT_BC5_UNORM = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */
- CU_AD_FORMAT_BC5_SNORM = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */
- CU_AD_FORMAT_BC6H_UF16 = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */
- CU_AD_FORMAT_BC6H_SF16 = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */
- CU_AD_FORMAT_BC7_UNORM = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */
- CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */
+ CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */
+ CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
+ CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
+ CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */
+ CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */
+ CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */
+ CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */
+ CU_AD_FORMAT_FLOAT = 0x20, /**< 32-bit floating point */
+ CU_AD_FORMAT_NV12 = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */
+ CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */
+ CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */
+ CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */
+ CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */
+ CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */
+ CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */
+ CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, /**< 1 channel signed 8-bit normalized integer */
+ CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, /**< 2 channel signed 8-bit normalized integer */
+ CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, /**< 4 channel signed 8-bit normalized integer */
+ CU_AD_FORMAT_SNORM_INT16X1 = 0xc9, /**< 1 channel signed 16-bit normalized integer */
+ CU_AD_FORMAT_SNORM_INT16X2 = 0xca, /**< 2 channel signed 16-bit normalized integer */
+ CU_AD_FORMAT_SNORM_INT16X4 = 0xcb, /**< 4 channel signed 16-bit normalized integer */
+ CU_AD_FORMAT_BC1_UNORM = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */
+ CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/
+ CU_AD_FORMAT_BC2_UNORM = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */
+ CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding*/
+ CU_AD_FORMAT_BC3_UNORM = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */
+ CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding*/
+ CU_AD_FORMAT_BC4_UNORM = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */
+ CU_AD_FORMAT_BC4_SNORM = 0x98, /**< 1 channel signed normalized block-compressed (BC4 compression) format */
+ CU_AD_FORMAT_BC5_UNORM = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */
+ CU_AD_FORMAT_BC5_SNORM = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */
+ CU_AD_FORMAT_BC6H_UF16 = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */
+ CU_AD_FORMAT_BC6H_SF16 = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */
+ CU_AD_FORMAT_BC7_UNORM = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */
+ CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */
+ CU_AD_FORMAT_P010 = 0x9f, /**< 10-bit YUV planar format, with 4:2:0 sampling */
+ CU_AD_FORMAT_P016 = 0xa1, /**< 16-bit YUV planar format, with 4:2:0 sampling */
+ CU_AD_FORMAT_NV16 = 0xa2, /**< 8-bit YUV planar format, with 4:2:2 sampling */
+ CU_AD_FORMAT_P210 = 0xa3, /**< 10-bit YUV planar format, with 4:2:2 sampling */
+ CU_AD_FORMAT_P216 = 0xa4, /**< 16-bit YUV planar format, with 4:2:2 sampling */
+ CU_AD_FORMAT_YUY2 = 0xa5, /**< 2 channel, 8-bit YUV packed planar format, with 4:2:2 sampling */
+ CU_AD_FORMAT_Y210 = 0xa6, /**< 2 channel, 10-bit YUV packed planar format, with 4:2:2 sampling */
+ CU_AD_FORMAT_Y216 = 0xa7, /**< 2 channel, 16-bit YUV packed planar format, with 4:2:2 sampling */
+ CU_AD_FORMAT_AYUV = 0xa8, /**< 4 channel, 8-bit YUV packed planar format, with 4:4:4 sampling */
+ CU_AD_FORMAT_Y410 = 0xa9, /**< 10-bit YUV packed planar format, with 4:4:4 sampling */
+ CU_AD_FORMAT_Y416 = 0xb1, /**< 4 channel, 12-bit YUV packed planar format, with 4:4:4 sampling */
+ CU_AD_FORMAT_Y444_PLANAR8 = 0xb2, /**< 3 channel 8-bit YUV planar format, with 4:4:4 sampling */
+ CU_AD_FORMAT_Y444_PLANAR10 = 0xb3, /**< 3 channel 10-bit YUV planar format, with 4:4:4 sampling */
+ CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4, /**< 3 channel 8-bit YUV semi-planar format, with 4:4:4 sampling */
+ CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5, /**< 3 channel 16-bit YUV semi-planar format, with 4:4:4 sampling */
+ CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50, /**< 4 channel unorm R10G10B10A2 RGB format */
+ CU_AD_FORMAT_MAX = 0x7FFFFFFF
  } CUarray_format;
 
  /**
@@ -811,11 +842,17 @@ typedef enum CUdevice_attribute_enum {
  CU_DEVICE_ATTRIBUTE_TENSOR_MAP_ACCESS_SUPPORTED = 127, /**< Device supports accessing memory using Tensor Map. */
  CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED = 128, /**< Device supports exporting memory to a fabric handle with cuMemExportToShareableHandle() or requested with cuMemCreate() */
  CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS = 129, /**< Device supports unified function pointers. */
- CU_DEVICE_ATTRIBUTE_NUMA_CONFIG = 130,
- CU_DEVICE_ATTRIBUTE_NUMA_ID = 131,
+ CU_DEVICE_ATTRIBUTE_NUMA_CONFIG = 130, /**< NUMA configuration of a device: value is of type ::CUdeviceNumaConfig enum */
+ CU_DEVICE_ATTRIBUTE_NUMA_ID = 131, /**< NUMA node ID of the GPU memory */
  CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED = 132, /**< Device supports switch multicast and reduction operations. */
  CU_DEVICE_ATTRIBUTE_MPS_ENABLED = 133, /**< Indicates if contexts created on this device will be shared via MPS */
  CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID = 134, /**< NUMA ID of the host node closest to the device. Returns -1 when system does not support NUMA. */
+ CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED = 135, /**< Device supports CIG with D3D12. */
+ CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK = 136, /**< The returned valued shall be interpreted as a bitmask, where the individual bits are described by the ::CUmemDecompressAlgorithm enum. */
+ CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_MAXIMUM_LENGTH = 137, /**< The returned valued is the maximum length in bytes of a single decompress operation that is allowed. */
+ CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID = 139, /**< The combined 16-bit PCI device ID and 16-bit PCI vendor ID. */
+ CU_DEVICE_ATTRIBUTE_GPU_PCI_SUBSYSTEM_ID = 140, /**< The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID. */
+ CU_DEVICE_ATTRIBUTE_HOST_NUMA_MULTINODE_IPC_SUPPORTED = 143, /**< Device supports HOST_NUMA location IPC between nodes in a multi-node system. */
  CU_DEVICE_ATTRIBUTE_MAX
  } CUdevice_attribute;
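The new attributes are ordinary integer queries through the existing cuDeviceGetAttribute entry point; a small sketch (the bit meanings of the mask come from the ::CUmemDecompressAlgorithm enum, which this diff does not show):

#include <cuda.h>
#include <stdio.h>

int main(void) {
    CUdevice dev;
    int algoMask = 0, maxLen = 0, pciIds = 0;
    cuInit(0);
    cuDeviceGet(&dev, 0);
    /* Bitmask of hardware decompression algorithms the device supports. */
    cuDeviceGetAttribute(&algoMask, CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK, dev);
    /* Largest single decompress operation, in bytes. */
    cuDeviceGetAttribute(&maxLen, CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_MAXIMUM_LENGTH, dev);
    /* 16-bit PCI device ID combined with the 16-bit vendor ID. */
    cuDeviceGetAttribute(&pciIds, CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID, dev);
    printf("decompress mask=0x%x, max op=%d bytes, pci=0x%08x\n", algoMask, maxLen, pciIds);
    return 0;
}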
 
@@ -860,6 +897,7 @@ typedef enum CUpointer_attribute_enum {
  CU_POINTER_ATTRIBUTE_MAPPING_SIZE = 18, /**< Size of the actual underlying mapping that the pointer belongs to **/
  CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR = 19, /**< The start address of the mapping that the pointer belongs to **/
  CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID = 20 /**< A process-wide unique id corresponding to the physical allocation the pointer belongs to **/
+ , CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE = 21 /**< Returns in \p *data a boolean that indicates whether the pointer points to memory that is capable to be used for hardware accelerated decompression. */
  } CUpointer_attribute;
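The new attribute is queried like the existing boolean pointer attributes; returning the boolean as an int is an assumption carried over from attributes such as CU_POINTER_ATTRIBUTE_IS_MANAGED:

/* Sketch: can this allocation feed the hardware decompression engines? */
static int isHwDecompressCapable(CUdeviceptr ptr) {
    int capable = 0;
    cuPointerGetAttribute(&capable, CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE, ptr);
    return capable;
}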
 
  /**
@@ -1449,27 +1487,36 @@ typedef enum CUjit_option_enum
  */
  typedef enum CUjit_target_enum
  {
- CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */
- CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */
- CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */
- CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */
- CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */
- CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */
- CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */
- CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/
- CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/
- CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/
- CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/
- CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2.*/
- CU_TARGET_COMPUTE_75 = 75, /**< Compute device class 7.5.*/
- CU_TARGET_COMPUTE_80 = 80, /**< Compute device class 8.0.*/
- CU_TARGET_COMPUTE_86 = 86, /**< Compute device class 8.6.*/
- CU_TARGET_COMPUTE_87 = 87, /**< Compute device class 8.7.*/
- CU_TARGET_COMPUTE_89 = 89, /**< Compute device class 8.9.*/
- CU_TARGET_COMPUTE_90 = 90, /**< Compute device class 9.0.*/
+ CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */
+ CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */
+ CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */
+ CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */
+ CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */
+ CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */
+ CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */
+ CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/
+ CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/
+ CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/
+ CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/
+ CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2.*/
+ CU_TARGET_COMPUTE_75 = 75, /**< Compute device class 7.5.*/
+ CU_TARGET_COMPUTE_80 = 80, /**< Compute device class 8.0.*/
+ CU_TARGET_COMPUTE_86 = 86, /**< Compute device class 8.6.*/
+ CU_TARGET_COMPUTE_87 = 87, /**< Compute device class 8.7.*/
+ CU_TARGET_COMPUTE_89 = 89, /**< Compute device class 8.9.*/
+ CU_TARGET_COMPUTE_90 = 90, /**< Compute device class 9.0.*/
+ CU_TARGET_COMPUTE_100 = 100, /**< Compute device class 10.0.*/
+ CU_TARGET_COMPUTE_101 = 101, /**< Compute device class 10.1.*/
+ CU_TARGET_COMPUTE_120 = 120, /**< Compute device class 12.0.*/
 
  /**< Compute device class 9.0. with accelerated features.*/
  CU_TARGET_COMPUTE_90A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_90,
+ /**< Compute device class 10.0. with accelerated features.*/
+ CU_TARGET_COMPUTE_100A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_100,
+ /**< Compute device class 10.1 with accelerated features.*/
+ CU_TARGET_COMPUTE_101A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_101,
+ /**< Compute device class 12.0. with accelerated features.*/
+ CU_TARGET_COMPUTE_120A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_120,
  } CUjit_target;
 
  /**
@@ -1585,6 +1632,9 @@ typedef enum CUlimit_enum {
  CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */
  CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint */
  CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x06, /**< A size in bytes for L2 persisting lines cache size */
+ CU_LIMIT_SHMEM_SIZE = 0x07, /**< A maximum size in bytes of shared memory available to CUDA kernels on a CIG context. Can only be queried, cannot be set */
+ CU_LIMIT_CIG_ENABLED = 0x08, /**< A non-zero value indicates this CUDA context is a CIG-enabled context. Can only be queried, cannot be set */
+ CU_LIMIT_CIG_SHMEM_FALLBACK_ENABLED = 0x09, /**< When set to zero, CUDA will fail to launch a kernel on a CIG context, instead of using the fallback path, if the kernel uses more shared memory than available */
  CU_LIMIT_MAX
  } CUlimit;
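Since both CIG limits are query-only, a context can detect CIG mode at runtime with the existing cuCtxGetLimit:

/* Sketch: report whether the current context is CIG-enabled and, if so,
   the shared-memory ceiling kernels must respect on it. */
static void reportCig(void) {
    size_t cigEnabled = 0, shmemLimit = 0;
    cuCtxGetLimit(&cigEnabled, CU_LIMIT_CIG_ENABLED);
    if (cigEnabled) {
        cuCtxGetLimit(&shmemLimit, CU_LIMIT_SHMEM_SIZE);
        /* Kernels requesting more than shmemLimit bytes take the fallback
           path, or fail if CU_LIMIT_CIG_SHMEM_FALLBACK_ENABLED is zero. */
    }
}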
 
@@ -1748,8 +1798,9 @@ typedef struct CUDA_HOST_NODE_PARAMS_v2_st {
  * Conditional node types
  */
  typedef enum CUgraphConditionalNodeType_enum {
- CU_GRAPH_COND_TYPE_IF = 0, /**< Conditional 'if' Node. Body executed once if condition value is non-zero. */
+ CU_GRAPH_COND_TYPE_IF = 0, /**< Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If \p size == 2, an optional ELSE graph is created and this is executed if the condition is zero. */
  CU_GRAPH_COND_TYPE_WHILE = 1, /**< Conditional 'while' Node. Body executed repeatedly while condition value is non-zero. */
+ CU_GRAPH_COND_TYPE_SWITCH = 2, /**< Conditional 'switch' Node. Body[n] is executed once, where 'n' is the value of the condition. If the condition does not match a body index, no body is launched. */
  } CUgraphConditionalNodeType;
 
  /**
@@ -1760,7 +1811,8 @@ typedef struct CUDA_CONDITIONAL_NODE_PARAMS {
  Handles must be created in advance of creating the node
  using ::cuGraphConditionalHandleCreate. */
  CUgraphConditionalNodeType type; /**< Type of conditional node. */
- unsigned int size; /**< Size of graph output array. Must be 1. */
+ unsigned int size; /**< Size of graph output array. Allowed values are 1 for CU_GRAPH_COND_TYPE_WHILE, 1 or 2
+ for CU_GRAPH_COND_TYPE_IF, or any value greater than zero for CU_GRAPH_COND_TYPE_SWITCH. */
  CUgraph *phGraph_out; /**< CUDA-owned array populated with conditional node child graphs during creation of the node.
  Valid for the lifetime of the conditional node.
  The contents of the graph(s) are subject to the following constraints:
@@ -1770,7 +1822,17 @@ typedef struct CUDA_CONDITIONAL_NODE_PARAMS {
  - All kernels, including kernels in nested conditionals or child graphs at any level,
  must belong to the same CUDA context.
 
- These graphs may be populated using graph node creation APIs or ::cuStreamBeginCaptureToGraph. */
+ These graphs may be populated using graph node creation APIs or ::cuStreamBeginCaptureToGraph.
+
+ CU_GRAPH_COND_TYPE_IF:
+ phGraph_out[0] is executed when the condition is non-zero. If \p size == 2, phGraph_out[1] will
+ be executed when the condition is zero.
+ CU_GRAPH_COND_TYPE_WHILE:
+ phGraph_out[0] is executed as long as the condition is non-zero.
+ CU_GRAPH_COND_TYPE_SWITCH:
+ phGraph_out[n] is executed when the condition is equal to n. If the condition >= \p size,
+ no body graph is executed.
+ */
  CUcontext ctx; /**< Context on which to run the node. Must match context used to create the handle and all body nodes. */
  } CUDA_CONDITIONAL_NODE_PARAMS;
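A sketch of a three-way 'switch' node built on these params, assuming the generic cuGraphAddNode path and the `conditional` member of CUgraphNodeParams (neither is shown in this hunk):

/* Sketch: add a switch conditional node with three body graphs. */
static void addSwitchNode(CUgraph graph, CUcontext ctx) {
    CUgraphConditionalHandle handle;
    /* Default condition value 0 selects phGraph_out[0] unless device code
       overrides it via cudaGraphSetConditional(). */
    cuGraphConditionalHandleCreate(&handle, graph, ctx, 0, CU_GRAPH_COND_ASSIGN_DEFAULT);

    CUgraphNodeParams params = {0};
    params.type = CU_GRAPH_NODE_TYPE_CONDITIONAL;
    params.conditional.handle = handle;
    params.conditional.type = CU_GRAPH_COND_TYPE_SWITCH;
    params.conditional.size = 3;   /* values 0..2 pick a body; >= 3 runs nothing */
    params.conditional.ctx = ctx;

    CUgraphNode node;
    cuGraphAddNode(&node, graph, NULL, 0, &params);
    /* params.conditional.phGraph_out[0..2] can now be populated with the
       per-case body graphs. */
}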
 
@@ -1790,23 +1852,22 @@ typedef enum CUgraphNodeType_enum {
  CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT = 9, /**< External semaphore wait node */
  CU_GRAPH_NODE_TYPE_MEM_ALLOC = 10,/**< Memory Allocation Node */
  CU_GRAPH_NODE_TYPE_MEM_FREE = 11,/**< Memory Free Node */
- CU_GRAPH_NODE_TYPE_BATCH_MEM_OP = 12 /**< Batch MemOp Node */
- ,
+ CU_GRAPH_NODE_TYPE_BATCH_MEM_OP = 12,/**< Batch MemOp Node */
  CU_GRAPH_NODE_TYPE_CONDITIONAL = 13 /**< Conditional Node
-
+
  May be used to implement a conditional execution path or loop
  inside of a graph. The graph(s) contained within the body of the conditional node
  can be selectively executed or iterated upon based on the value of a conditional
  variable.
-
+
  Handles must be created in advance of creating the node
  using ::cuGraphConditionalHandleCreate.
-
+
  The following restrictions apply to graphs which contain conditional nodes:
  The graph cannot be used in a child node.
  Only one instantiation of the graph may exist at any point in time.
  The graph cannot be cloned.
-
+
  To set the control value, supply a default value when creating the handle and/or
  call ::cudaGraphSetConditional from device code.*/
  } CUgraphNodeType;
@@ -1878,7 +1939,8 @@ typedef enum CUgraphInstantiateResult_enum
  CUDA_GRAPH_INSTANTIATE_ERROR = 1, /**< Instantiation failed for an unexpected reason which is described in the return value of the function */
  CUDA_GRAPH_INSTANTIATE_INVALID_STRUCTURE = 2, /**< Instantiation failed due to invalid structure, such as cycles */
  CUDA_GRAPH_INSTANTIATE_NODE_OPERATION_NOT_SUPPORTED = 3, /**< Instantiation for device launch failed because the graph contained an unsupported operation */
- CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED = 4 /**< Instantiation for device launch failed due to the nodes belonging to different contexts */
+ CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED = 4, /**< Instantiation for device launch failed due to the nodes belonging to different contexts */
+ CUDA_GRAPH_INSTANTIATE_CONDITIONAL_HANDLE_UNUSED = 5, /**< One or more conditional handles are not associated with conditional nodes */
  } CUgraphInstantiateResult;
 
  /**
@@ -2004,6 +2066,42 @@ typedef enum CUlaunchAttributeID_enum {
  ::CUlaunchAttributeValue::memSyncDomainMap. */
  , CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN = 10 /**< Valid for streams, graph nodes, launches. See
  ::CUlaunchAttributeValue::memSyncDomain. */
+ , CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION = 11 /**< Valid for graph nodes, launches. Set
+ ::CUlaunchAttributeValue::preferredClusterDim
+ to allow the kernel launch to specify a preferred substitute
+ cluster dimension. Blocks may be grouped according to either
+ the dimensions specified with this attribute (grouped into a
+ "preferred substitute cluster"), or the one specified with
+ ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION attribute (grouped
+ into a "regular cluster"). The cluster dimensions of a
+ "preferred substitute cluster" shall be an integer multiple
+ greater than zero of the regular cluster dimensions. The
+ device will attempt - on a best-effort basis - to group
+ thread blocks into preferred clusters over grouping them
+ into regular clusters. When it deems necessary (primarily
+ when the device temporarily runs out of physical resources
+ to launch the larger preferred clusters), the device may
+ switch to launch the regular clusters instead to attempt to
+ utilize as much of the physical device resources as possible.
+ <br>
+ Each type of cluster will have its enumeration / coordinate
+ setup as if the grid consists solely of its type of cluster.
+ For example, if the preferred substitute cluster dimensions
+ double the regular cluster dimensions, there might be
+ simultaneously a regular cluster indexed at (1,0,0), and a
+ preferred cluster indexed at (1,0,0). In this example, the
+ preferred substitute cluster (1,0,0) replaces regular
+ clusters (2,0,0) and (3,0,0) and groups their blocks.
+ <br>
+ This attribute will only take effect when a regular cluster
+ dimension has been specified. The preferred substitute
+ cluster dimension must be an integer multiple greater than
+ zero of the regular cluster dimension and must divide the
+ grid. It must also be no more than `maxBlocksPerCluster`, if
+ it is set in the kernel's `__launch_bounds__`. Otherwise it
+ must be less than the maximum value the driver can support.
+ Otherwise, setting this attribute to a value physically
+ unable to fit on any particular device is permitted. */
  , CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT = 12 /**< Valid for launches. Set
  ::CUlaunchAttributeValue::launchCompletionEvent to record the
  event.
@@ -2054,7 +2152,14 @@ typedef enum CUlaunchAttributeID_enum {
  from within the graph, the graph must be uploaded with ::cuGraphUpload before it
  is launched. For such a graph, if host-side executable graph updates are made to the
  device-updatable nodes, the graph must be uploaded before it is launched again. */
- #ifdef __CUDA_API_VERSION_INTERNAL
+ , CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 14 /**< Valid for launches. On devices where the L1 cache and shared memory use the
+ same hardware resources, setting ::CUlaunchAttributeValue::sharedMemCarveout to a
+ percentage between 0-100 signals the CUDA driver to set the shared memory carveout
+ preference, in percent of the total shared memory for that kernel launch.
+ This attribute takes precedence over ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT.
+ This is only a hint, and the CUDA driver can choose a different configuration if
+ required for the launch. */
+ #if defined(__CUDA_API_VERSION_INTERNAL) && !defined(__CUDA_API_VERSION_INTERNAL_ODR)
  , CU_LAUNCH_ATTRIBUTE_MAX
  #endif
  } CUlaunchAttributeID;
@@ -2092,27 +2197,64 @@ typedef union CUlaunchAttributeValue_union {
  scheduling policy preference for the kernel. */
  int programmaticStreamSerializationAllowed; /**< Value of launch attribute
  ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION. */
+ /**
+ * Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
+ * with the following fields:
+ * - \p CUevent event - Event to fire when all blocks trigger it.
+ * - \p Event record flags, see ::cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
+ * - \p triggerAtBlockStart - If this is set to non-0, each block launch will automatically trigger the event.
+ */
  struct {
- CUevent event; /**< Event to fire when all blocks trigger it */
- int flags; /**< Event record flags, see ::cuEventRecordWithFlags. Does not accept
- ::CU_EVENT_RECORD_EXTERNAL. */
- int triggerAtBlockStart; /**< If this is set to non-0, each block launch will automatically trigger the event */
- } programmaticEvent; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT. */
+ CUevent event;
+ int flags;
+ int triggerAtBlockStart;
+ } programmaticEvent;
+ /**
+ * Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT
+ * with the following fields:
+ * - \p CUevent event - Event to fire when the last block launches
+ * - \p int flags; - Event record flags, see ::cuEventRecordWithFlags. Does not accept ::CU_EVENT_RECORD_EXTERNAL.
+ */
  struct {
- CUevent event; /**< Event to fire when the last block launches */
- int flags; /**< Event record flags, see ::cuEventRecordWithFlags. Does not accept ::CU_EVENT_RECORD_EXTERNAL. */
- } launchCompletionEvent; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT. */
+ CUevent event;
+ int flags;
+ } launchCompletionEvent;
  int priority; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PRIORITY. Execution priority of the kernel. */
  CUlaunchMemSyncDomainMap memSyncDomainMap; /**< Value of launch attribute
  ::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP. See
  ::CUlaunchMemSyncDomainMap. */
  CUlaunchMemSyncDomain memSyncDomain; /**< Value of launch attribute
  ::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN. See::CUlaunchMemSyncDomain */
+ /**
+ * Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION
+ * that represents the desired preferred cluster dimensions for the kernel.
+ * Opaque type with the following fields:
+ * - \p x - The X dimension of the preferred cluster, in blocks. Must
+ * be a divisor of the grid X dimension, and must be a
+ * multiple of the \p x field of ::CUlaunchAttributeValue::clusterDim.
+ * - \p y - The Y dimension of the preferred cluster, in blocks. Must
+ * be a divisor of the grid Y dimension, and must be a
+ * multiple of the \p y field of ::CUlaunchAttributeValue::clusterDim.
+ * - \p z - The Z dimension of the preferred cluster, in blocks. Must be
+ * equal to the \p z field of ::CUlaunchAttributeValue::clusterDim.
+ */
+ struct {
+ unsigned int x;
+ unsigned int y;
+ unsigned int z;
+ } preferredClusterDim;
 
+ /**
+ * Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE.
+ * with the following fields:
+ * - \p int deviceUpdatable - Whether or not the resulting kernel node should be device-updatable.
+ * - \p CUgraphDeviceNode devNode - Returns a handle to pass to the various device-side update functions.
+ */
  struct {
- int deviceUpdatable; /**< Whether or not the resulting kernel node should be device-updatable. */
- CUgraphDeviceNode devNode; /**< Returns a handle to pass to the various device-side update functions. */
- } deviceUpdatableKernelNode; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. */
+ int deviceUpdatable;
+ CUgraphDeviceNode devNode;
+ } deviceUpdatableKernelNode;
+ unsigned int sharedMemCarveout; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT. */
  } CUlaunchAttributeValue;
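A launch-side sketch combining the regular and preferred cluster attributes through the existing CUDA 12 extensible-launch API (cuLaunchKernelEx and CUlaunchConfig):

/* Sketch: regular clusters of 2 blocks, preferred substitute clusters of 4. */
static void launchWithPreferredClusters(CUfunction f, CUstream s, void **args) {
    CUlaunchAttribute attrs[2];
    attrs[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
    attrs[0].value.clusterDim.x = 2;
    attrs[0].value.clusterDim.y = 1;
    attrs[0].value.clusterDim.z = 1;
    attrs[1].id = CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION;
    attrs[1].value.preferredClusterDim.x = 4; /* multiple of clusterDim.x, divides gridDimX */
    attrs[1].value.preferredClusterDim.y = 1; /* multiple of clusterDim.y */
    attrs[1].value.preferredClusterDim.z = 1; /* must equal clusterDim.z */

    CUlaunchConfig cfg = {0};
    cfg.gridDimX = 64;  cfg.gridDimY = 1;  cfg.gridDimZ = 1; /* divisible by both cluster sizes */
    cfg.blockDimX = 128; cfg.blockDimY = 1; cfg.blockDimZ = 1;
    cfg.hStream = s;
    cfg.attrs = attrs;
    cfg.numAttrs = 2;
    cuLaunchKernelEx(&cfg, f, args, NULL);
}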
 
  /**
@@ -2148,7 +2290,9 @@ typedef CUlaunchAttributeID CUkernelNodeAttrID;
  #define CU_KERNEL_NODE_ATTRIBUTE_PRIORITY CU_LAUNCH_ATTRIBUTE_PRIORITY
  #define CU_KERNEL_NODE_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP
  #define CU_KERNEL_NODE_ATTRIBUTE_MEM_SYNC_DOMAIN CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN
+ #define CU_KERNEL_NODE_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION
  #define CU_KERNEL_NODE_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE
+ #define CU_KERNEL_NODE_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
 
  typedef CUlaunchAttributeValue CUkernelNodeAttrValue_v1;
  typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue;
@@ -2231,6 +2375,29 @@ typedef struct CUexecAffinityParam_st {
  */
  typedef CUexecAffinityParam_v1 CUexecAffinityParam;
 
+ typedef enum CUcigDataType_enum {
+ CIG_DATA_TYPE_D3D12_COMMAND_QUEUE = 0x1, /** D3D12 Command Queue Handle */
+ } CUcigDataType;
+
+ /**
+ * CIG Context Create Params
+ */
+ typedef struct CUctxCigParam_st {
+ CUcigDataType sharedDataType;
+ void* sharedData;
+ } CUctxCigParam;
+
+ /**
+ * Params for creating CUDA context
+ * Exactly one of execAffinityParams and cigParams
+ * must be non-NULL.
+ */
+ typedef struct CUctxCreateParams_st {
+ CUexecAffinityParam *execAffinityParams;
+ int numExecAffinityParams;
+ CUctxCigParam *cigParams;
+ } CUctxCreateParams;
+
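These params feed cuCtxCreate_v4, documented near the end of this diff. A sketch for the D3D12 case; the (params, flags, dev) ordering of cuCtxCreate_v4 and the `queue` handle are assumptions:

/* Sketch: create a CIG context that shares a D3D12 command queue.
   `queue` is a hypothetical ID3D12CommandQueue* obtained from the graphics
   side; exactly one of cigParams/execAffinityParams may be non-NULL. */
static CUresult makeCigContext(CUdevice dev, void *queue, CUcontext *pctx) {
    CUctxCigParam cig;
    cig.sharedDataType = CIG_DATA_TYPE_D3D12_COMMAND_QUEUE;
    cig.sharedData = queue;

    CUctxCreateParams params = {0};
    params.cigParams = &cig;       /* execAffinityParams stays NULL */
    return cuCtxCreate_v4(pctx, &params, 0, dev);
}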
  /**
  * Library options to be specified with ::cuLibraryLoadData() or ::cuLibraryLoadFromFile()
  */
@@ -2502,6 +2669,17 @@ typedef enum cudaError_enum {
  */
  CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC = 225,
 
+ /**
+ * This indicates that an exception occurred on the device that is now
+ * contained by the GPU's error containment capability. Common causes are -
+ * a. Certain types of invalid accesses of peer GPU memory over nvlink
+ * b. Certain classes of hardware errors
+ * This leaves the process in an inconsistent state and any further CUDA
+ * work will return the same error. To continue using CUDA, the process must
+ * be terminated and relaunched.
+ */
+ CUDA_ERROR_CONTAINED = 226,
+
  /**
  * This indicates that the device kernel source is invalid. This includes
  * compilation/linker errors encountered in device code or user error.
@@ -2718,6 +2896,14 @@ typedef enum cudaError_enum {
  */
  CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720,
 
+ /**
+ * An exception occurred on the device while exiting a kernel using tensor memory: the
+ * tensor memory was not completely deallocated. This leaves the process in an inconsistent
+ * state and any further CUDA work will return the same error. To continue using CUDA, the
+ * process must be terminated and relaunched.
+ */
+ CUDA_ERROR_TENSOR_MEMORY_LEAK = 721,
+
  /**
  * This error indicates that the attempted operation is not permitted.
  */
@@ -2894,6 +3080,12 @@ typedef enum cudaError_enum {
  */
  CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION = 915,
 
+ /**
+ * This error indicates that an error happened during the key rotation
+ * sequence.
+ */
+ CUDA_ERROR_KEY_ROTATION = 916,
+
  /**
  * This indicates that an unknown internal error has occurred.
  */
@@ -3307,7 +3499,10 @@ typedef enum CUtensorMapDataType_enum {
  CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
  CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,
  CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,
- CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ
+ CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ,
+ CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B,
+ CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B,
+ CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B
  } CUtensorMapDataType;
 
  /**
@@ -3327,6 +3522,9 @@ typedef enum CUtensorMapSwizzle_enum {
  CU_TENSOR_MAP_SWIZZLE_32B,
  CU_TENSOR_MAP_SWIZZLE_64B,
  CU_TENSOR_MAP_SWIZZLE_128B,
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B,
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B,
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B
  } CUtensorMapSwizzle;
 
  /**
@@ -3347,6 +3545,14 @@ typedef enum CUtensorMapFloatOOBfill_enum {
  CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
  } CUtensorMapFloatOOBfill;
 
+ /**
+ * Tensor map Im2Col wide mode
+ */
+ typedef enum CUtensorMapIm2ColWideMode_enum {
+ CU_TENSOR_MAP_IM2COL_WIDE_MODE_W = 0,
+ CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128
+ } CUtensorMapIm2ColWideMode;
+
  /**
  * GPU Direct v3 tokens
  */
@@ -3418,7 +3624,7 @@ typedef enum CUexternalMemoryHandleType_enum {
  /**
  * Handle is an NvSciBuf object
  */
- CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8
+ CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8,
  } CUexternalMemoryHandleType;
 
  /**
@@ -3862,6 +4068,13 @@ typedef enum CUmemRangeHandleType_enum
  CU_MEM_RANGE_HANDLE_TYPE_MAX = 0x7FFFFFFF
  } CUmemRangeHandleType;
 
+ /**
+ * Flag for requesting handle type for address range.
+ */
+ typedef enum CUmemRangeFlags_enum {
+ CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE = 0x1 /**< Indicates that DMA_BUF handle should be mapped via PCIe BAR1 */
+ } CUmemRangeFlags;
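The flag plugs into the existing address-range handle export; a sketch, assuming the pre-existing CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD handle type on Linux:

/* Sketch: export a dma_buf fd for [dptr, dptr+size), forcing the PCIe
   BAR1 mapping path with the new flag. Returns -1 on failure. */
static int exportDmaBufPcie(CUdeviceptr dptr, size_t size) {
    int fd = -1;
    cuMemGetHandleForAddressRange(&fd, dptr, size,
                                  CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
                                  CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
    return fd;
}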
+
  /**
  * Sparse subresource types
  */
@@ -3951,6 +4164,11 @@ typedef enum CUmemAllocationCompType_enum {
  * This flag if set indicates that the memory will be used as a tile pool.
  */
  #define CU_MEM_CREATE_USAGE_TILE_POOL 0x1
+ /**
+ * This flag, if set, indicates that the memory will be used as a buffer for
+ * hardware accelerated decompression.
+ */
+ #define CU_MEM_CREATE_USAGE_HW_DECOMPRESS 0x2
 
  /**
  * Specifies the allocation properties for a allocation.
@@ -4137,6 +4355,12 @@ typedef enum CUmemPool_attribute_enum {
  CU_MEMPOOL_ATTR_USED_MEM_HIGH
  } CUmemPool_attribute;
 
+ /**
+ * This flag, if set, indicates that the memory will be used as a buffer for
+ * hardware accelerated decompression.
+ */
+ #define CU_MEM_POOL_CREATE_USAGE_HW_DECOMPRESS 0x2
+
  /**
  * Specifies the properties of allocations made from the pool.
  */
@@ -4152,7 +4376,8 @@ typedef struct CUmemPoolProps_st {
  */
  void *win32SecurityAttributes;
  size_t maxSize; /**< Maximum pool size. When set to 0, defaults to a system dependent value. */
- unsigned char reserved[56]; /**< reserved for future use, must be 0 */
+ unsigned short usage; /**< Bitmask indicating intended usage for the pool. */
+ unsigned char reserved[54]; /**< reserved for future use, must be 0 */
  } CUmemPoolProps_v1;
  typedef CUmemPoolProps_v1 CUmemPoolProps;
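The new `usage` field lines up with the pool-level decompress flag defined a few lines above; a sketch using the existing cuMemPoolCreate:

/* Sketch: a device-local memory pool whose allocations may be used as
   hardware-decompression buffers. */
static CUresult makeDecompressPool(int deviceOrdinal, CUmemoryPool *pool) {
    CUmemPoolProps props = {0};   /* reserved[] must stay zeroed */
    props.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
    props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    props.location.id = deviceOrdinal;
    props.usage = CU_MEM_POOL_CREATE_USAGE_HW_DECOMPRESS;
    return cuMemPoolCreate(pool, &props);
}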
 
@@ -4350,6 +4575,12 @@ typedef struct CUgraphNodeParams_st {
  */
  #define CUDA_ARRAY3D_DEFERRED_MAPPING 0x80
 
+ /**
+ * This flag indicates that the CUDA array will be used for hardware accelerated
+ * video encode/decode operations.
+ */
+ #define CUDA_ARRAY3D_VIDEO_ENCODE_DECODE 0x100
+
  /**
  * Override the texref format with a format inferred from the array.
  * Flag for ::cuTexRefSetArray()
@@ -4494,9 +4725,9 @@ typedef enum CUgraphDebugDot_flags_enum {
  CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES = 1<<10, /**< Adds node handles and every kernel function handle to output */
  CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS = 1<<11, /**< Adds memory alloc node parameters to output */
  CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS = 1<<12, /**< Adds memory free node parameters to output */
- CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS = 1<<13 /**< Adds batch mem op node parameters to output */
- , CU_GRAPH_DEBUG_DOT_FLAGS_EXTRA_TOPO_INFO = 1<<14 /**< Adds edge numbering information */
- , CU_GRAPH_DEBUG_DOT_FLAGS_CONDITIONAL_NODE_PARAMS = 1<<15 /**< Adds conditional node parameters to output */
+ CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS = 1<<13, /**< Adds batch mem op node parameters to output */
+ CU_GRAPH_DEBUG_DOT_FLAGS_EXTRA_TOPO_INFO = 1<<14, /**< Adds edge numbering information */
+ CU_GRAPH_DEBUG_DOT_FLAGS_CONDITIONAL_NODE_PARAMS = 1<<15 /**< Adds conditional node parameters to output */
  } CUgraphDebugDot_flags;
 
  /**
@@ -4528,11 +4759,180 @@ typedef enum CUgraphInstantiate_flags_enum {
  priority of the stream it is launched into. */
  } CUgraphInstantiate_flags;
 
+ /**
+ * CUDA device NUMA configuration
+ */
  typedef enum CUdeviceNumaConfig_enum {
  CU_DEVICE_NUMA_CONFIG_NONE = 0, /**< The GPU is not a NUMA node */
  CU_DEVICE_NUMA_CONFIG_NUMA_NODE, /**< The GPU is a NUMA node, CU_DEVICE_ATTRIBUTE_NUMA_ID contains its NUMA ID */
  } CUdeviceNumaConfig;
 
+ /**
+ * CUDA Process States
+ */
+ typedef enum CUprocessState_enum {
+ CU_PROCESS_STATE_RUNNING = 0, /**< Default process state */
+ CU_PROCESS_STATE_LOCKED, /**< CUDA API locks are taken so further CUDA API calls will block */
+ CU_PROCESS_STATE_CHECKPOINTED, /**< Application memory contents have been checkpointed and underlying allocations and device handles have been released */
+ CU_PROCESS_STATE_FAILED, /**< Application entered an uncorrectable error during the checkpoint/restore process */
+ } CUprocessState;
+
+ /**
+ * CUDA checkpoint optional lock arguments
+ */
+ typedef struct CUcheckpointLockArgs_st {
+ unsigned int timeoutMs; /**< Timeout in milliseconds to attempt to lock the process, 0 indicates no timeout */
+ unsigned int reserved0; /**< Reserved for future use, must be zero */
+ cuuint64_t reserved1[7]; /**< Reserved for future use, must be zeroed */
+ } CUcheckpointLockArgs;
+
+ /**
+ * CUDA checkpoint optional checkpoint arguments
+ */
+ typedef struct CUcheckpointCheckpointArgs_st {
+ cuuint64_t reserved[8]; /**< Reserved for future use, must be zeroed */
+ } CUcheckpointCheckpointArgs;
+
+ /**
+ * CUDA checkpoint optional restore arguments
+ */
+ typedef struct CUcheckpointRestoreArgs_st {
+ cuuint64_t reserved[8]; /**< Reserved for future use, must be zeroed */
+ } CUcheckpointRestoreArgs;
+
+ /**
+ * CUDA checkpoint optional unlock arguments
+ */
+ typedef struct CUcheckpointUnlockArgs_st {
+ cuuint64_t reserved[8]; /**< Reserved for future use, must be zeroed */
+ } CUcheckpointUnlockArgs;
+
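These argument structs parameterize the CUDA 12.8 process-checkpoint entry points (cuCheckpointProcessLock and friends); their existence and the (pid, args) shape are assumptions, since the functions themselves are not part of this hunk. A sketch of the state machine they drive:

/* Sketch: checkpoint/restore another CUDA process from a supervisor.
   State moves RUNNING -> LOCKED -> CHECKPOINTED -> (restore) -> RUNNING. */
static void checkpointRoundTrip(int pid) {
    CUcheckpointLockArgs lock = {0};
    lock.timeoutMs = 5000;                    /* give in-flight API calls 5 s */
    cuCheckpointProcessLock(pid, &lock);      /* -> CU_PROCESS_STATE_LOCKED */
    cuCheckpointProcessCheckpoint(pid, NULL); /* -> CU_PROCESS_STATE_CHECKPOINTED */
    /* ... device memory is released here; migrate or suspend the process ... */
    cuCheckpointProcessRestore(pid, NULL);
    cuCheckpointProcessUnlock(pid, NULL);     /* -> CU_PROCESS_STATE_RUNNING */
}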
4809
+
4810
+ /**
4811
+ * Flags to specify for copies within a batch. For more details see ::cuMemcpyBatchAsync.
4812
+ */
4813
+ typedef enum CUmemcpyFlags_enum {
4814
+ CU_MEMCPY_FLAG_DEFAULT = 0x0,
4815
+
4816
+ /**
4817
+ * Hint to the driver to try and overlap the copy with compute work on the SMs.
4818
+ */
4819
+ CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE = 0x1
4820
+ } CUmemcpyFlags;
4821
+
4822
+ /**
4823
+ * These flags allow applications to convey the source access ordering CUDA must maintain.
4824
+ * The destination will always be accessed in stream order.
4825
+ */
4826
+ typedef enum CUmemcpySrcAccessOrder_enum {
4827
+ /**
4828
+ * Default invalid.
4829
+ */
4830
+ CU_MEMCPY_SRC_ACCESS_ORDER_INVALID = 0x0,
4831
+
4832
+ /**
4833
+ * Indicates that access to the source pointer must be in stream order.
4834
+ */
4835
+ CU_MEMCPY_SRC_ACCESS_ORDER_STREAM = 0x1,
4836
+
4837
+ /**
4838
+ * Indicates that access to the source pointer can be out of stream order and
4839
+ * all accesses must be complete before the API call returns. This flag is suited for
4840
+ * ephemeral sources (ex., stack variables) when it's known that no prior operations
4841
+ * in the stream can be accessing the memory and also that the lifetime of the memory
4842
+ * is limited to the scope that the source variable was declared in. Specifying
4843
+ * this flag allows the driver to optimize the copy and removes the need for the user
4844
+ * to synchronize the stream after the API call.
4845
+ */
4846
+ CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL = 0x2,
4847
+
4848
+ /**
4849
+ * Indicates that access to the source pointer can be out of stream order and the accesses
4850
+ * can happen even after the API call returns. This flag is suited for host pointers
4851
+ * allocated outside CUDA (ex., via malloc) when it's known that no prior operations
4852
+ * in the stream can be accessing the memory. Specifying this flag allows the driver
4853
+ * to optimize the copy on certain platforms.
4854
+ */
4855
+ CU_MEMCPY_SRC_ACCESS_ORDER_ANY = 0x3,
4856
+
4857
+ CU_MEMCPY_SRC_ACCESS_ORDER_MAX = 0x7FFFFFFF
4858
+ } CUmemcpySrcAccessOrder;
4859
+
4860
+ /**
4861
+ * Attributes specific to copies within a batch. For more details on usage see ::cuMemcpyBatchAsync.
4862
+ */
4863
+ typedef struct CUmemcpyAttributes_st {
4864
+ CUmemcpySrcAccessOrder srcAccessOrder; /**< Source access ordering to be observed for copies with this attribute. */
4865
+ CUmemLocation srcLocHint; /**< Hint location for the source operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
4866
+ CUmemLocation dstLocHint; /**< Hint location for the destination operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
4867
+ unsigned int flags; /**< Additional flags for copies with this attribute. See ::CUmemcpyFlags */
4868
+ } CUmemcpyAttributes_v1;
4869
+ typedef CUmemcpyAttributes_v1 CUmemcpyAttributes;
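Only the attribute struct above is definitive; a sketch of how it would parameterize cuMemcpyBatchAsync, whose exact argument list is an assumption here:

/* Sketch: batch of `count` copies from ephemeral host staging buffers.
   DURING_API_CALL lets the driver read the sources out of stream order,
   as long as all reads finish before the call returns. */
static void batchCopy(CUdeviceptr *dsts, CUdeviceptr *srcs, size_t *sizes,
                      size_t count, CUstream stream) {
    CUmemcpyAttributes attr = {0};
    attr.srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL;
    attr.flags = CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE;
    size_t attrIdx = 0;   /* attr applies from copy index 0 onward */
    size_t failIdx = 0;   /* on error, index of the failing copy */
    cuMemcpyBatchAsync(dsts, srcs, sizes, count,
                       &attr, &attrIdx, 1, &failIdx, stream);
}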
4870
+
4871
+ /**
4872
+ * These flags allow applications to convey the operand type for individual copies specified in ::cuMemcpy3DBatchAsync.
4873
+ */
4874
+ typedef enum CUmemcpy3DOperandType_enum {
4875
+ CU_MEMCPY_OPERAND_TYPE_POINTER = 0x1, /**< Memcpy operand is a valid pointer. */
4876
+ CU_MEMCPY_OPERAND_TYPE_ARRAY = 0x2, /**< Memcpy operand is a CUarray. */
4877
+ CU_MEMCPY_OPERAND_TYPE_MAX = 0x7FFFFFFF
4878
+ } CUmemcpy3DOperandType;
4879
+
4880
+ /**
4881
+ * Struct representing offset into a CUarray in elements
4882
+ */
4883
+ typedef struct CUoffset3D_st {
4884
+ size_t x;
4885
+ size_t y;
4886
+ size_t z;
4887
+ } CUoffset3D_v1;
4888
+ typedef CUoffset3D_v1 CUoffset3D;
4889
+
4890
+ /**
4891
+ * Struct representing width/height/depth of a CUarray in elements
4892
+ */
4893
+ typedef struct CUextent3D_st {
4894
+ size_t width;
4895
+ size_t height;
4896
+ size_t depth;
4897
+ } CUextent3D_v1;
4898
+ typedef CUextent3D_v1 CUextent3D;
4899
+
4900
+ /**
4901
+ * Struct representing an operand for copy with ::cuMemcpy3DBatchAsync
4902
+ */
4903
+ typedef struct CUmemcpy3DOperand_st {
4904
+ CUmemcpy3DOperandType type;
4905
+ union {
4906
+ /**
4907
+ * Struct representing an operand when ::CUmemcpy3DOperand::type is ::CU_MEMCPY_OPERAND_TYPE_POINTER
4908
+ */
4909
+ struct {
4910
+ CUdeviceptr ptr;
4911
+ size_t rowLength; /**< Length of each row in elements. */
4912
+ size_t layerHeight; /**< Height of each layer in elements. */
4913
+ CUmemLocation locHint; /**< Hint location for the operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
4914
+ } ptr;
4915
+
4916
+ /**
4917
+ * Struct representing an operand when ::CUmemcpy3DOperand::type is ::CU_MEMCPY_OPERAND_TYPE_ARRAY
4918
+ */
4919
+ struct {
4920
+ CUarray array;
4921
+ CUoffset3D offset;
4922
+ } array;
4923
+ } op;
4924
+ } CUmemcpy3DOperand_v1;
4925
+ typedef CUmemcpy3DOperand_v1 CUmemcpy3DOperand;
4926
+
4927
+ typedef struct CUDA_MEMCPY3D_BATCH_OP_st {
4928
+ CUmemcpy3DOperand src; /**< Source memcpy operand. */
4929
+ CUmemcpy3DOperand dst; /**< Destination memcpy operand. */
4930
+ CUextent3D extent; /**< Extents of the memcpy between src and dst. The width, height and depth components must not be 0.*/
4931
+ CUmemcpySrcAccessOrder srcAccessOrder; /**< Source access ordering to be observed for copy from src to dst. */
4932
+ unsigned int flags; /**< Additional flags for copies with this attribute. See ::CUmemcpyFlags */
4933
+ } CUDA_MEMCPY3D_BATCH_OP_v1;
4934
+ typedef CUDA_MEMCPY3D_BATCH_OP_v1 CUDA_MEMCPY3D_BATCH_OP;
4935
+
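The struct layout above is verbatim; the (numOps, opList, failIdx, flags, stream) ordering of cuMemcpy3DBatchAsync used below is an assumption:

/* Sketch: one linear-pointer -> CUarray copy expressed as a batch op. */
static void copyPtrToArray(CUdeviceptr src, CUarray dst,
                           size_t width, size_t height, CUstream stream) {
    CUDA_MEMCPY3D_BATCH_OP op = {0};
    op.src.type = CU_MEMCPY_OPERAND_TYPE_POINTER;
    op.src.op.ptr.ptr = src;
    op.src.op.ptr.rowLength = width;    /* elements per row */
    op.src.op.ptr.layerHeight = height; /* rows per layer */
    op.dst.type = CU_MEMCPY_OPERAND_TYPE_ARRAY;
    op.dst.op.array.array = dst;        /* offset stays (0,0,0) */
    op.extent.width = width;
    op.extent.height = height;
    op.extent.depth = 1;                /* extents must be non-zero */
    op.srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;

    size_t failIdx = 0;
    cuMemcpy3DBatchAsync(1, &op, &failIdx, 0ULL, stream);
}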
  /** @} */ /* END CUDA_TYPES */
 
  #if defined(__GNUC__)
@@ -5124,6 +5524,12 @@ CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements,
  * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here.
  * - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC
  * - ::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED: Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays.
+ * - ::CU_DEVICE_ATTRIBUTE_NUMA_CONFIG: NUMA configuration of a device: value is of type ::CUdeviceNumaConfig enum
+ * - ::CU_DEVICE_ATTRIBUTE_NUMA_ID: NUMA node ID of the GPU memory
+ * - ::CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED: Device supports switch multicast and reduction operations.
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID: The combined 16-bit PCI device ID and 16-bit PCI vendor ID.
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_PCI_SUBSYSTEM_ID: The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor
+ * ID.
  *
  * \param pi - Returned device attribute value
  * \param attrib - Device attribute to query
  * \param attrib - Device attribute to query
@@ -5310,6 +5716,15 @@ CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type
5310
5716
  * determined by comparing the numerical values between the two enums, with
5311
5717
  * smaller scopes having smaller values.
5312
5718
  *
5719
+ * On platforms that support GPUDirect RDMA writes via more than one path in
5720
+ * hardware (see ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE), the user should
5721
+ * consider those paths as belonging to separate ordering domains. Note that in
5722
+ * such cases the CUDA driver will report both RDMA writes ordering and RDMA write
5723
+ * scope as ALL_DEVICES and a call to ::cuFlushGPUDirectRDMAWrites() will be a no-op,
5724
+ * but when these multiple paths are used simultaneously, it is the user's
5725
+ * responsibility to ensure ordering by using mechanisms outside the scope of
5726
+ * CUDA.
5727
+ *
5313
5728
  * Users may query support for this API via
5314
5729
  * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS.
5315
5730
  *
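A hedged usage sketch for the flush path described above: query whether the device exposes host-initiated flushing and, if so, flush writes targeting the current context. The attribute and enum names are taken from the flush-writes API family and should be checked against the header; dev is assumed to be a valid CUdevice:

    int flushOpts = 0;
    cuDeviceGetAttribute(&flushOpts,
                         CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, dev);
    if (flushOpts & CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
        /* Make prior GPUDirect RDMA writes visible to consumers in the current context. */
        cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
                                   CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER);
    }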
@@ -5991,6 +6406,161 @@ CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
5991
6406
  */
5992
6407
  CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev);
5993
6408
 
6409
+ /**
6410
+ * \brief Create a CUDA context
6411
+ *
6412
+ * Creates a new CUDA context and associates it with the calling thread. The
6413
+ * \p flags parameter is described below. The context is created with a usage
6414
+ * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy()
6415
+ * when done using the context. If a context is already current to the thread,
6416
+ * it is supplanted by the newly created context and may be restored by a subsequent
6417
+ * call to ::cuCtxPopCurrent().
6418
+ *
6419
+ * A CUDA context can be created with execution affinity. The type and the amount of
6420
+ * execution resources the context can use are limited by \p paramsArray and \p numExecAffinityParams
6421
+ * in \p execAffinity. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numExecAffinityParams
6422
+ * describes the size of the paramsArray. If two \p CUexecAffinityParam in the array have the same type,
6423
+ * the latter execution affinity parameter overrides the former execution affinity parameter.
6424
+ * The supported execution affinity types are:
6425
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion
6426
+ * of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally
6427
+ * rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution
6428
+ * affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute
6429
+ * is only supported under Volta+ MPS.
6430
+ *
6431
+ * A CUDA context can be created in CIG (CUDA in Graphics) mode by setting \p cigParams.
6432
+ * Data from graphics client is shared with CUDA via the \p sharedData in \p cigParams.
6433
+ * Support for D3D12 graphics client can be determined using ::cuDeviceGetAttribute() with
6434
+ * ::CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED. \p sharedData is an ID3D12CommandQueue handle.
6435
+ * Either \p execAffinityParams or \p cigParams can be set to a non-null value. Setting both to a
6436
+ * non-null value will result in undefined behavior.
6437
+ *
6438
+ * The three LSBs of the \p flags parameter can be used to control how the OS
6439
+ * thread, which owns the CUDA context at the time of an API call, interacts
6440
+ * with the OS scheduler when waiting for results from the GPU. Only one of
6441
+ * the scheduling flags can be set when creating a context.
6442
+ *
6443
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
6444
+ * results from the GPU. This can decrease latency when waiting for the GPU,
6445
+ * but may lower the performance of CPU threads if they are performing work in
6446
+ * parallel with the CUDA thread.
6447
+ *
6448
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
6449
+ * results from the GPU. This can increase latency when waiting for the GPU,
6450
+ * but can increase the performance of CPU threads performing work in parallel
6451
+ * with the GPU.
6452
+ *
6453
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
6454
+ * synchronization primitive when waiting for the GPU to finish work.
6455
+ *
6456
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
6457
+ * synchronization primitive when waiting for the GPU to finish work. <br>
6458
+ * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
6459
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
6460
+ *
6461
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
6462
+ * uses a heuristic based on the number of active CUDA contexts in the
6463
+ * process \e C and the number of logical processors in the system \e P. If
6464
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
6465
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
6466
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
6467
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
6468
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
6469
+ * for low-powered devices.
6470
+ *
6471
+ * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
6472
+ * This flag must be set in order to allocate pinned host memory that is
6473
+ * accessible to the GPU.
6474
+ *
6475
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
6476
+ * after resizing local memory for a kernel. This can prevent thrashing by
6477
+ * local memory allocations when launching many kernels with high local
6478
+ * memory usage at the cost of potentially increased memory usage. <br>
6479
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
6480
+ * by this flag is now the default and cannot be disabled.
6481
+ * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
6482
+ *
6483
+ * - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally
6484
+ * with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can
6485
+ * be set during context creation to instruct CUDA to create a coredump if
6486
+ * this context raises an exception during execution. These environment variables
6487
+ * are described in the CUDA-GDB user guide under the "GPU core dump support"
6488
+ * section.
6489
+ * The initial attributes will be taken from the global attributes at the time of
6490
+ * context creation. The other attributes that control coredump output can be
6491
+ * modified by calling ::cuCoredumpSetAttribute from the created context after
6492
+ * it becomes current. This flag is not supported when CUDA context is created in
6493
+ * CIG (CUDA in Graphics) mode.
6494
+ *
6495
+ * - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not
6496
+ * been enabled globally with ::cuCoredumpSetAttributeGlobal or environment
6497
+ * variables, this flag can be set during context creation to instruct CUDA to
6498
+ * create a coredump if data is written to a certain pipe that is present in the
6499
+ * OS space. These environment variables are described in the CUDA-GDB user
6500
+ * guide under the "GPU core dump support" section.
6501
+ * It is important to note that the pipe name *must* be set with
6502
+ * ::cuCoredumpSetAttributeGlobal before creating the context if this flag is
6503
+ * used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set.
6504
+ * The initial attributes will be taken from the global attributes at the time of
6505
+ * context creation. The other attributes that control coredump output can be
6506
+ * modified by calling ::cuCoredumpSetAttribute from the created context after
6507
+ * it becomes current.
6508
+ * Setting this flag on any context creation is equivalent to setting the
6509
+ * ::CU_COREDUMP_ENABLE_USER_TRIGGER attribute to \p true globally.
6510
+ * This flag is not supported when CUDA context is created in
6511
+ * CIG (CUDA in Graphics) mode.
6512
+ *
6513
+ * - ::CU_CTX_SYNC_MEMOPS: Ensures that synchronous memory operations initiated
6514
+ * on this context will always synchronize. See further documentation in the
6515
+ * section titled "API Synchronization behavior" to learn more about cases when
6516
+ * synchronous memory operations can exhibit asynchronous behavior.
6517
+ *
6518
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
6519
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
6520
+ * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
6521
+ * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
6522
+ * the compute mode for devices.
6523
+ * Documentation for <i>nvidia-smi</i> can be obtained by passing a
6524
+ * -h option to it.
6525
+ *
6526
+ * Context creation will fail with ::CUDA_ERROR_INVALID_VALUE if an invalid parameter was
6527
+ * passed by the client to create the CUDA context.
6528
+ *
6529
+ * Context creation in CIG mode will fail with ::CUDA_ERROR_NOT_SUPPORTED if CIG is not supported
6530
+ * by the device or the driver.
6531
+ * \param pctx - Returned context handle of the new context
6532
+ * \param ctxCreateParams - Context creation parameters
6533
+ * \param flags - Context creation flags
6534
+ * \param dev - Device to create context on
6535
+ *
6536
+ * \return
6537
+ * ::CUDA_SUCCESS,
6538
+ * ::CUDA_ERROR_DEINITIALIZED,
6539
+ * ::CUDA_ERROR_NOT_INITIALIZED,
6540
+ * ::CUDA_ERROR_INVALID_CONTEXT,
6541
+ * ::CUDA_ERROR_INVALID_DEVICE,
6542
+ * ::CUDA_ERROR_INVALID_VALUE,
6543
+ * ::CUDA_ERROR_NOT_SUPPORTED,
6544
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
6545
+ * ::CUDA_ERROR_UNKNOWN
6546
+ * \notefnerr
6547
+ *
6548
+ * \sa ::cuCtxDestroy,
6549
+ * ::cuCtxGetApiVersion,
6550
+ * ::cuCtxGetCacheConfig,
6551
+ * ::cuCtxGetDevice,
6552
+ * ::cuCtxGetFlags,
6553
+ * ::cuCtxGetLimit,
6554
+ * ::cuCtxPopCurrent,
6555
+ * ::cuCtxPushCurrent,
6556
+ * ::cuCtxSetCacheConfig,
6557
+ * ::cuCtxSetLimit,
6558
+ * ::cuCoredumpSetAttributeGlobal,
6559
+ * ::cuCoredumpSetAttribute,
6560
+ * ::cuCtxSynchronize
6561
+ */
6562
+ CUresult CUDAAPI cuCtxCreate_v4(CUcontext *pctx, CUctxCreateParams *ctxCreateParams, unsigned int flags, CUdevice dev);
6563
+
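A minimal creation sketch for the _v4 entry point (a fragment, assuming <cuda.h> and <string.h> are included and \p dev was obtained via ::cuDeviceGet). A zero-filled CUctxCreateParams requests neither execution affinity nor CIG mode; the struct's field layout is not shown in this hunk, so zero-initialization is the safe illustration:

    CUcontext ctx = NULL;
    CUctxCreateParams createParams;
    memset(&createParams, 0, sizeof(createParams));  /* no affinity, no CIG */
    CUresult rc = cuCtxCreate_v4(&ctx, &createParams,
                                 CU_CTX_SCHED_BLOCKING_SYNC, dev);
    if (rc == CUDA_SUCCESS) {
        /* ... use the context ... */
        cuCtxDestroy(ctx);
    }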
5994
6564
  /**
5995
6565
  * \brief Destroy a CUDA context
5996
6566
  *
@@ -6002,9 +6572,11 @@ CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArra
6002
6572
  * Destroys and cleans up all resources associated with the context.
6003
6573
  * It is the caller's responsibility to ensure that the context or its resources
6004
6574
  * are not accessed or passed in subsequent API calls and doing so will result in undefined behavior.
6005
- * These resources include CUDA types such as ::CUmodule, ::CUfunction, ::CUstream, ::CUevent,
6575
+ * These resources include CUDA types ::CUmodule, ::CUfunction, ::CUstream, ::CUevent,
6006
6576
  * ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref,
6007
6577
  * ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore.
6578
+ * These resources also include memory allocations by ::cuMemAlloc(), ::cuMemAllocHost(),
6579
+ * ::cuMemAllocManaged() and ::cuMemAllocPitch().
6008
6580
  *
6009
6581
  * If \p ctx is current to the calling thread then \p ctx will also be
6010
6582
  * popped from the current thread's context stack (as though ::cuCtxPopCurrent()
@@ -6012,6 +6584,10 @@ CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArra
6012
6584
  * remain current to those threads, and attempting to access \p ctx from
6013
6585
  * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED.
6014
6586
  *
6587
+ * \note ::cuCtxDestroy() will not destroy memory allocations by ::cuMemCreate(), ::cuMemAllocAsync() and
6588
+ * ::cuMemAllocFromPoolAsync(). These memory allocations are not associated with any CUDA context and need to
6589
+ * be destroyed explicitly.
6590
+ *
6015
6591
  * \param ctx - Context to destroy
6016
6592
  *
6017
6593
  * \return
@@ -6158,11 +6734,11 @@ CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx);
6158
6734
  CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx);
6159
6735
 
6160
6736
  /**
6161
- * \brief Returns the device ID for the current context
6737
+ * \brief Returns the device handle for the current context
6162
6738
  *
6163
- * Returns in \p *device the ordinal of the current context's device.
6739
+ * Returns in \p *device the handle of the current context's device.
6164
6740
  *
6165
- * \param device - Returned device ID for the current context
6741
+ * \param device - Returned device handle for the current context
6166
6742
  *
6167
6743
  * \return
6168
6744
  * ::CUDA_SUCCESS,
@@ -6278,9 +6854,11 @@ CUresult CUDAAPI cuCtxSetFlags(unsigned int flags);
6278
6854
  CUresult CUDAAPI cuCtxGetId(CUcontext ctx, unsigned long long *ctxId);
6279
6855
 
6280
6856
  /**
6281
- * \brief Block for a context's tasks to complete
6857
+ * \brief Block for the current context's tasks to complete
6282
6858
  *
6283
- * Blocks until the device has completed all preceding requested tasks.
6859
+ * Blocks until the current context has completed all preceding requested tasks.
6860
+ * If the current context is the primary context, green contexts that have been
6861
+ * created will also be synchronized.
6284
6862
  * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed.
6285
6863
  * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the
6286
6864
  * CPU thread will block until the GPU context has finished its work.
@@ -6662,14 +7240,87 @@ CUresult CUDAAPI cuCtxResetPersistingL2Cache(void);
6662
7240
  */
6663
7241
  CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);
6664
7242
 
6665
-
6666
- /** @} */ /* END CUDA_CTX */
6667
-
6668
7243
  /**
6669
- * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED]
7244
+ * \brief Records an event.
6670
7245
  *
6671
- * ___MANBRIEF___ deprecated context management functions of the low-level CUDA
6672
- * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
7246
+ * Captures in \p hEvent all the activities of the context \p hCtx
7247
+ * at the time of this call. \p hEvent and \p hCtx must be from the same
7248
+ * CUDA context, otherwise ::CUDA_ERROR_INVALID_HANDLE will be returned.
7249
+ * Calls such as ::cuEventQuery() or ::cuCtxWaitEvent() will then examine
7250
+ * or wait for completion of the work that was captured.
7251
+ * Uses of \p hCtx after this call do not modify \p hEvent.
7252
+ * If the context passed to \p hCtx is the primary context, \p hEvent will
7253
+ * capture all the activities of the primary context and its green contexts.
7254
+ * If the context passed to \p hCtx is a context converted from a green context
7255
+ * via ::cuCtxFromGreenCtx(), \p hEvent will capture only the activities of the green context.
7256
+ *
7257
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED if the
7258
+ * specified context \p hCtx has a stream in the capture mode. In such a case,
7259
+ * the call will invalidate all the conflicting captures.
7260
+ *
7261
+ * \param hCtx - Context to record event for
7262
+ * \param hEvent - Event to record
7263
+ *
7264
+ * \return
7265
+ * ::CUDA_SUCCESS,
7266
+ * ::CUDA_ERROR_DEINITIALIZED,
7267
+ * ::CUDA_ERROR_NOT_INITIALIZED,
7268
+ * ::CUDA_ERROR_INVALID_CONTEXT,
7269
+ * ::CUDA_ERROR_INVALID_HANDLE,
7270
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
7271
+ *
7272
+ * \sa
7273
+ * ::cuCtxWaitEvent,
7274
+ * ::cuGreenCtxRecordEvent,
7275
+ * ::cuGreenCtxWaitEvent,
7276
+ * ::cuEventRecord
7277
+ */
7278
+ CUresult CUDAAPI cuCtxRecordEvent(CUcontext hCtx, CUevent hEvent);
7279
+
7280
+ /**
7281
+ * \brief Make a context wait on an event
7282
+ *
7283
+ * Makes all future work submitted to context \p hCtx wait for all work
7284
+ * captured in \p hEvent. The synchronization will be performed on the device
7285
+ * and will not block the calling CPU thread. See ::cuCtxRecordEvent()
7286
+ * for details on what is captured by an event.
7287
+ * If the context passed to \p hCtx is the primary context, the primary context
7288
+ * and its green contexts will wait for \p hEvent.
7289
+ * If the context passed to \p hCtx is a context converted from a green context
7290
+ * via ::cuCtxFromGreenCtx(), the green context will wait for \p hEvent.
7291
+ *
7292
+ * \note \p hEvent may be from a different context or device than \p hCtx.
7293
+ *
7294
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED and
7295
+ * invalidate the capture if the specified event \p hEvent is part of an ongoing
7296
+ * capture sequence or if the specified context \p hCtx has a stream in the capture mode.
7297
+ *
7298
+ * \param hCtx - Context to wait
7299
+ * \param hEvent - Event to wait on
7300
+ *
7301
+ * \return
7302
+ * ::CUDA_SUCCESS,
7303
+ * ::CUDA_ERROR_DEINITIALIZED,
7304
+ * ::CUDA_ERROR_NOT_INITIALIZED,
7305
+ * ::CUDA_ERROR_INVALID_CONTEXT,
7306
+ * ::CUDA_ERROR_INVALID_HANDLE,
7307
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
7308
+ *
7309
+ * \sa
7310
+ * ::cuCtxRecordEvent,
7311
+ * ::cuGreenCtxRecordEvent,
7312
+ * ::cuGreenCtxWaitEvent,
7313
+ * ::cuStreamWaitEvent
7314
+ */
7315
+ CUresult CUDAAPI cuCtxWaitEvent(CUcontext hCtx, CUevent hEvent);
7316
+
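A small ordering sketch for the pair above, assuming ctxA and ctxB are existing contexts and ev was created while ctxA was current:

    cuCtxRecordEvent(ctxA, ev);  /* capture all work submitted to ctxA so far */
    cuCtxWaitEvent(ctxB, ev);    /* future work in ctxB waits for that capture */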
7317
+ /** @} */ /* END CUDA_CTX */
7318
+
7319
+ /**
7320
+ * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED]
7321
+ *
7322
+ * ___MANBRIEF___ deprecated context management functions of the low-level CUDA
7323
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
6673
7324
  *
6674
7325
  * This section describes the deprecated context management functions of the low-level
6675
7326
  * CUDA driver application programming interface.
@@ -7203,6 +7854,11 @@ CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hm
7203
7854
  * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
7204
7855
  * will accumulate data until the CUlinkState is destroyed.
7205
7856
  *
7857
+ * The data passed in via ::cuLinkAddData and ::cuLinkAddFile will be treated
7858
+ * as relocatable (-rdc=true to nvcc) when linking the final cubin during
7859
+ * ::cuLinkComplete and will have similar consequences as offline relocatable
7860
+ * device code linking.
7861
+ *
7206
7862
  * \p optionValues must remain valid for the life of the CUlinkState if output
7207
7863
  * options are used. No other references to inputs are maintained after this
7208
7864
  * call returns.
@@ -7471,6 +8127,7 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmod
7471
8127
  *
7472
8128
  * The \p code may be a \e cubin or \e fatbin as output by \b nvcc,
7473
8129
  * or a NULL-terminated \e PTX, either as output by \b nvcc or hand-written.
8130
+ * A fatbin should also contain relocatable code when doing separate compilation.
7474
8131
  *
7475
8132
  * Options are passed as an array via \p jitOptions and any corresponding parameters are passed in
7476
8133
  * \p jitOptionsValues. The number of total JIT options is supplied via \p numJitOptions.
@@ -7479,6 +8136,9 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmod
7479
8136
  * Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in
7480
8137
  * \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions.
7481
8138
  *
8139
+ * \note If the library contains managed variables and no device in the system
8140
+ * supports managed variables this call is expected to return ::CUDA_ERROR_NOT_SUPPORTED
8141
+ *
7482
8142
  * \param library - Returned library
7483
8143
  * \param code - Code to load
7484
8144
  * \param jitOptions - Options for JIT
@@ -7499,7 +8159,8 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmod
7499
8159
  * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
7500
8160
  * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
7501
8161
  * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
7502
- * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
8162
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND,
8163
+ * ::CUDA_ERROR_NOT_SUPPORTED
7503
8164
  *
7504
8165
  * \sa ::cuLibraryLoadFromFile,
7505
8166
  * ::cuLibraryUnload,
@@ -7528,6 +8189,7 @@ CUresult CUDAAPI cuLibraryLoadData(CUlibrary *library, const void *code,
7528
8189
  *
7529
8190
  * The file should be a \e cubin file as output by \b nvcc, or a \e PTX file either
7530
8191
  * as output by \b nvcc or handwritten, or a \e fatbin file as output by \b nvcc.
8192
+ * A fatbin should also contain relocatable code when doing separate compilation.
7531
8193
  *
7532
8194
  * Options are passed as an array via \p jitOptions and any corresponding parameters are
7533
8195
  * passed in \p jitOptionsValues. The number of total options is supplied via \p numJitOptions.
@@ -7536,6 +8198,9 @@ CUresult CUDAAPI cuLibraryLoadData(CUlibrary *library, const void *code,
7536
8198
  * Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in
7537
8199
  * \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions.
7538
8200
  *
8201
+ * \note If the library contains managed variables and no device in the system
8202
+ * supports managed variables this call is expected to return ::CUDA_ERROR_NOT_SUPPORTED
8203
+ *
7539
8204
  * \param library - Returned library
7540
8205
  * \param fileName - File to load from
7541
8206
  * \param jitOptions - Options for JIT
@@ -7556,7 +8221,8 @@ CUresult CUDAAPI cuLibraryLoadData(CUlibrary *library, const void *code,
7556
8221
  * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
7557
8222
  * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
7558
8223
  * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
7559
- * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
8224
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND,
8225
+ * ::CUDA_ERROR_NOT_SUPPORTED
7560
8226
  *
7561
8227
  * \sa ::cuLibraryLoadData,
7562
8228
  * ::cuLibraryUnload,
@@ -7702,6 +8368,29 @@ CUresult CUDAAPI cuLibraryGetModule(CUmodule *pMod, CUlibrary library);
7702
8368
  */
7703
8369
  CUresult CUDAAPI cuKernelGetFunction(CUfunction *pFunc, CUkernel kernel);
7704
8370
 
8371
+ /**
8372
+ * \brief Returns a library handle
8373
+ *
8374
+ * Returns in \p pLib the handle of the library for the requested kernel \p kernel
8375
+ *
8376
+ * \param pLib - Returned library handle
8377
+ * \param kernel - Kernel to retrieve library handle
8378
+ *
8379
+ * \return
8380
+ * ::CUDA_SUCCESS,
8381
+ * ::CUDA_ERROR_DEINITIALIZED,
8382
+ * ::CUDA_ERROR_NOT_INITIALIZED,
8383
+ * ::CUDA_ERROR_INVALID_VALUE,
8384
+ * ::CUDA_ERROR_INVALID_HANDLE,
8385
+ * ::CUDA_ERROR_NOT_FOUND
8386
+ *
8387
+ * \sa ::cuLibraryLoadData,
8388
+ * ::cuLibraryLoadFromFile,
8389
+ * ::cuLibraryUnload,
8390
+ * ::cuLibraryGetKernel
8391
+ */
8392
+ CUresult CUDAAPI cuKernelGetLibrary(CUlibrary *pLib, CUkernel kernel);
8393
+
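For symmetry with ::cuLibraryGetKernel, a sketch that walks back from a kernel to its owning library, assuming \p kernel came from a previously loaded library:

    CUlibrary owningLib = NULL;
    if (cuKernelGetLibrary(&owningLib, kernel) == CUDA_SUCCESS) {
        /* owningLib now identifies the library that kernel was loaded from. */
    }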
7705
8394
  /**
7706
8395
  * \brief Returns a global device pointer
7707
8396
  *
@@ -7744,9 +8433,6 @@ CUresult CUDAAPI cuLibraryGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUlibrary
7744
8433
  * Note that managed memory for library \p library is shared across devices and is registered
7745
8434
  * when the library is loaded into at least one context.
7746
8435
  *
7747
- * \note The API requires a CUDA context to be present and initialized on at least one device.
7748
- * If no context is present, the call returns ::CUDA_ERROR_NOT_FOUND.
7749
- *
7750
8436
  * \param dptr - Returned pointer to the managed memory
7751
8437
  * \param bytes - Returned memory size in bytes
7752
8438
  * \param library - Library to retrieve managed memory from
@@ -7923,6 +8609,9 @@ CUresult CUDAAPI cuKernelGetAttribute(int *pi, CUfunction_attribute attrib, CUke
7923
8609
  * positive. The validity of the cluster dimensions is checked at launch time.
7924
8610
  * If the value is set during compile time, it cannot be set at runtime.
7925
8611
  * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
8612
+ * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
8613
+ * the function can be launched with non-portable cluster size. 1 is allowed,
8614
+ * 0 is disallowed.
7926
8615
  * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
7927
8616
  * scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
7928
8617
  *
@@ -8222,9 +8911,10 @@ CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t Width
8222
8911
  * ::cuMemAllocPitch(), ::cuMemAllocManaged(), ::cuMemAllocAsync(), ::cuMemAllocFromPoolAsync()
8223
8912
  *
8224
8913
  * Note - This API will not perform any implicit synchronization when the pointer was allocated with
8225
- * ::cuMemAllocAsync or ::cuMemAllocFromPoolAsync. Callers must ensure that all accesses to the
8914
+ * ::cuMemAllocAsync or ::cuMemAllocFromPoolAsync. Callers must ensure that all accesses to these
8226
8915
  * pointers have completed before invoking ::cuMemFree. For best performance and memory reuse, users
8227
8916
  * should use ::cuMemFreeAsync to free memory allocated via the stream ordered memory allocator.
8917
+ * For all other pointers, this API may perform implicit synchronization.
8228
8918
  *
8229
8919
  * \param dptr - Pointer to memory to free
8230
8920
  *
@@ -8776,7 +9466,8 @@ CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev);
8776
9466
  *
8777
9467
  * IPC functionality is restricted to devices with support for unified
8778
9468
  * addressing on Linux and Windows operating systems.
8779
- * IPC functionality on Windows is restricted to GPUs in TCC mode
9469
+ * IPC functionality on Windows is supported for compatibility purposes
9470
+ * but is not recommended as it comes with a performance cost.
8780
9471
  * Users can test their device for IPC functionality by calling
8781
9472
  * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
8782
9473
  *
@@ -8819,7 +9510,8 @@ CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event);
8819
9510
  *
8820
9511
  * IPC functionality is restricted to devices with support for unified
8821
9512
  * addressing on Linux and Windows operating systems.
8822
- * IPC functionality on Windows is restricted to GPUs in TCC mode
9513
+ * IPC functionality on Windows is supported for compatibility purposes
9514
+ * but is not recommended as it comes with a performance cost.
8823
9515
  * Users can test their device for IPC functionality by calling
8824
9516
  * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
8825
9517
  *
@@ -8864,7 +9556,8 @@ CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle)
8864
9556
  *
8865
9557
  * IPC functionality is restricted to devices with support for unified
8866
9558
  * addressing on Linux and Windows operating systems.
8867
- * IPC functionality on Windows is restricted to GPUs in TCC mode
9559
+ * IPC functionality on Windows is supported for compatibility purposes
9560
+ * but is not recommended as it comes with a performance cost.
8868
9561
  * Users can test their device for IPC functionality by calling
8869
9562
  * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
8870
9563
  *
@@ -8919,7 +9612,8 @@ CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
8919
9612
  *
8920
9613
  * IPC functionality is restricted to devices with support for unified
8921
9614
  * addressing on Linux and Windows operating systems.
8922
- * IPC functionality on Windows is restricted to GPUs in TCC mode
9615
+ * IPC functionality on Windows is supported for compatibility purposes
9616
+ * but is not recommended as it comes with a performance cost.
8923
9617
  * Users can test their device for IPC functionality by calling
8924
9618
  * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
8925
9619
  *
@@ -8964,7 +9658,8 @@ CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, u
8964
9658
  *
8965
9659
  * IPC functionality is restricted to devices with support for unified
8966
9660
  * addressing on Linux and Windows operating systems.
8967
- * IPC functionality on Windows is restricted to GPUs in TCC mode
9661
+ * IPC functionality on Windows is supported for compatibility purposes
9662
+ * but is not recommended as it comes with a performance cost.
8968
9663
  * Users can test their device for IPC functionality by calling
8969
9664
  * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
8970
9665
  *
@@ -10643,6 +11338,153 @@ CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
10643
11338
  */
10644
11339
  CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
10645
11340
 
11341
+ /**
11342
+ * \brief Performs a batch of memory copies asynchronously.
11343
+ *
11344
+ * Performs a batch of memory copies. The batch as a whole executes in stream order but copies within a
11345
+ * batch are not guaranteed to execute in any specific order. This API only supports pointer-to-pointer copies.
11346
+ * For copies involving CUDA arrays, please see ::cuMemcpy3DBatchAsync.
11347
+ *
11348
+ * Performs memory copies from source buffers specified in \p srcs to destination buffers specified in \p dsts.
11349
+ * The size of each copy is specified in \p sizes. All three arrays must be of the same length as specified
11350
+ * by \p count. Since there are no ordering guarantees for copies within a batch, specifying any dependent copies
11351
+ * within a batch will result in undefined behavior.
11352
+ *
11353
+ * Every copy in the batch has to be associated with a set of attributes specified in the \p attrs array.
11354
+ * Each entry in this array can apply to more than one copy. This can be done by specifying in the \p attrsIdxs array,
11355
+ * the index of the first copy that the corresponding entry in the \p attrs array applies to. Both \p attrs and
11356
+ * \p attrsIdxs must be of the same length as specified by \p numAttrs. For example, if a batch has 10 copies listed
11357
+ * in dst/src/sizes, the first 6 of which have one set of attributes and the remaining 4 another, then \p numAttrs
11358
+ * will be 2, \p attrsIdxs will be {0, 6} and \p attrs will contain the two sets of attributes. Note that the first entry
11359
+ * in \p attrsIdxs must always be 0. Also, each entry must be greater than the previous entry and the last entry should be
11360
+ * less than \p count. Furthermore, \p numAttrs must be less than or equal to \p count.
11361
+ *
11362
+ * The ::CUmemcpyAttributes::srcAccessOrder indicates the source access ordering to be observed for copies associated
11363
+ * with the attribute. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM, then the source will
11364
+ * be accessed in stream order. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL then
11365
+ * it indicates that access to the source pointer can be out of stream order and all accesses must be complete before
11366
+ * the API call returns. This flag is suited for ephemeral sources (ex., stack variables) when it's known that no prior
11367
+ * operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope
11368
+ * that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the
11369
+ * need for the user to synchronize the stream after the API call. If the source access order is set to
11370
+ * ::CU_MEMCPY_SRC_ACCESS_ORDER_ANY then it indicates that access to the source pointer can be out of stream order and the
11371
+ * accesses can happen even after the API call returns. This flag is suited for host pointers allocated
11372
+ * outside CUDA (ex., via malloc) when it's known that no prior operations in the stream can be accessing the memory.
11373
+ * Specifying this flag allows the driver to optimize the copy on certain platforms. Each memcpy operation in the batch must
11374
+ * have a valid ::CUmemcpyAttributes corresponding to it including the appropriate srcAccessOrder setting, otherwise the API
11375
+ * will return ::CUDA_ERROR_INVALID_VALUE.
11376
+ *
11377
+ * The ::CUmemcpyAttributes::srcLocHint and ::CUmemcpyAttributes::dstLocHint allows applications to specify hint locations
11378
+ * for operands of a copy when the operand doesn't have a fixed location. That is, these hints are
11379
+ * only applicable for managed memory pointers on devices where ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS is true or
11380
+ * system-allocated pageable memory on devices where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS is true.
11381
+ * For other cases, these hints are ignored.
11382
+ *
11383
+ * The ::CUmemcpyAttributes::flags field can be used to specify certain flags for copies. Setting the
11384
+ * ::CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE flag indicates that the associated copies should preferably overlap with
11385
+ * any compute work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy.
11386
+ *
11387
+ * If any error is encountered while parsing the batch, the index within the batch where the error was encountered
11388
+ * will be returned in \p failIdx.
11389
+ *
11390
+ * \param dsts - Array of destination pointers.
11391
+ * \param srcs - Array of memcpy source pointers.
11392
+ * \param sizes - Array of sizes for memcpy operations.
11393
+ * \param count - Size of \p dsts, \p srcs and \p sizes arrays
11394
+ * \param attrs - Array of memcpy attributes.
11395
+ * \param attrsIdxs - Array of indices to specify which copies each entry in the \p attrs array applies to.
11396
+ The attributes specified in attrs[k] will be applied to copies starting from attrsIdxs[k]
11397
+ through attrsIdxs[k+1] - 1. Also attrs[numAttrs-1] will apply to copies starting from
11398
+ attrsIdxs[numAttrs-1] through count - 1.
11399
+ * \param numAttrs - Size of \p attrs and \p attrsIdxs arrays.
11400
+ * \param failIdx - Pointer to a location to return the index of the copy where a failure was encountered.
11401
+ The value will be SIZE_MAX if the error doesn't pertain to any specific copy.
11402
+ * \param hStream - The stream to enqueue the operations in. Must not be the legacy NULL stream.
11403
+ *
11404
+ * \return
11405
+ * ::CUDA_SUCCESS
11406
+ * ::CUDA_ERROR_DEINITIALIZED
11407
+ * ::CUDA_ERROR_NOT_INITIALIZED
11408
+ * ::CUDA_ERROR_INVALID_VALUE
11409
+ * \notefnerr
11410
+ * \note_async
11411
+ * \note_memcpy
11412
+ */
11413
+ CUresult CUDAAPI cuMemcpyBatchAsync(CUdeviceptr *dsts, CUdeviceptr *srcs, size_t *sizes, size_t count,
11414
+ CUmemcpyAttributes *attrs, size_t *attrsIdxs, size_t numAttrs,
11415
+ size_t *failIdx, CUstream hStream);
11416
+
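A sketch of the attrs/attrsIdxs layout described above: three copies where the first two share one attribute set and the third uses another. The pointers, sizes and stream are assumed to exist, <string.h> is assumed included, and the stream must not be the legacy NULL stream:

    CUdeviceptr dsts[3]  = { dstA, dstB, dstC };
    CUdeviceptr srcs[3]  = { srcA, srcB, srcC };
    size_t      sizes[3] = { bytesA, bytesB, bytesC };

    CUmemcpyAttributes attrs[2];
    memset(attrs, 0, sizeof(attrs));
    attrs[0].srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;          /* copies 0..1 */
    attrs[1].srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL; /* copy 2 */
    size_t attrsIdxs[2] = { 0, 2 };  /* attrs[k] applies from index attrsIdxs[k] onward */

    size_t failIdx = 0;
    CUresult rc = cuMemcpyBatchAsync(dsts, srcs, sizes, 3,
                                     attrs, attrsIdxs, 2, &failIdx, stream);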
11417
+ /**
11418
+ * \brief Performs a batch of 3D memory copies asynchronously.
11419
+ *
11420
+ * Performs a batch of memory copies. The batch as a whole executes in stream order but copies within a
11421
+ * batch are not guaranteed to execute in any specific order. Note that this means specifying any dependent
11422
+ * copies within a batch will result in undefined behavior.
11423
+ *
11424
+ * Performs memory copies as specified in the \p opList array. The length of this array is specified in \p numOps.
11425
+ * Each entry in this array describes a copy operation. This includes among other things, the source and destination
11426
+ * operands for the copy as specified in ::CUDA_MEMCPY3D_BATCH_OP::src and ::CUDA_MEMCPY3D_BATCH_OP::dst respectively.
11427
+ * The source and destination operands of a copy can either be a pointer or a CUDA array. The width, height and depth
11428
+ * of a copy is specified in ::CUDA_MEMCPY3D_BATCH_OP::extent. The width, height and depth of a copy are specified in
11429
+ * elements and must not be zero. For pointer-to-pointer copies, the element size is considered to be 1. For pointer
11430
+ * to CUDA array or vice versa copies, the element size is determined by the CUDA array. For CUDA array to CUDA array copies,
11431
+ * the element size of the two CUDA arrays must match.
11432
+ *
11433
+ * For a given operand, if ::CUmemcpy3DOperand::type is specified as ::CU_MEMCPY_OPERAND_TYPE_POINTER, then
11434
+ * ::CUmemcpy3DOperand::op::ptr will be used. The ::CUmemcpy3DOperand::op::ptr::ptr field must contain the pointer where
11435
+ * the copy should begin. The ::CUmemcpy3DOperand::op::ptr::rowLength field specifies the length of each row in elements and
11436
+ * must either be zero or be greater than or equal to the width of the copy specified in ::CUDA_MEMCPY3D_BATCH_OP::extent::width.
11437
+ * The ::CUmemcpy3DOperand::op::ptr::layerHeight field specifies the height of each layer and must either be zero or be greater than
11438
+ * or equal to the height of the copy specified in ::CUDA_MEMCPY3D_BATCH_OP::extent::height. When either of these values is zero,
11439
+ * that aspect of the operand is considered to be tightly packed according to the copy extent. For managed memory pointers on devices where
11440
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS is true or system-allocated pageable memory on devices where
11441
+ * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS is true, the ::CUmemcpy3DOperand::op::ptr::locHint field can be used to hint
11442
+ * the location of the operand.
11443
+ *
11444
+ * If an operand's type is specified as ::CU_MEMCPY_OPERAND_TYPE_ARRAY, then ::CUmemcpy3DOperand::op::array will be used.
11445
+ * The ::CUmemcpy3DOperand::op::array::array field specifies the CUDA array and ::CUmemcpy3DOperand::op::array::offset specifies
11446
+ * the 3D offset into that array where the copy begins.
11447
+ *
11448
+ * The ::CUmemcpyAttributes::srcAccessOrder indicates the source access ordering to be observed for copies associated
11449
+ * with the attribute. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM, then the source will
11450
+ * be accessed in stream order. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL then
11451
+ * it indicates that access to the source pointer can be out of stream order and all accesses must be complete before
11452
+ * the API call returns. This flag is suited for ephemeral sources (ex., stack variables) when it's known that no prior
11453
+ * operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope
11454
+ * that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the
11455
+ * need for the user to synchronize the stream after the API call. If the source access order is set to
11456
+ * ::CU_MEMCPY_SRC_ACCESS_ORDER_ANY then it indicates that access to the source pointer can be out of stream order and the
11457
+ * accesses can happen even after the API call returns. This flag is suited for host pointers allocated
11458
+ * outside CUDA (ex., via malloc) when it's known that no prior operations in the stream can be accessing the memory.
11459
+ * Specifying this flag allows the driver to optimize the copy on certain platforms. Each memcpy operation in \p opList must
11460
+ * have a valid srcAccessOrder setting, otherwise this API will return ::CUDA_ERROR_INVALID_VALUE.
11461
+ *
11462
+ * The ::CUmemcpyAttributes::flags field can be used to specify certain flags for copies. Setting the
11463
+ * ::CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE flag indicates that the associated copies should preferably overlap with
11464
+ * any compute work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy.
11465
+ *
11466
+ * If any error is encountered while parsing the batch, the index within the batch where the error was encountered
11467
+ * will be returned in \p failIdx.
11468
+ *
11469
+ * \param numOps - Total number of memcpy operations.
11470
+ * \param opList - Array of size \p numOps containing the actual memcpy operations.
11471
+ * \param failIdx - Pointer to a location to return the index of the copy where a failure was encountered.
11472
+ * The value will be SIZE_MAX if the error doesn't pertain to any specific copy.
11473
+ * \param flags - Flags for future use, must be zero now.
11474
+ * \param hStream - The stream to enqueue the operations in. Must not be the default NULL stream.
11475
+ *
11476
+ * \return
11477
+ * ::CUDA_SUCCESS
11478
+ * ::CUDA_ERROR_DEINITIALIZED
11479
+ * ::CUDA_ERROR_NOT_INITIALIZED
11480
+ * ::CUDA_ERROR_INVALID_VALUE
11481
+ * \notefnerr
11482
+ * \note_async
11483
+ * \note_memcpy
11484
+ */
11485
+ CUresult CUDAAPI cuMemcpy3DBatchAsync(size_t numOps, CUDA_MEMCPY3D_BATCH_OP *opList,
11486
+ size_t *failIdx, unsigned long long flags, CUstream hStream);
11487
+
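A single-op batch sketch copying a 64x64 region from a linear device buffer into a CUDA array, per the operand rules above. It assumes d_src came from ::cuMemAlloc, hArray from ::cuArrayCreate with a matching format, stream is a non-default stream, and <string.h> is included:

    CUDA_MEMCPY3D_BATCH_OP op;
    memset(&op, 0, sizeof(op));
    op.src.type = CU_MEMCPY_OPERAND_TYPE_POINTER;
    op.src.op.ptr.ptr = d_src;              /* rowLength/layerHeight 0 = tightly packed */
    op.dst.type = CU_MEMCPY_OPERAND_TYPE_ARRAY;
    op.dst.op.array.array = hArray;         /* offset left as (0,0,0) */
    op.extent.width  = 64;                  /* element size comes from the array */
    op.extent.height = 64;
    op.extent.depth  = 1;
    op.srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;

    size_t failIdx = 0;
    CUresult rc = cuMemcpy3DBatchAsync(1, &op, &failIdx, 0ULL, stream);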
10646
11488
  /**
10647
11489
  * \brief Initializes device memory
10648
11490
  *
@@ -11139,8 +11981,51 @@ CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsi
11139
11981
  CU_AD_FORMAT_SIGNED_INT16 = 0x09,
11140
11982
  CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
11141
11983
  CU_AD_FORMAT_HALF = 0x10,
11142
- CU_AD_FORMAT_FLOAT = 0x20
11143
- } CUarray_format;
11984
+ CU_AD_FORMAT_FLOAT = 0x20,
11985
+ CU_AD_FORMAT_NV12 = 0xb0,
11986
+ CU_AD_FORMAT_UNORM_INT8X1 = 0xc0,
11987
+ CU_AD_FORMAT_UNORM_INT8X2 = 0xc1,
11988
+ CU_AD_FORMAT_UNORM_INT8X4 = 0xc2,
11989
+ CU_AD_FORMAT_UNORM_INT16X1 = 0xc3,
11990
+ CU_AD_FORMAT_UNORM_INT16X2 = 0xc4,
11991
+ CU_AD_FORMAT_UNORM_INT16X4 = 0xc5,
11992
+ CU_AD_FORMAT_SNORM_INT8X1 = 0xc6,
11993
+ CU_AD_FORMAT_SNORM_INT8X2 = 0xc7,
11994
+ CU_AD_FORMAT_SNORM_INT8X4 = 0xc8,
11995
+ CU_AD_FORMAT_SNORM_INT16X1 = 0xc9,
11996
+ CU_AD_FORMAT_SNORM_INT16X2 = 0xca,
11997
+ CU_AD_FORMAT_SNORM_INT16X4 = 0xcb,
11998
+ CU_AD_FORMAT_BC1_UNORM = 0x91,
11999
+ CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92,
12000
+ CU_AD_FORMAT_BC2_UNORM = 0x93,
12001
+ CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94,
12002
+ CU_AD_FORMAT_BC3_UNORM = 0x95,
12003
+ CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96,
12004
+ CU_AD_FORMAT_BC4_UNORM = 0x97,
12005
+ CU_AD_FORMAT_BC4_SNORM = 0x98,
12006
+ CU_AD_FORMAT_BC5_UNORM = 0x99,
12007
+ CU_AD_FORMAT_BC5_SNORM = 0x9a,
12008
+ CU_AD_FORMAT_BC6H_UF16 = 0x9b,
12009
+ CU_AD_FORMAT_BC6H_SF16 = 0x9c,
12010
+ CU_AD_FORMAT_BC7_UNORM = 0x9d,
12011
+ CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e,
12012
+ CU_AD_FORMAT_P010 = 0x9f,
12013
+ CU_AD_FORMAT_P016 = 0xa1,
12014
+ CU_AD_FORMAT_NV16 = 0xa2,
12015
+ CU_AD_FORMAT_P210 = 0xa3,
12016
+ CU_AD_FORMAT_P216 = 0xa4,
12017
+ CU_AD_FORMAT_YUY2 = 0xa5,
12018
+ CU_AD_FORMAT_Y210 = 0xa6,
12019
+ CU_AD_FORMAT_Y216 = 0xa7,
12020
+ CU_AD_FORMAT_AYUV = 0xa8,
12021
+ CU_AD_FORMAT_Y410 = 0xa9,
12022
+ CU_AD_FORMAT_Y416 = 0xb1,
12023
+ CU_AD_FORMAT_Y444_PLANAR8 = 0xb2,
12024
+ CU_AD_FORMAT_Y444_PLANAR10 = 0xb3,
12025
+ CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4,
12026
+ CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5,
12027
+ CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50,
12028
+ } CUarray_format;
11144
12029
  * \endcode
11145
12030
  * - \p NumChannels specifies the number of packed components per CUDA array
11146
12031
  * element; it may be 1, 2, or 4;
@@ -11459,7 +12344,50 @@ CUresult CUDAAPI cuArrayDestroy(CUarray hArray);
11459
12344
  CU_AD_FORMAT_SIGNED_INT16 = 0x09,
11460
12345
  CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
11461
12346
  CU_AD_FORMAT_HALF = 0x10,
11462
- CU_AD_FORMAT_FLOAT = 0x20
12347
+ CU_AD_FORMAT_FLOAT = 0x20,
12348
+ CU_AD_FORMAT_NV12 = 0xb0,
12349
+ CU_AD_FORMAT_UNORM_INT8X1 = 0xc0,
12350
+ CU_AD_FORMAT_UNORM_INT8X2 = 0xc1,
12351
+ CU_AD_FORMAT_UNORM_INT8X4 = 0xc2,
12352
+ CU_AD_FORMAT_UNORM_INT16X1 = 0xc3,
12353
+ CU_AD_FORMAT_UNORM_INT16X2 = 0xc4,
12354
+ CU_AD_FORMAT_UNORM_INT16X4 = 0xc5,
12355
+ CU_AD_FORMAT_SNORM_INT8X1 = 0xc6,
12356
+ CU_AD_FORMAT_SNORM_INT8X2 = 0xc7,
12357
+ CU_AD_FORMAT_SNORM_INT8X4 = 0xc8,
12358
+ CU_AD_FORMAT_SNORM_INT16X1 = 0xc9,
12359
+ CU_AD_FORMAT_SNORM_INT16X2 = 0xca,
12360
+ CU_AD_FORMAT_SNORM_INT16X4 = 0xcb,
12361
+ CU_AD_FORMAT_BC1_UNORM = 0x91,
12362
+ CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92,
12363
+ CU_AD_FORMAT_BC2_UNORM = 0x93,
12364
+ CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94,
12365
+ CU_AD_FORMAT_BC3_UNORM = 0x95,
12366
+ CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96,
12367
+ CU_AD_FORMAT_BC4_UNORM = 0x97,
12368
+ CU_AD_FORMAT_BC4_SNORM = 0x98,
12369
+ CU_AD_FORMAT_BC5_UNORM = 0x99,
12370
+ CU_AD_FORMAT_BC5_SNORM = 0x9a,
12371
+ CU_AD_FORMAT_BC6H_UF16 = 0x9b,
12372
+ CU_AD_FORMAT_BC6H_SF16 = 0x9c,
12373
+ CU_AD_FORMAT_BC7_UNORM = 0x9d,
12374
+ CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e,
12375
+ CU_AD_FORMAT_P010 = 0x9f,
12376
+ CU_AD_FORMAT_P016 = 0xa1,
12377
+ CU_AD_FORMAT_NV16 = 0xa2,
12378
+ CU_AD_FORMAT_P210 = 0xa3,
12379
+ CU_AD_FORMAT_P216 = 0xa4,
12380
+ CU_AD_FORMAT_YUY2 = 0xa5,
12381
+ CU_AD_FORMAT_Y210 = 0xa6,
12382
+ CU_AD_FORMAT_Y216 = 0xa7,
12383
+ CU_AD_FORMAT_AYUV = 0xa8,
12384
+ CU_AD_FORMAT_Y410 = 0xa9,
12385
+ CU_AD_FORMAT_Y416 = 0xb1,
12386
+ CU_AD_FORMAT_Y444_PLANAR8 = 0xb2,
12387
+ CU_AD_FORMAT_Y444_PLANAR10 = 0xb3,
12388
+ CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4,
12389
+ CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5,
12390
+ CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50,
11463
12391
  } CUarray_format;
11464
12392
  * \endcode
11465
12393
  *
@@ -11680,7 +12608,50 @@ CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescripto
11680
12608
  CU_AD_FORMAT_SIGNED_INT16 = 0x09,
11681
12609
  CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
11682
12610
  CU_AD_FORMAT_HALF = 0x10,
11683
- CU_AD_FORMAT_FLOAT = 0x20
12611
+ CU_AD_FORMAT_FLOAT = 0x20,
12612
+ CU_AD_FORMAT_NV12 = 0xb0,
12613
+ CU_AD_FORMAT_UNORM_INT8X1 = 0xc0,
12614
+ CU_AD_FORMAT_UNORM_INT8X2 = 0xc1,
12615
+ CU_AD_FORMAT_UNORM_INT8X4 = 0xc2,
12616
+ CU_AD_FORMAT_UNORM_INT16X1 = 0xc3,
12617
+ CU_AD_FORMAT_UNORM_INT16X2 = 0xc4,
12618
+ CU_AD_FORMAT_UNORM_INT16X4 = 0xc5,
12619
+ CU_AD_FORMAT_SNORM_INT8X1 = 0xc6,
12620
+ CU_AD_FORMAT_SNORM_INT8X2 = 0xc7,
12621
+ CU_AD_FORMAT_SNORM_INT8X4 = 0xc8,
12622
+ CU_AD_FORMAT_SNORM_INT16X1 = 0xc9,
12623
+ CU_AD_FORMAT_SNORM_INT16X2 = 0xca,
12624
+ CU_AD_FORMAT_SNORM_INT16X4 = 0xcb,
12625
+ CU_AD_FORMAT_BC1_UNORM = 0x91,
12626
+ CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92,
12627
+ CU_AD_FORMAT_BC2_UNORM = 0x93,
12628
+ CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94,
12629
+ CU_AD_FORMAT_BC3_UNORM = 0x95,
12630
+ CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96,
12631
+ CU_AD_FORMAT_BC4_UNORM = 0x97,
12632
+ CU_AD_FORMAT_BC4_SNORM = 0x98,
12633
+ CU_AD_FORMAT_BC5_UNORM = 0x99,
12634
+ CU_AD_FORMAT_BC5_SNORM = 0x9a,
12635
+ CU_AD_FORMAT_BC6H_UF16 = 0x9b,
12636
+ CU_AD_FORMAT_BC6H_SF16 = 0x9c,
12637
+ CU_AD_FORMAT_BC7_UNORM = 0x9d,
12638
+ CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e,
12639
+ CU_AD_FORMAT_P010 = 0x9f,
12640
+ CU_AD_FORMAT_P016 = 0xa1,
12641
+ CU_AD_FORMAT_NV16 = 0xa2,
12642
+ CU_AD_FORMAT_P210 = 0xa3,
12643
+ CU_AD_FORMAT_P216 = 0xa4,
12644
+ CU_AD_FORMAT_YUY2 = 0xa5,
12645
+ CU_AD_FORMAT_Y210 = 0xa6,
12646
+ CU_AD_FORMAT_Y216 = 0xa7,
12647
+ CU_AD_FORMAT_AYUV = 0xa8,
12648
+ CU_AD_FORMAT_Y410 = 0xa9,
12649
+ CU_AD_FORMAT_Y416 = 0xb1,
12650
+ CU_AD_FORMAT_Y444_PLANAR8 = 0xb2,
12651
+ CU_AD_FORMAT_Y444_PLANAR10 = 0xb3,
12652
+ CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4,
12653
+ CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5,
12654
+ CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50,
11684
12655
  } CUarray_format;
11685
12656
  * \endcode
11686
12657
  *
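A creation sketch using the descriptor form shown above (a fragment, assuming <cuda.h> and <string.h> are included). A classic 4-channel float array is used here; the newer entries, for example ::CU_AD_FORMAT_UNORM_INT8X4 or the video formats, slot into the same \p Format field subject to their own channel-count rules:

    CUDA_ARRAY_DESCRIPTOR desc;
    memset(&desc, 0, sizeof(desc));
    desc.Format = CU_AD_FORMAT_FLOAT;
    desc.NumChannels = 4;       /* RGBA float texels */
    desc.Width  = 1024;
    desc.Height = 768;

    CUarray hArray = NULL;
    CUresult rc = cuArrayCreate(&hArray, &desc);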
@@ -11842,12 +12813,18 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
11842
12813
  * have identical allocation properties. Users are also expected to retrieve a
11843
12814
  * new handle every time the underlying physical allocation(s) corresponding
11844
12815
  * to a previously queried VA range are changed.
12816
+ *
12817
+ * For CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, users may set
12818
+ * flags to ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE, which, when set on a
12819
+ * supported platform, will give a DMA_BUF handle mapped via PCIe BAR1, and will
12820
+ * return an error otherwise.
11845
12821
  *
11846
12822
  * \param[out] handle - Pointer to the location where the returned handle will be stored.
11847
12823
  * \param[in] dptr - Pointer to a valid CUDA device allocation. Must be aligned to host page size.
11848
12824
  * \param[in] size - Length of the address range. Must be aligned to host page size.
11849
12825
  * \param[in] handleType - Type of handle requested (defines type and size of the \p handle output parameter)
11850
- * \param[in] flags - Reserved, must be zero
12826
+ * \param[in] flags - When requesting CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, the value may be
12827
+ * ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE, otherwise 0.
11851
12828
  *
11852
12829
  * \return
11853
12830
  * CUDA_SUCCESS
@@ -11856,6 +12833,112 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
11856
12833
  */
11857
12834
  CUresult CUDAAPI cuMemGetHandleForAddressRange(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags);
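A request sketch for the PCIe mapping flag discussed above, assuming a Linux platform and that dptr and size are host-page-aligned as required:

    int dmabufFd = -1;  /* CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD yields a file descriptor */
    CUresult rc = cuMemGetHandleForAddressRange(&dmabufFd, dptr, size,
                      CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
                      CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
    /* An error here may simply mean the platform cannot map via PCIe BAR1. */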
11858
12835
 
12836
+ /**
12837
+ * \brief Bitmasks for CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK.
12838
+ */
12839
+ typedef enum CUmemDecompressAlgorithm_enum {
12840
+ CU_MEM_DECOMPRESS_UNSUPPORTED = 0, /**< Decompression is unsupported. */
12841
+ CU_MEM_DECOMPRESS_ALGORITHM_DEFLATE = 1<<0, /**< Deflate is supported. */
12842
+ CU_MEM_DECOMPRESS_ALGORITHM_SNAPPY = 1<<1 /**< Snappy is supported. */
12843
+ } CUmemDecompressAlgorithm;
12844
+
12845
+ /**
12846
+ * \brief Structure describing the parameters that compose a single
12847
+ * decompression operation.
12848
+ */
12849
+ typedef struct CUmemDecompressParams_st {
12850
+ /** The number of bytes to be read and decompressed from
12851
+ * ::CUmemDecompressParams_st.src. */
12852
+ size_t srcNumBytes;
12853
+ /** The number of bytes that the decompression operation will be expected to
12854
+ * write to ::CUmemDecompressParams_st.dst. This value is optional; if
12855
+ * present, it may be used by the CUDA driver as a heuristic for scheduling
12856
+ * the individual decompression operations. */
12857
+ size_t dstNumBytes;
12858
+ /** After the decompression operation has completed, the actual number of
12859
+ * bytes written to ::CUmemDecompressParams.dst will be recorded as a 32-bit
12860
+ * unsigned integer in the memory at this address. */
12861
+ cuuint32_t *dstActBytes;
12862
+ /** Pointer to a buffer of at least ::CUmemDecompressParams_st.srcNumBytes
12863
+ * compressed bytes. */
12864
+ const void *src;
12865
+ /** Pointer to a buffer where the decompressed data will be written. The
12866
+ * number of bytes written to this location will be recorded in the memory
12867
+ * pointed to by ::CUmemDecompressParams_st.dstActBytes */
12868
+ void *dst;
12869
+ /** The decompression algorithm to use. */
12870
+ CUmemDecompressAlgorithm algo;
12871
+ /* These bytes are unused and must be zeroed. This ensures compatibility if
12872
+ * additional fields are added in the future. */
12873
+ unsigned char padding[20];
12874
+ } CUmemDecompressParams;
12875
+
12876
+ /**
12877
+ * \brief Submit a batch of \p count independent decompression operations.
12878
+ *
12879
+ * \details Each of the \p count decompression operations is described by a
12880
+ * single entry in the \p paramsArray array. Once the batch has been
12881
+ * submitted, the function will return, and decompression will happen
12882
+ * asynchronously w.r.t. the CPU. To the work completion tracking
12883
+ * mechanisms in the CUDA driver, the batch will be considered a single
12884
+ * unit of work and processed according to stream semantics, i.e., it
12885
+ * is not possible to query the completion of individual decompression
12886
+ * operations within a batch.
12887
+ *
12888
+ * The memory pointed to by each of ::CUmemDecompressParams.src,
12889
+ * ::CUmemDecompressParams.dst, and ::CUmemDecompressParams.dstActBytes,
12890
+ * must be capable of usage with the hardware decompress feature. That
12891
+ * is, for each of said pointers, the pointer attribute
12892
+ * ::CU_POINTER_ATTRIBUTE_IS_MEM_DECOMPRESS_CAPABLE should give a
12893
+ * non-zero value. To ensure this, the memory backing the pointers
12894
+ * should have been allocated using one of the following CUDA memory
12895
+ * allocators:
12896
+ * * ::cuMemAlloc()
12897
+ * * ::cuMemCreate() with the usage flag ::CU_MEM_CREATE_USAGE_HW_DECOMPRESS
12898
+ * * ::cuMemAllocFromPoolAsync() from a pool that was created with
12899
+ * the usage flag ::CU_MEM_POOL_CREATE_USAGE_HW_DECOMPRESS
12900
+ * Additionally, ::CUmemDecompressParams.src, ::CUmemDecompressParams.dst,
12901
+ * and ::CUmemDecompressParams.dstActBytes, must all be accessible from
12902
+ * the device associated with the context where \p stream was created.
12903
+ * For information on how to ensure this, see the documentation for the
12904
+ * allocator of interest.
12905
+ *
12906
+ * \param[in] paramsArray The array of structures describing the independent
12907
+ * decompression operations.
12908
+ * \param[in] count The number of entries in \p paramsArray array.
12909
+ * \param[in] flags Must be 0.
12910
+ * \param[out] errorIndex The index into \p paramsArray of the decompression
12911
+ * operation to which the error returned by this
12912
+ * function pertains. If \p errorIndex is SIZE_MAX and
12913
+ * the value returned is not ::CUDA_SUCCESS, then the
12914
+ * error returned by this function should be considered
12915
+ * a general error that does not pertain to a
12916
+ * particular decompression operation. May be \p NULL,
12917
+ * in which case, no index will be recorded in the
12918
+ * event of error.
12919
+ * \param[in] stream The stream where the work will be enqueued.
12920
+ *
12921
+ * \return
12922
+ * ::CUDA_SUCCESS,
12923
+ * ::CUDA_ERROR_DEINITIALIZED,
12924
+ * ::CUDA_ERROR_NOT_INITIALIZED,
12925
+ * ::CUDA_ERROR_INVALID_CONTEXT,
12926
+ * ::CUDA_ERROR_INVALID_VALUE,
12927
+ * ::CUDA_ERROR_INVALID_HANDLE
12928
+ * \notefnerr
12929
+ * \note_async
12930
+ * \note_null_stream
12931
+ *
12932
+ * \sa ::cuMemAlloc, ::cuMemPoolCreate, ::cuMemAllocFromPoolAsync
12933
+ */
12934
+ CUresult CUDAAPI cuMemBatchDecompressAsync(
12935
+ CUmemDecompressParams *paramsArray,
12936
+ size_t count,
12937
+ unsigned int flags,
12938
+ size_t *errorIndex,
12939
+ CUstream stream
12940
+ );
12941
+
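A single-op batch sketch, assuming d_comp, d_out and d_actBytes were allocated with one of the decompress-capable allocators listed above, that unified addressing makes the CUdeviceptr-to-pointer casts valid, and that <string.h>, <stdint.h> and <stddef.h> are included:

    CUmemDecompressParams p;
    memset(&p, 0, sizeof(p));          /* also zeroes the reserved padding */
    p.src         = (const void *)(uintptr_t)d_comp;
    p.srcNumBytes = compBytes;
    p.dst         = (void *)(uintptr_t)d_out;
    p.dstNumBytes = outCapacity;       /* optional scheduling hint */
    p.dstActBytes = (cuuint32_t *)(uintptr_t)d_actBytes;
    p.algo        = CU_MEM_DECOMPRESS_ALGORITHM_DEFLATE;

    size_t errIdx = SIZE_MAX;
    CUresult rc = cuMemBatchDecompressAsync(&p, 1, 0, &errIdx, stream);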
11859
12942
  /** @} */ /* END CUDA_MEM */
11860
12943
 
11861
12944
  /**
@@ -11937,17 +13020,23 @@ CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size);
11937
13020
  * set ::CUmemAllocationProp::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
11938
13021
  * ::CUmemAllocationProp::CUmemLocation::id must specify the NUMA ID of the CPU.
11939
13022
  * On systems where NUMA is not available ::CUmemAllocationProp::CUmemLocation::id must be set to 0.
13023
+ * Specifying ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT or ::CU_MEM_LOCATION_TYPE_HOST as the
13024
+ * ::CUmemLocation::type will result in ::CUDA_ERROR_INVALID_VALUE.
13025
+ *
13026
+ * Applications that intend to use ::CU_MEM_HANDLE_TYPE_FABRIC based memory sharing must ensure:
13027
+ * (1) the `nvidia-caps-imex-channels` character device is created by the driver and is listed under /proc/devices, and
13028
+ * (2) at least one IMEX channel file is accessible to the user launching the application.
13029
+ *
13030
+ * When exporter and importer CUDA processes have been granted access to the same IMEX channel, they can securely
13031
+ * share memory.
11940
13032
  *
11941
- * Applications can set ::CUmemAllocationProp::requestedHandleTypes to
11942
- * ::CU_MEM_HANDLE_TYPE_FABRIC in order to create allocations suitable for sharing
11943
- * within an IMEX domain. An IMEX domain is either an OS instance or a group of securely
11944
- * connected OS instances using the NVIDIA IMEX daemon. An IMEX channel is a global resource
11945
- * within the IMEX domain that represents a logical entity that aims to provide fine grained
11946
- * accessibility control for the participating processes. When exporter and importer CUDA processes
11947
- * have been granted access to the same IMEX channel, they can securely share memory.
11948
- * If the allocating process does not have access setup for an IMEX channel, attempting to create
11949
- * a ::CUmemGenericAllocationHandle with ::CU_MEM_HANDLE_TYPE_FABRIC will result in ::CUDA_ERROR_NOT_PERMITTED.
11950
- * The nvidia-modprobe CLI provides more information regarding setting up of IMEX channels.
13033
+ * The IMEX channel security model works on a per-user basis, which means all processes under a user can share
13034
+ * memory if the user has access to a valid IMEX channel. When multi-user isolation is desired, a separate IMEX
13035
+ * channel is required for each user.
13036
+ *
13037
+ * These channel files exist in /dev/nvidia-caps-imex-channels/channel* and can be created using standard OS
13038
+ * native calls like mknod on Linux. For example, to create channel0 with the major number from /proc/devices,
13039
+ * users can execute the following command: `mknod /dev/nvidia-caps-imex-channels/channel0 c <major number> 0`
11951
13040
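Once a channel is accessible, a fabric-shareable allocation can be requested roughly as follows (a sketch assuming `device` holds a valid ::CUdevice; error checks abbreviated):

    CUmemAllocationProp prop = {0};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = device;
    prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;

    size_t granularity = 0;
    cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    CUmemGenericAllocationHandle handle;
    CUresult rc = cuMemCreate(&handle, granularity, &prop, 0);
    /* Expect failure here if the calling process has no IMEX channel access. */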
  *
11952
13041
  * If ::CUmemAllocationProp::allocFlags::usage contains ::CU_MEM_CREATE_USAGE_TILE_POOL flag then
11953
13042
  * the memory allocation is intended only to be used as backing tile pool for sparse CUDA arrays
@@ -12637,25 +13726,31 @@ CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPo
12637
13726
  * Creates a CUDA memory pool and returns the handle in \p pool. The \p poolProps determines
12638
13727
  * the properties of the pool such as the backing device and IPC capabilities.
12639
13728
  *
12640
- * To create a memory pool targeting a specific host NUMA node, applications must
12641
- * set ::CUmemPoolProps::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
12642
- * ::CUmemPoolProps::CUmemLocation::id must specify the NUMA ID of the host memory node.
13729
+ * To create a memory pool targeting a specific host NUMA node, applications must
13730
+ * set ::CUmemPoolProps::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
13731
+ * ::CUmemPoolProps::CUmemLocation::id must specify the NUMA ID of the host memory node.
13732
+ * Specifying ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT or ::CU_MEM_LOCATION_TYPE_HOST as the
13733
+ * ::CUmemPoolProps::CUmemLocation::type will result in ::CUDA_ERROR_INVALID_VALUE.
12643
13734
  * By default, the pool's memory will be accessible from the device it is allocated on.
12644
13735
  * In the case of pools created with ::CU_MEM_LOCATION_TYPE_HOST_NUMA, their default accessibility
12645
13736
  * will be from the host CPU.
12646
13737
  * Applications can control the maximum size of the pool by specifying a non-zero value for ::CUmemPoolProps::maxSize.
12647
13738
  * If set to 0, the maximum size of the pool will default to a system dependent value.
12648
13739
  *
12649
- * Applications can set ::CUmemPoolProps::handleTypes to ::CU_MEM_HANDLE_TYPE_FABRIC
12650
- * in order to create ::CUmemoryPool suitable for sharing within an IMEX domain.
12651
- * An IMEX domain is either an OS instance or a group of securely connected OS instances
12652
- * using the NVIDIA IMEX daemon. An IMEX channel is a global resource within the IMEX domain
12653
- * that represents a logical entity that aims to provide fine grained accessibility control
12654
- * for the participating processes. When exporter and importer CUDA processes have been
12655
- * granted access to the same IMEX channel, they can securely share memory.
12656
- * If the allocating process does not have access setup for an IMEX channel, attempting to export
12657
- * a ::CUmemoryPool with ::CU_MEM_HANDLE_TYPE_FABRIC will result in ::CUDA_ERROR_NOT_PERMITTED.
12658
- * The nvidia-modprobe CLI provides more information regarding setting up of IMEX channels.
13740
+ * Applications that intend to use ::CU_MEM_HANDLE_TYPE_FABRIC based memory sharing must ensure:
13741
+ * (1) `nvidia-caps-imex-channels` character device is created by the driver and is listed under /proc/devices
13742
+ * (2) at least one IMEX channel file is accessible by the user launching the application.
13743
+ *
13744
+ * When exporter and importer CUDA processes have been granted access to the same IMEX channel, they can securely
13745
+ * share memory.
13746
+ *
13747
+ * The IMEX channel security model works on a per-user basis, meaning all processes under a user can share
13748
+ * memory if the user has access to a valid IMEX channel. When multi-user isolation is desired, a separate IMEX
13749
+ * channel is required for each user.
13750
+ *
13751
+ * These channel files exist in /dev/nvidia-caps-imex-channels/channel* and can be created using standard OS
13752
+ * native calls like mknod on Linux. For example: To create channel0 with the major number from /proc/devices
13753
+ * users can execute the following command: `mknod /dev/nvidia-caps-imex-channels/channel0 c <major number> 0`
12659
13754
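The pool-based equivalent looks similar (sketch; `device` is a placeholder):

    CUmemPoolProps props = {0};
    props.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
    props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    props.location.id = device;
    props.handleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
    props.maxSize = 0;  /* 0: system-dependent default maximum */
    CUmemoryPool pool;
    CUresult rc = cuMemPoolCreate(&pool, &props);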
  *
12660
13755
  * \note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC.
12661
13756
  *
@@ -12962,8 +14057,8 @@ CUresult CUDAAPI cuMulticastAddDevice(CUmemGenericAllocationHandle mcHandle, CUd
12962
14057
  * returned by ::cuMulticastGetGranularity with the flag
12963
14058
  * ::CU_MULTICAST_GRANULARITY_RECOMMENDED.
12964
14059
  *
12965
- * The \p size + \p memOffset must be smaller than the size of the allocated
12966
- * memory. Similarly the \p size + \p mcOffset must be smaller than the size
14060
+ * The \p size + \p memOffset cannot be larger than the size of the allocated
14061
+ * memory. Similarly the \p size + \p mcOffset cannot be larger than the size
12967
14062
  * of the multicast object.
12968
14063
  * The memory allocation must have been created on one of the devices
12969
14064
  * that was added to the multicast team via ::cuMulticastAddDevice.
@@ -13010,8 +14105,8 @@ CUresult CUDAAPI cuMulticastBindMem(CUmemGenericAllocationHandle mcHandle, size_
13010
14105
  * aligned to the value returned by ::cuMulticastGetGranularity with the flag
13011
14106
  * ::CU_MULTICAST_GRANULARITY_RECOMMENDED.
13012
14107
  *
13013
- * The \p size must be smaller than the size of the allocated memory.
13014
- * Similarly the \p size + \p mcOffset must be smaller than the total size
14108
+ * The \p size cannot be larger than the size of the allocated memory.
14109
+ * Similarly the \p size + \p mcOffset cannot be larger than the total size
13015
14110
  * of the multicast object.
13016
14111
  * The memory allocation must have been created on one of the devices
13017
14112
  * that was added to the multicast team via ::cuMulticastAddDevice.
@@ -13052,7 +14147,7 @@ CUresult CUDAAPI cuMulticastBindAddr(CUmemGenericAllocationHandle mcHandle, size
13052
14147
  * The intended \p size of the unbind and the offset in the multicast range
13053
14148
  * ( \p mcOffset ) must be a multiple of the value returned by
13054
14149
  * ::cuMulticastGetGranularity with the flag ::CU_MULTICAST_GRANULARITY_MINIMUM.
13055
- * The \p size + \p mcOffset must be smaller than the total size of the
14150
+ * The \p size + \p mcOffset cannot be larger than the total size of the
13056
14151
  * multicast object.
13057
14152
  *
13058
14153
  * \note
@@ -13343,6 +14438,12 @@ CUresult CUDAAPI cuMulticastGetGranularity(size_t *granularity, const CUmulticas
13343
14438
  *
13344
14439
  * Returns in \p *data the handle to the mempool that the allocation was obtained from.
13345
14440
  *
14441
+ * - ::CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE:
14442
+ *
14443
+ * Returns in \p *data a boolean that indicates whether the pointer points
14444
+ * to memory that can be used for hardware-accelerated
14445
+ * decompression.
14446
+ *
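A query sketch for the new attribute (assuming the boolean is returned through an int, as with other boolean pointer attributes; `ptr` is a placeholder ::CUdeviceptr):

    int hwDecompressCapable = 0;
    cuPointerGetAttribute(&hwDecompressCapable,
                          CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE, ptr);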
13346
14447
  * \par
13347
14448
  *
13348
14449
  * Note that for most allocations in the unified virtual address space
@@ -13397,7 +14498,9 @@ CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute
13397
14498
  * base device pointer of the memory to be prefetched and \p dstDevice is the
13398
14499
  * destination device. \p count specifies the number of bytes to copy. \p hStream
13399
14500
  * is the stream in which the operation is enqueued. The memory range must refer
13400
- * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
14501
+ * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables,
14502
+ * or to system-allocated memory on systems with a non-zero
14503
+ * CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
13401
14504
  *
13402
14505
  * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If
13403
14506
  * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
@@ -13957,6 +15060,7 @@ CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute at
13957
15060
  * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE
13958
15061
  * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES
13959
15062
  * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE
15063
+ * - ::CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE
13960
15064
  *
13961
15065
  * \param numAttributes - Number of attributes to query
13962
15066
  * \param attributes - An array of attributes to query
@@ -14027,8 +15131,10 @@ CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_at
14027
15131
  *
14028
15132
  * \sa ::cuStreamDestroy,
14029
15133
  * ::cuStreamCreateWithPriority,
15134
+ * ::cuGreenCtxStreamCreate,
14030
15135
  * ::cuStreamGetPriority,
14031
15136
  * ::cuStreamGetFlags,
15137
+ * ::cuStreamGetDevice,
14032
15138
  * ::cuStreamWaitEvent,
14033
15139
  * ::cuStreamQuery,
14034
15140
  * ::cuStreamSynchronize,
@@ -14078,9 +15184,11 @@ CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags);
14078
15184
  *
14079
15185
  * \sa ::cuStreamDestroy,
14080
15186
  * ::cuStreamCreate,
15187
+ * ::cuGreenCtxStreamCreate,
14081
15188
  * ::cuStreamGetPriority,
14082
15189
  * ::cuCtxGetStreamPriorityRange,
14083
15190
  * ::cuStreamGetFlags,
15191
+ * ::cuStreamGetDevice,
14084
15192
  * ::cuStreamWaitEvent,
14085
15193
  * ::cuStreamQuery,
14086
15194
  * ::cuStreamSynchronize,
@@ -14093,7 +15201,7 @@ CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int fla
14093
15201
  /**
14094
15202
  * \brief Query the priority of a given stream
14095
15203
  *
14096
- * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority
15204
+ * Query the priority of a stream created using ::cuStreamCreate, ::cuStreamCreateWithPriority or ::cuGreenCtxStreamCreate
14097
15205
  * and return the priority in \p priority. Note that if the stream was created with a
14098
15206
  * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
14099
15207
  * this function returns the clamped priority.
@@ -14114,16 +15222,44 @@ CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int fla
14114
15222
  * \sa ::cuStreamDestroy,
14115
15223
  * ::cuStreamCreate,
14116
15224
  * ::cuStreamCreateWithPriority,
15225
+ * ::cuGreenCtxStreamCreate,
14117
15226
  * ::cuCtxGetStreamPriorityRange,
14118
15227
  * ::cuStreamGetFlags,
15228
+ * ::cuStreamGetDevice,
14119
15229
  * ::cudaStreamGetPriority
14120
15230
  */
14121
15231
  CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
14122
15232
 
15233
+ /**
15234
+ * \brief Returns the device handle of the stream
15235
+ *
15236
+ * Returns in \p *device the device handle of the stream.
15237
+ *
15238
+ * \param hStream - Handle to the stream to be queried
15239
+ * \param device - Returns the device to which a stream belongs
15240
+ *
15241
+ * \return
15242
+ * ::CUDA_SUCCESS,
15243
+ * ::CUDA_ERROR_DEINITIALIZED,
15244
+ * ::CUDA_ERROR_NOT_INITIALIZED,
15245
+ * ::CUDA_ERROR_INVALID_CONTEXT,
15246
+ * ::CUDA_ERROR_INVALID_VALUE,
15247
+ * ::CUDA_ERROR_INVALID_HANDLE,
15248
+ * ::CUDA_ERROR_OUT_OF_MEMORY
15249
+ * \notefnerr
15250
+ *
15251
+ * \sa
15252
+ * ::cuStreamDestroy,
15253
+ * ::cuStreamCreate,
15254
+ * ::cuGreenCtxStreamCreate,
15255
+ * ::cuStreamGetFlags
15256
+ */
15257
+ CUresult CUDAAPI cuStreamGetDevice(CUstream hStream, CUdevice *device);
15258
+
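Usage is a single call (sketch; `hStream` is assumed valid):

    CUdevice dev;
    CUresult rc = cuStreamGetDevice(hStream, &dev);
    /* dev now identifies the device the stream belongs to. */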
14123
15259
  /**
14124
15260
  * \brief Query the flags of a given stream
14125
15261
  *
14126
- * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority
15262
+ * Query the flags of a stream created using ::cuStreamCreate, ::cuStreamCreateWithPriority or ::cuGreenCtxStreamCreate
14127
15263
  * and return the flags in \p flags.
14128
15264
  *
14129
15265
  * \param hStream - Handle to the stream to be queried
@@ -14143,8 +15279,10 @@ CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
14143
15279
  *
14144
15280
  * \sa ::cuStreamDestroy,
14145
15281
  * ::cuStreamCreate,
15282
+ * ::cuGreenCtxStreamCreate,
14146
15283
  * ::cuStreamGetPriority,
14147
15284
  * ::cudaStreamGetFlags,
15285
+ * ::cuStreamGetDevice
14148
15286
  */
14149
15287
  CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags);
14150
15288
 
@@ -14186,6 +15324,10 @@ CUresult CUDAAPI cuStreamGetId(CUstream hStream, unsigned long long *streamId);
14186
15324
  *
14187
15325
  * Returns the CUDA context that the stream is associated with.
14188
15326
  *
15327
+ * Note there is a later version of this API, ::cuStreamGetCtx_v2. It will
15328
+ * supplant this version in CUDA 13.0. It is recommended to use ::cuStreamGetCtx_v2
15329
+ * until then, as this version will return ::CUDA_ERROR_NOT_SUPPORTED for streams created via ::cuGreenCtxStreamCreate.
15330
+ *
14189
15331
  * The stream handle \p hStream can refer to any of the following:
14190
15332
  * <ul>
14191
15333
  * <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
@@ -14210,21 +15352,82 @@ CUresult CUDAAPI cuStreamGetId(CUstream hStream, unsigned long long *streamId);
14210
15352
  * ::CUDA_ERROR_NOT_INITIALIZED,
14211
15353
  * ::CUDA_ERROR_INVALID_CONTEXT,
14212
15354
  * ::CUDA_ERROR_INVALID_HANDLE,
15355
+ * ::CUDA_ERROR_NOT_SUPPORTED
14213
15356
  * \notefnerr
14214
15357
  *
14215
15358
  * \sa ::cuStreamDestroy,
14216
15359
  * ::cuStreamCreateWithPriority,
14217
15360
  * ::cuStreamGetPriority,
14218
15361
  * ::cuStreamGetFlags,
15362
+ * ::cuStreamGetDevice,
14219
15363
  * ::cuStreamWaitEvent,
14220
15364
  * ::cuStreamQuery,
14221
15365
  * ::cuStreamSynchronize,
14222
15366
  * ::cuStreamAddCallback,
14223
15367
  * ::cudaStreamCreate,
15368
+ * ::cuStreamGetCtx_v2,
14224
15369
  * ::cudaStreamCreateWithFlags
14225
15370
  */
14226
15371
  CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
14227
15372
 
15373
+ /**
15374
+ * \brief Query the contexts associated with a stream
15375
+ *
15376
+ * Returns the contexts that the stream is associated with.
15377
+ *
15378
+ * If the stream is associated with a green context, the API returns the green context in \p pGreenCtx
15379
+ * and the primary context of the associated device in \p pCtx.
15380
+ *
15381
+ * If the stream is associated with a regular context, the API returns the regular context in \p pCtx
15382
+ * and NULL in \p pGreenCtx.
15383
+ *
15384
+ * The stream handle \p hStream can refer to any of the following:
15385
+ * <ul>
15386
+ * <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate,
15387
+ * ::cuStreamCreateWithPriority and ::cuGreenCtxStreamCreate, or their runtime API equivalents such as
15388
+ * ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
15389
+ * Passing an invalid handle will result in undefined behavior.</li>
15390
+ * <li>any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
15391
+ * ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
15392
+ * which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.
15393
+ * If any of the special handles are specified, the API will operate on the context current to the
15394
+ * calling thread. If a green context (that was converted via ::cuCtxFromGreenCtx() before setting it current)
15395
+ * is current to the calling thread, the API will return the green context in \p pGreenCtx
15396
+ * and the primary context of the associated device in \p pCtx. If a regular context is current,
15397
+ * the API returns the regular context in \p pCtx and NULL in \p pGreenCtx.
15398
+ * Note that specifying ::CU_STREAM_PER_THREAD or ::cudaStreamPerThread will return ::CUDA_ERROR_INVALID_HANDLE
15399
+ * if a green context is current to the calling thread.
15400
+ * If no context is current to the calling thread, ::CUDA_ERROR_INVALID_CONTEXT is returned.</li>
15401
+ * </ul>
15402
+ *
15403
+ * \param hStream - Handle to the stream to be queried
15404
+ * \param pCtx - Returned regular context associated with the stream
15405
+ * \param pGreenCtx - Returned green context if the stream is associated with a green context or NULL if not
15406
+ *
15407
+ * \return
15408
+ * ::CUDA_SUCCESS,
15409
+ * ::CUDA_ERROR_DEINITIALIZED,
15410
+ * ::CUDA_ERROR_NOT_INITIALIZED,
15411
+ * ::CUDA_ERROR_INVALID_CONTEXT,
15412
+ * ::CUDA_ERROR_INVALID_HANDLE
15413
+ * \notefnerr
15414
+ *
15415
+ * \sa ::cuStreamDestroy,
15416
+ * ::cuStreamCreate,
15417
+ * ::cuStreamCreateWithPriority,
15418
+ * ::cuGreenCtxStreamCreate,
15419
+ * ::cuStreamGetPriority,
15420
+ * ::cuStreamGetFlags,
15421
+ * ::cuStreamGetDevice,
15422
+ * ::cuStreamWaitEvent,
15423
+ * ::cuStreamQuery,
15424
+ * ::cuStreamSynchronize,
15425
+ * ::cuStreamAddCallback,
15426
+ * ::cudaStreamCreate,
15427
+ * ::cudaStreamCreateWithFlags
15428
+ */
15429
+ CUresult CUDAAPI cuStreamGetCtx_v2(CUstream hStream, CUcontext *pCtx, CUgreenCtx *pGreenCtx);
15430
+
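A sketch of telling green-context streams apart from regular ones with this query:

    CUcontext ctx = NULL;
    CUgreenCtx greenCtx = NULL;
    if (cuStreamGetCtx_v2(hStream, &ctx, &greenCtx) == CUDA_SUCCESS) {
        if (greenCtx != NULL) {
            /* Stream belongs to a green context; ctx is the primary
             * context of the associated device. */
        } else {
            /* Regular stream; ctx is its context. */
        }
    }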
14228
15431
  /**
14229
15432
  * \brief Make a compute stream wait on an event
14230
15433
  *
@@ -14545,6 +15748,7 @@ CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph);
14545
15748
  */
14546
15749
  CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus);
14547
15750
 
15751
+
14548
15752
  /**
14549
15753
  * \brief Query a stream's capture state
14550
15754
  *
@@ -15031,7 +16235,8 @@ CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags);
15031
16235
  * \brief Records an event
15032
16236
  *
15033
16237
  * Captures in \p hEvent the contents of \p hStream at the time of this call.
15034
- * \p hEvent and \p hStream must be from the same context.
16238
+ * \p hEvent and \p hStream must be from the same context otherwise
16239
+ * ::CUDA_ERROR_INVALID_HANDLE is returned.
15035
16240
  * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
15036
16241
  * examine or wait for completion of the work that was captured. Uses of
15037
16242
  * \p hStream after this call do not modify \p hEvent. See note on default
@@ -15073,7 +16278,8 @@ CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream);
15073
16278
  * \brief Records an event
15074
16279
  *
15075
16280
  * Captures in \p hEvent the contents of \p hStream at the time of this call.
15076
- * \p hEvent and \p hStream must be from the same context.
16281
+ * \p hEvent and \p hStream must be from the same context otherwise
16282
+ * ::CUDA_ERROR_INVALID_HANDLE is returned.
15077
16283
  * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
15078
16284
  * examine or wait for completion of the work that was captured. Uses of
15079
16285
  * \p hStream after this call do not modify \p hEvent. See note on default
@@ -15231,6 +16437,9 @@ CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
15231
16437
  * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with
15232
16438
  * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return
15233
16439
  * ::CUDA_ERROR_INVALID_HANDLE.
16440
+ *
16441
+ * Note there is a later version of this API, ::cuEventElapsedTime_v2. It will
16442
+ * supplant this version in CUDA 13.0; this version is retained only for minor version compatibility.
15234
16443
  *
15235
16444
  * \param pMilliseconds - Time between \p hStart and \p hEnd in ms
15236
16445
  * \param hStart - Starting event
@@ -15255,6 +16464,54 @@ CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
15255
16464
  */
15256
16465
  CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
15257
16466
 
16467
+ /**
16468
+ * \brief Computes the elapsed time between two events
16469
+ *
16470
+ * Computes the elapsed time between two events (in milliseconds with a
16471
+ * resolution of around 0.5 microseconds). Note this API is not guaranteed
16472
+ * to return the latest errors for pending work. As such, this API is intended to
16473
+ * serve as an elapsed-time calculation only; any polling for completion on the
16474
+ * events to be compared should be done with ::cuEventQuery instead.
16475
+ *
16476
+ * If either event was last recorded in a non-NULL stream, the resulting time
16477
+ * may be greater than expected (even if both used the same stream handle). This
16478
+ * happens because the ::cuEventRecord() operation takes place asynchronously
16479
+ * and there is no guarantee that the measured latency is actually just between
16480
+ * the two events. Any number of other different stream operations could execute
16481
+ * in between the two measured events, thus altering the timing in a significant
16482
+ * way.
16483
+ *
16484
+ * If ::cuEventRecord() has not been called on either event then
16485
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called
16486
+ * on both events but one or both of them has not yet been completed (that is,
16487
+ * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the
16488
+ * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with
16489
+ * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return
16490
+ * ::CUDA_ERROR_INVALID_HANDLE.
16491
+ *
16492
+ * \param pMilliseconds - Time between \p hStart and \p hEnd in ms
16493
+ * \param hStart - Starting event
16494
+ * \param hEnd - Ending event
16495
+ *
16496
+ * \return
16497
+ * ::CUDA_SUCCESS,
16498
+ * ::CUDA_ERROR_DEINITIALIZED,
16499
+ * ::CUDA_ERROR_NOT_INITIALIZED,
16500
+ * ::CUDA_ERROR_INVALID_CONTEXT,
16501
+ * ::CUDA_ERROR_INVALID_HANDLE,
16502
+ * ::CUDA_ERROR_NOT_READY,
16503
+ * ::CUDA_ERROR_UNKNOWN
16504
+ * \notefnerr
16505
+ *
16506
+ * \sa ::cuEventCreate,
16507
+ * ::cuEventRecord,
16508
+ * ::cuEventQuery,
16509
+ * ::cuEventSynchronize,
16510
+ * ::cuEventDestroy,
16511
+ * ::cudaEventElapsedTime
16512
+ */
16513
+ CUresult CUDAAPI cuEventElapsedTime_v2(float *pMilliseconds, CUevent hStart, CUevent hEnd);
16514
+
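A timing sketch using the _v2 entry point (work enqueue elided; per the note above, completion is polled with ::cuEventSynchronize / ::cuEventQuery, not with the elapsed-time call itself):

    CUevent start, stop;
    cuEventCreate(&start, CU_EVENT_DEFAULT);
    cuEventCreate(&stop, CU_EVENT_DEFAULT);
    cuEventRecord(start, hStream);
    /* ... enqueue the work to be timed on hStream ... */
    cuEventRecord(stop, hStream);
    cuEventSynchronize(stop);
    float ms = 0.0f;
    cuEventElapsedTime_v2(&ms, start, stop);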
15258
16515
  /** @} */ /* END CUDA_EVENT */
15259
16516
 
15260
16517
  /**
@@ -15308,7 +16565,7 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven
15308
16565
  CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5,
15309
16566
  CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6,
15310
16567
  CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7,
15311
- CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8
16568
+ CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8,
15312
16569
  } CUexternalMemoryHandleType;
15313
16570
  * \endcode
15314
16571
  *
@@ -15522,6 +16779,7 @@ CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternal
15522
16779
  * If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then
15523
16780
  * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1.
15524
16781
  *
16782
+ *
15525
16783
  * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy.
15526
16784
  *
15527
16785
  * \param mipmap - Returned CUDA mipmapped array
@@ -16280,6 +17538,9 @@ CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunc
16280
17538
  * positive. The validity of the cluster dimensions is checked at launch time.
16281
17539
  * If the value is set during compile time, it cannot be set at runtime.
16282
17540
  * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
17541
+ * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
17542
+ * the function can be launched with non-portable cluster size. 1 is allowed,
17543
+ * 0 is disallowed.
16283
17544
  * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
16284
17545
  * scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
16285
17546
  *
@@ -16679,6 +17940,7 @@ CUresult CUDAAPI cuLaunchKernel(CUfunction f,
16679
17940
  * CU_LAUNCH_ATTRIBUTE_PRIORITY = 8,
16680
17941
  * CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP = 9,
16681
17942
  * CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN = 10,
17943
+ * CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION = 11,
16682
17944
  * CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT = 12,
16683
17945
  * CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE = 13,
16684
17946
  * } CUlaunchAttributeID;
@@ -16706,6 +17968,11 @@ CUresult CUDAAPI cuLaunchKernel(CUfunction f,
16706
17968
  * CUlaunchMemSyncDomainMap memSyncDomainMap;
16707
17969
  * CUlaunchMemSyncDomain memSyncDomain;
16708
17970
  * struct {
17971
+ * unsigned int x;
17972
+ * unsigned int y;
17973
+ * unsigned int z;
17974
+ * } preferredClusterDim;
17975
+ * struct {
16709
17976
  * CUevent event;
16710
17977
  * int flags;
16711
17978
  * } launchCompletionEvent;
@@ -16776,6 +18043,36 @@ CUresult CUDAAPI cuLaunchKernel(CUfunction f,
16776
18043
  * opt out, and any attempt to set the attribute to 0 will result in an error. Graphs
16777
18044
  * containing one or more device-updatable node also do not allow multiple instantiation.
16778
18045
  *
18046
+ * ::CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION allows the kernel launch to
18047
+ * specify a preferred substitute cluster dimension. Blocks may be grouped
18048
+ * according to either the dimensions specified with this attribute (grouped
18049
+ * into a "preferred substitute cluster"), or the one specified with
18050
+ * ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION attribute (grouped into a "regular
18051
+ * cluster"). The cluster dimensions of a "preferred substitute cluster" shall
18052
+ * be an integer multiple greater than zero of the regular cluster dimensions.
18053
+ * The device will attempt - on a best-effort basis - to group thread blocks
18054
+ * into preferred clusters over grouping them into regular clusters. When it
18055
+ * deems necessary (primarily when the device temporarily runs out of physical
18056
+ * resources to launch the larger preferred clusters), the device may switch to
18057
+ * launch the regular clusters instead to attempt to utilize as much of the
18058
+ * physical device resources as possible.
18059
+ *
18060
+ * Each type of cluster will have its enumeration / coordinate setup as if the
18061
+ * grid consists solely of its type of cluster. For example, if the preferred
18062
+ * substitute cluster dimensions double the regular cluster dimensions, there
18063
+ * might be simultaneously a regular cluster indexed at (1,0,0), and a preferred
18064
+ * cluster indexed at (1,0,0). In this example, the preferred substitute cluster
18065
+ * (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their
18066
+ * blocks.
18067
+ *
18068
+ * This attribute will only take effect when a regular cluster dimension has
18069
+ * been specified. The preferred substitute cluster
18070
+ * dimension must be an integer multiple greater than zero of the regular
18071
+ * cluster dimension and must divide the grid. It must also be no more than
18072
+ * `maxBlocksPerCluster`, if that is set in the kernel's `__launch_bounds__`;
18073
+ * otherwise it must be less than the maximum value the driver can support.
18074
+ * Beyond these limits, setting this attribute to a value physically unable to fit on any
18075
+ * particular device is permitted.
16779
18076
  *
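A launch sketch combining the two cluster attributes (the kernel handle, dimensions, and stream are placeholders; the preferred dimension is an integer multiple of the regular one and divides the grid):

    CUlaunchAttribute attrs[2];
    attrs[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
    attrs[0].value.clusterDim.x = 2;
    attrs[0].value.clusterDim.y = 1;
    attrs[0].value.clusterDim.z = 1;
    attrs[1].id = CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION;
    attrs[1].value.preferredClusterDim.x = 4;  /* 2x the regular dimension */
    attrs[1].value.preferredClusterDim.y = 1;
    attrs[1].value.preferredClusterDim.z = 1;

    CUlaunchConfig cfg = {0};
    cfg.gridDimX = 16;   cfg.gridDimY = 1;  cfg.gridDimZ = 1;  /* divisible by 4 */
    cfg.blockDimX = 128; cfg.blockDimY = 1; cfg.blockDimZ = 1;
    cfg.hStream = hStream;
    cfg.attrs = attrs;
    cfg.numAttrs = 2;
    CUresult rc = cuLaunchKernelEx(&cfg, kernel, NULL, NULL);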
16780
18077
  * The effect of other attributes is consistent with their effect when set via
16781
18078
  * persistent APIs.
@@ -16844,12 +18141,6 @@ CUresult CUDAAPI cuLaunchKernelEx(const CUlaunchConfig *config,
16844
18141
  * grid of blocks. Each block contains \p blockDimX x \p blockDimY x
16845
18142
  * \p blockDimZ threads.
16846
18143
  *
16847
- * Note that the API can also be used to launch context-less kernel ::CUkernel
16848
- * by querying the handle using ::cuLibraryGetKernel() and then passing it
16849
- * to the API by casting to ::CUfunction. Here, the context to launch
16850
- * the kernel on will either be taken from the specified stream \p hStream
16851
- * or the current context in case of NULL stream.
16852
- *
16853
18144
  * \p sharedMemBytes sets the amount of dynamic shared memory that will be
16854
18145
  * available to each thread block.
16855
18146
  *
@@ -19826,18 +21117,22 @@ CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphN
19826
21117
  * contained \p memsetParams at instantiation. hNode must remain in the graph which was
19827
21118
  * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored.
19828
21119
  *
19829
- * The destination memory in \p memsetParams must be allocated from the same
19830
- * contexts as the original destination memory. Both the instantiation-time
19831
- * memory operand and the memory operand in \p memsetParams must be 1-dimensional.
19832
- * Zero-length operations are not supported.
21120
+ * Zero-sized operations are not supported.
21121
+ *
21122
+ * The new destination pointer in \p memsetParams must point to the same kind of allocation
21123
+ * as the original destination pointer and have the same context association and device mapping
21124
+ * as the original destination pointer.
21125
+ *
21126
+ * Both the value and pointer address may be updated.
21127
+ * Changing other aspects of the memset (width, height, element size or pitch) may cause the update to be rejected.
21128
+ * Specifically, for 2d memsets, all dimension changes are rejected.
21129
+ * For 1d memsets, changes in height are explicitly rejected, and other changes are opportunistically allowed
21130
+ * if the resulting work maps onto the work resources already allocated for the node.
19833
21131
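A sketch of the kind of update that remains valid (`nodeParams` is a copy of the parameters the node was created with; `newDst` and `ctx` are placeholders):

    CUDA_MEMSET_NODE_PARAMS p = nodeParams;
    p.dst   = newDst;   /* same allocation kind, context association, mapping */
    p.value = 0xFF;     /* value updates are always permitted */
    CUresult rc = cuGraphExecMemsetNodeSetParams(hGraphExec, hNode, &p, ctx);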
  *
19834
21132
  * The modifications only affect future launches of \p hGraphExec. Already enqueued
19835
21133
  * or running launches of \p hGraphExec are not affected by this call. hNode is also
19836
21134
  * not modified by this call.
19837
21135
  *
19838
- * Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or
19839
- * either the original or new memory operand are multidimensional.
19840
- *
19841
21136
  * \param hGraphExec - The executable graph in which to set the specified node
19842
21137
  * \param hNode - Memset node from the graph which was used to instantiate graphExec
19843
21138
  * \param memsetParams - The updated parameters to set
@@ -20319,7 +21614,9 @@ CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph);
20319
21614
  * - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
20320
21615
  * - The source/destination memory must be allocated from the same contexts as the original
20321
21616
  * source/destination memory.
20322
- * - Only 1D memsets can be changed.
21617
+ * - For 2d memsets, only the address and assigned value may be updated.
21618
+ * - For 1d memsets, updating dimensions is also allowed, but may fail if the resulting operation doesn't
21619
+ * map onto the work resources already allocated for the node.
20323
21620
  * - Additional memcpy node restrictions:
20324
21621
  * - Changing either the source or destination memory type(i.e. CU_MEMORYTYPE_DEVICE,
20325
21622
  * CU_MEMORYTYPE_ARRAY, etc.) is not supported.
@@ -20776,6 +22073,7 @@ CUresult CUDAAPI cuGraphExecNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hN
20776
22073
  * \param hGraph - Graph which will contain the conditional node using this handle.
20777
22074
  * \param ctx - Context for the handle and associated conditional node.
20778
22075
  * \param defaultLaunchValue - Optional initial value for the conditional variable.
22076
+ * Applied at the beginning of each graph execution if CU_GRAPH_COND_ASSIGN_DEFAULT is set in \p flags.
20779
22077
  * \param flags - Currently must be CU_GRAPH_COND_ASSIGN_DEFAULT or 0.
20780
22078
  *
20781
22079
  * \return
@@ -20810,6 +22108,11 @@ CUresult CUDAAPI cuGraphConditionalHandleCreate(CUgraphConditionalHandle *pHandl
20810
22108
  * Returns in \p *numBlocks the number of the maximum active blocks per
20811
22109
  * streaming multiprocessor.
20812
22110
  *
22111
+ * Note that the API can also be used with a context-less kernel ::CUkernel
22112
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
22113
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
22114
+ * will be the current context.
22115
+ *
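A sketch of the context-less path (assumes `library` was loaded via the library management APIs and exports a kernel named "myKernel"):

    CUkernel k;
    cuLibraryGetKernel(&k, library, "myKernel");
    int numBlocks = 0;
    cuOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, (CUfunction)k,
                                                256 /* blockSize */,
                                                0   /* dynamicSMemSize */);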
20813
22116
  * \param numBlocks - Returned occupancy
20814
22117
  * \param func - Kernel for which occupancy is calculated
20815
22118
  * \param blockSize - Block size the kernel is intended to be launched with
@@ -20851,6 +22154,11 @@ CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUf
20851
22154
  * can be found about this feature in the "Unified L1/Texture Cache"
20852
22155
  * section of the Maxwell tuning guide.
20853
22156
  *
22157
+ * Note that the API can also be used with a context-less kernel ::CUkernel
22158
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
22159
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
22160
+ * will be the current context.
22161
+ *
20854
22162
  * \param numBlocks - Returned occupancy
20855
22163
  * \param func - Kernel for which occupancy is calculated
20856
22164
  * \param blockSize - Block size the kernel is intended to be launched with
@@ -20902,6 +22210,11 @@ CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBl
20902
22210
  * size_t blockToSmem(int blockSize);
20903
22211
  * \endcode
20904
22212
  *
22213
+ * Note that the API can also be used with a context-less kernel ::CUkernel
22214
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
22215
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
22216
+ * will be the current context.
22217
+ *
20905
22218
  * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
20906
22219
  * \param blockSize - Returned maximum block size that can achieve the maximum occupancy
20907
22220
  * \param func - Kernel for which launch configuration is calculated
@@ -20947,6 +22260,11 @@ CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSi
20947
22260
  * can be found about this feature in the "Unified L1/Texture Cache"
20948
22261
  * section of the Maxwell tuning guide.
20949
22262
  *
22263
+ * Note that the API can also be used with a context-less kernel ::CUkernel
22264
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
22265
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
22266
+ * will be the current context.
22267
+ *
20950
22268
  * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
20951
22269
  * \param blockSize - Returned maximum block size that can achieve the maximum occupancy
20952
22270
  * \param func - Kernel for which launch configuration is calculated
@@ -20974,6 +22292,11 @@ CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int
20974
22292
  *
20975
22293
  * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM.
20976
22294
  *
22295
+ * Note that the API can also be used with a context-less kernel ::CUkernel
22296
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
22297
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
22298
+ * will be the current context.
22299
+ *
20977
22300
  * \param dynamicSmemSize - Returned maximum dynamic shared memory
20978
22301
  * \param func - Kernel function for which occupancy is calculated
20979
22302
  * \param numBlocks - Number of blocks to fit on SM
@@ -21004,6 +22327,12 @@ CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize
21004
22327
  *
21005
22328
  * This function will respect the compile time launch bounds.
21006
22329
  *
22330
+ * Note that the API can also be used with a context-less kernel ::CUkernel
22331
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
22332
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
22333
+ * will either be taken from the specified stream \p config->hStream
22334
+ * or the current context in case of NULL stream.
22335
+ *
21007
22336
  * \param clusterSize - Returned maximum cluster size that can be launched
21008
22337
  * for the given kernel function and launch configuration
21009
22338
  * \param func - Kernel function for which maximum cluster
@@ -21040,6 +22369,12 @@ CUresult CUDAAPI cuOccupancyMaxPotentialClusterSize(int *clusterSize, CUfunction
21040
22369
  * calculation. Runtime environment may affect how the hardware schedules
21041
22370
  * the clusters, so the calculated occupancy is not guaranteed to be achievable.
21042
22371
  *
22372
+ * Note that the API can also be used with a context-less kernel ::CUkernel
22373
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
22374
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
22375
+ * will either be taken from the specified stream \p config->hStream
22376
+ * or the current context in case of NULL stream.
22377
+ *
21043
22378
  * \param numClusters - Returned maximum number of clusters that
21044
22379
  * could co-exist on the target device
21045
22380
  * \param func - Kernel function for which maximum number
@@ -22004,7 +23339,8 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref
22004
23339
  * \p pResViewDesc is an optional argument that specifies an alternate format for
22005
23340
  * the data described by \p pResDesc, and also describes the subresource region
22006
23341
  * to restrict access to when texturing. \p pResViewDesc can only be specified if
22007
- * the type of resource is a CUDA array or a CUDA mipmapped array.
23342
+ * the type of resource is a CUDA array or a CUDA mipmapped array that is not in a block-
23343
+ * compressed format.
22008
23344
  *
22009
23345
  * Texture objects are only supported on devices of compute capability 3.0 or higher.
22010
23346
  * Additionally, a texture object is an opaque value, and, as such, should only be
@@ -22412,7 +23748,7 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
22412
23748
  *
22413
23749
  * Tensor map objects are only supported on devices of compute capability 9.0 or higher.
22414
23750
  * Additionally, a tensor map object is an opaque value, and, as such, should only be
22415
- * accessed through CUDA API calls.
23751
+ * accessed through CUDA APIs and PTX.
22416
23752
  *
22417
23753
  * The parameters passed are bound to the following requirements:
22418
23754
  *
@@ -22433,21 +23769,33 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
22433
23769
  CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, // 2 bytes
22434
23770
  CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ, // 4 bytes
22435
23771
  CU_TENSOR_MAP_DATA_TYPE_TFLOAT32, // 4 bytes
22436
- CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ // 4 bytes
23772
+ CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ, // 4 bytes
23773
+ CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, // 4 bits
23774
+ CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, // 4 bits
23775
+ CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B // 6 bits
22437
23776
  } CUtensorMapDataType;
22438
23777
  * \endcode
23778
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned to 8 bytes. There are no gaps between packed values.
23779
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned to 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values.
23780
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned to 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values.
22439
23781
  *
22440
23782
  * - \p tensorRank must be non-zero and less than or equal to the maximum supported dimensionality of 5. If \p interleave is not
22441
23783
  * ::CU_TENSOR_MAP_INTERLEAVE_NONE, then \p tensorRank must additionally be greater than or equal to 3.
22442
23784
  *
22443
- * - \p globalAddress, which specifies the starting address of the memory region described, must be 32 byte aligned when \p interleave is
22444
- * ::CU_TENSOR_MAP_INTERLEAVE_32B and 16 byte aligned otherwise.
23785
+ * - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met:
23786
+ * - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned.
23787
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned.
22445
23788
  *
22446
- * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
22447
- * equal to 2^32.
23789
+ * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
23790
+ * equal to 2^32. Additionally, the following requirements need to be met for the packed data types:
23791
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128.
23792
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2.
23793
+ * - The dimensions specified for the packed data types must reflect the number of individual U# values.
22448
23794
  *
22449
23795
  * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
22450
- * multiple of 16 and less than 2^40. Additionally, the stride must be a multiple of 32 when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B.
23796
+ * multiple of 16 and less than 2^40. Additionally, the following requirements need to be met:
23797
+ * - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32.
23798
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32.
22451
23799
  * Each following dimension specified includes previous dimension stride:
22452
23800
  * \code
22453
23801
  globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
@@ -22457,9 +23805,9 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
22457
23805
  * \endcode
22458
23806
  *
22459
23807
  * - \p boxDim array, which specifies number of elements to be traversed along each of the \p tensorRank dimensions, must be non-zero
22460
- * and less than or equal to 256.
22461
- * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, { \p boxDim[0] * elementSizeInBytes( \p tensorDataType ) } must be a multiple
22462
- * of 16 bytes.
23808
+ * and less than or equal to 256. Additionally, the following requirements need to be met:
23809
+ * - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, { \p boxDim[0] * elementSizeInBytes( \p tensorDataType ) } must be a multiple of 16 bytes.
23810
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, boxDim[0] must be 128.
22463
23811
  *
22464
23812
  * - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less
22465
23813
  * than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since
@@ -22480,17 +23828,21 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
22480
23828
  * uses 32 bytes.
22481
23829
  * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE and \p swizzle is not ::CU_TENSOR_MAP_SWIZZLE_NONE, the bounding box inner dimension
22482
23830
  * (computed as \p boxDim[0] multiplied by element size derived from \p tensorDataType) must be less than or equal to the swizzle size.
22483
- * - CU_TENSOR_MAP_SWIZZLE_32B implies the bounding box inner dimension will be <= 32.
22484
- * - CU_TENSOR_MAP_SWIZZLE_64B implies the bounding box inner dimension will be <= 64.
22485
- * - CU_TENSOR_MAP_SWIZZLE_128B implies the bounding box inner dimension will be <= 128.
23831
+ * - CU_TENSOR_MAP_SWIZZLE_32B requires the bounding box inner dimension to be <= 32.
23832
+ * - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64.
23833
+ * - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128.
23834
+ * Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE.
22486
23835
  *
22487
23836
  * - \p swizzle, which specifies the shared memory bank swizzling pattern, has to be of type ::CUtensorMapSwizzle which is defined as:
22488
23837
  * \code
22489
23838
  typedef enum CUtensorMapSwizzle_enum {
22490
23839
  CU_TENSOR_MAP_SWIZZLE_NONE = 0,
22491
- CU_TENSOR_MAP_SWIZZLE_32B,
22492
- CU_TENSOR_MAP_SWIZZLE_64B,
22493
- CU_TENSOR_MAP_SWIZZLE_128B
23840
+ CU_TENSOR_MAP_SWIZZLE_32B, // Swizzle 16B chunks within 32B span
23841
+ CU_TENSOR_MAP_SWIZZLE_64B, // Swizzle 16B chunks within 64B span
23842
+ CU_TENSOR_MAP_SWIZZLE_128B, // Swizzle 16B chunks within 128B span
23843
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B, // Swizzle 32B chunks within 128B span
23844
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B, // Swizzle 32B chunks within 128B span, additionally swap lower 8B with upper 8B within each 16B for every alternate row
23845
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B // Swizzle 64B chunks within 128B span
22494
23846
  } CUtensorMapSwizzle;
22495
23847
  * \endcode
22496
23848
  * Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
@@ -22498,6 +23850,15 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
22498
23850
  * problem, data can be loaded to shared memory with shuffling across shared memory banks.
22499
23851
  * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p swizzle must be ::CU_TENSOR_MAP_SWIZZLE_32B.
22500
23852
  * Other interleave modes can have any swizzling pattern.
23853
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported:
23854
+ * - CU_TENSOR_MAP_SWIZZLE_NONE (Load & Store)
23855
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
23856
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
23857
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B (Store only)
23858
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported:
23859
+ * - CU_TENSOR_MAP_SWIZZLE_NONE (Load only)
23860
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
23861
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
22501
23862
  *
22502
23863
  * - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
22503
23864
  * type ::CUtensorMapL2promotion, which is defined as:
@@ -22518,7 +23879,8 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
22518
23879
  CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
22519
23880
  } CUtensorMapFloatOOBfill;
22520
23881
  * \endcode
22521
- * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type.
23882
+ * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
23883
+ * and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, or ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.
22522
23884
  *
22523
23885
  * \param tensorMap - Tensor map object to create
22524
23886
  * \param tensorDataType - Tensor data type
@@ -22542,11 +23904,11 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
22542
23904
  *
22543
23905
  * \sa
22544
23906
  * ::cuTensorMapEncodeIm2col,
23907
+ * ::cuTensorMapEncodeIm2colWide,
22545
23908
  * ::cuTensorMapReplaceAddress
22546
23909
  */
22547
23910
  CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const cuuint32_t *boxDim, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
22548
23911
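A sketch encoding a 2D tiled descriptor for a row-major fp16 matrix (`base`, `rows`, and `cols` are placeholders; `base` is 16-byte aligned and the row pitch is a multiple of 16 bytes, per the rules above):

    CUtensorMap map;
    cuuint64_t globalDim[2]     = { cols, rows };
    cuuint64_t globalStrides[1] = { cols * sizeof(unsigned short) };
    cuuint32_t boxDim[2]        = { 64, 64 };  /* 64 * 2B = 128B <= swizzle span */
    cuuint32_t elemStrides[2]   = { 1, 1 };
    CUresult rc = cuTensorMapEncodeTiled(&map, CU_TENSOR_MAP_DATA_TYPE_FLOAT16,
            2, base, globalDim, globalStrides, boxDim, elemStrides,
            CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_128B,
            CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);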
 
22549
-
22550
23912
  /**
22551
23913
  * \brief Create a tensor map descriptor object representing im2col memory region
22552
23914
  *
@@ -22555,7 +23917,7 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
22555
23917
  *
22556
23918
  * Tensor map objects are only supported on devices of compute capability 9.0 or higher.
22557
23919
  * Additionally, a tensor map object is an opaque value, and, as such, should only be
22558
- * accessed through CUDA API calls.
23920
+ * accessed through CUDA APIs and PTX.
22559
23921
  *
22560
23922
  * The parameters passed are bound to the following requirements:
22561
23923
  *
@@ -22577,19 +23939,31 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
22577
23939
  CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ, // 4 bytes
22578
23940
  CU_TENSOR_MAP_DATA_TYPE_TFLOAT32, // 4 bytes
22579
23941
  CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ, // 4 bytes
23942
+ CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, // 4 bits
23943
+ CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, // 4 bits
23944
+ CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B // 6 bits
22580
23945
  } CUtensorMapDataType;
22581
23946
  * \endcode
23947
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned to 8 bytes. There are no gaps between packed values.
23948
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned to 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values.
23949
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned to 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values.
22582
23950
  *
22583
23951
  * - \p tensorRank, which specifies the number of tensor dimensions, must be 3, 4, or 5.
22584
23952
  *
22585
- * - \p globalAddress, which specifies the starting address of the memory region described, must be 32 byte aligned when \p interleave is
22586
- * ::CU_TENSOR_MAP_INTERLEAVE_32B and 16 byte aligned otherwise.
23953
+ * - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met:
23954
+ * - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned.
23955
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned.
22587
23956
  *
22588
23957
  * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
22589
- * equal to 2^32.
23958
+ * equal to 2^32. Additionally, the following requirements need to be met for the packed data types:
23959
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128.
23960
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2.
23961
+ * - The dimensions specified for the packed data types must reflect the number of individual U# values.
22590
23962
  *
22591
23963
  * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
22592
- * multiple of 16 and less than 2^40. Additionally, the stride must be a multiple of 32 when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B.
23964
+ * multiple of 16 and less than 2^40. Additionally, the following requirements need to be met:
23965
+ * - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32.
23966
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32.
22593
23967
  * Each following dimension specified includes previous dimension stride:
22594
23968
  * \code
22595
23969
  globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
@@ -22612,6 +23986,7 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
22612
23986
  * The bounding box specified by \p pixelBoxLowerCorner and \p pixelBoxUpperCorner must have non-zero area.
22613
23987
  *
22614
23988
  * - \p channelsPerPixel, which specifies the number of elements which must be accessed along C dimension, must be less than or equal to 256.
23989
+ * Additionally, when \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p channelsPerPixel must be 128.
22615
23990
  *
22616
23991
  * - \p pixelsPerColumn, which specifies the number of elements that must be accessed along the {N, D, H, W} dimensions, must be less than or
22617
23992
  * equal to 1024.
@@ -22634,18 +24009,22 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
22634
24009
  * TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
22635
24010
  * uses 32 bytes.
22636
24011
  * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE and \p swizzle is not ::CU_TENSOR_MAP_SWIZZLE_NONE, the bounding box inner dimension
22637
- * (computed as \p boxDim[0] multiplied by element size derived from \p tensorDataType) must be less than or equal to the swizzle size.
22638
- * - CU_TENSOR_MAP_SWIZZLE_32B implies the bounding box inner dimension will be <= 32.
22639
- * - CU_TENSOR_MAP_SWIZZLE_64B implies the bounding box inner dimension will be <= 64.
22640
- * - CU_TENSOR_MAP_SWIZZLE_128B implies the bounding box inner dimension will be <= 128.
24012
+ * (computed as \p channelsPerPixel multiplied by element size in bytes derived from \p tensorDataType) must be less than or equal to the swizzle size.
24013
+ * - CU_TENSOR_MAP_SWIZZLE_32B requires the bounding box inner dimension to be <= 32.
24014
+ * - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64.
24015
+ * - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128.
24016
+ * Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE.
22641
24017
  *
22642
24018
  * - \p swizzle, which specifies the shared memory bank swizzling pattern, has to be of type ::CUtensorMapSwizzle which is defined as:
22643
24019
  * \code
22644
24020
  typedef enum CUtensorMapSwizzle_enum {
22645
24021
  CU_TENSOR_MAP_SWIZZLE_NONE = 0,
22646
- CU_TENSOR_MAP_SWIZZLE_32B,
22647
- CU_TENSOR_MAP_SWIZZLE_64B,
22648
- CU_TENSOR_MAP_SWIZZLE_128B
24022
+ CU_TENSOR_MAP_SWIZZLE_32B, // Swizzle 16B chunks within 32B span
24023
+ CU_TENSOR_MAP_SWIZZLE_64B, // Swizzle 16B chunks within 64B span
24024
+ CU_TENSOR_MAP_SWIZZLE_128B, // Swizzle 16B chunks within 128B span
24025
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B, // Swizzle 32B chunks within 128B span
24026
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B, // Swizzle 32B chunks within 128B span, additionally swap lower 8B with upper 8B within each 16B for every alternate row
24027
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B // Swizzle 64B chunks within 128B span
22649
24028
  } CUtensorMapSwizzle;
22650
24029
  * \endcode
22651
24030
  * Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
@@ -22653,6 +24032,15 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
22653
24032
  * problem, data can be loaded to shared memory with shuffling across shared memory banks.
22654
24033
  * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p swizzle must be ::CU_TENSOR_MAP_SWIZZLE_32B.
22655
24034
  * Other interleave modes can have any swizzling pattern.
24035
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported:
24036
+ * - CU_TENSOR_MAP_SWIZZLE_NONE (Load & Store)
24037
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
24038
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
24039
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B (Store only)
24040
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported:
24041
+ * - CU_TENSOR_MAP_SWIZZLE_NONE (Load only)
24042
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
24043
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
22656
24044
  *
22657
24045
  * - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
22658
24046
  * type ::CUtensorMapL2promotion, which is defined as:
@@ -22673,7 +24061,8 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
22673
24061
  CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
22674
24062
  } CUtensorMapFloatOOBfill;
22675
24063
  * \endcode
22676
- * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type.
24064
+ * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
24065
+ * and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, or ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.
22677
24066
  *
22678
24067
  * \param tensorMap - Tensor map object to create
22679
24068
  * \param tensorDataType - Tensor data type
@@ -22700,12 +24089,197 @@ CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataT
22700
24089
  *
22701
24090
  * \sa
22702
24091
  * ::cuTensorMapEncodeTiled,
24092
+ * ::cuTensorMapEncodeIm2colWide,
22703
24093
  * ::cuTensorMapReplaceAddress
22704
24094
  */
22705
24095
  CUresult CUDAAPI cuTensorMapEncodeIm2col(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const int *pixelBoxLowerCorner, const int *pixelBoxUpperCorner, cuuint32_t channelsPerPixel, cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
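As a quick sanity check of the inner-dimension rule above, a minimal sketch (not part of the header; the helper name and element-size argument are illustrative) that validates the bounding box inner dimension against the chosen swizzle before encoding:

static int inner_dim_fits_swizzle(unsigned innerElems, unsigned elemSizeBytes,
                                  CUtensorMapSwizzle swizzle) {
    /* bounding box inner dimension = element count * element size in bytes */
    unsigned innerBytes = innerElems * elemSizeBytes;
    switch (swizzle) {
    case CU_TENSOR_MAP_SWIZZLE_NONE: return 1;                /* no swizzle-size bound */
    case CU_TENSOR_MAP_SWIZZLE_32B:  return innerBytes <= 32;
    case CU_TENSOR_MAP_SWIZZLE_64B:  return innerBytes <= 64;
    default:                         return innerBytes <= 128; /* all 128B variants */
    }
}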
22706
24096
 
22707
24097
  /**
22708
- * \brief Modify an existing tensor map descriptor with an updated global address
24098
+ * \brief Create a tensor map descriptor object representing an im2col memory region, but where
24099
+ * the elements are exclusively loaded along the W dimension.
24100
+ *
24101
+ * Creates a descriptor for a Tensor Memory Access (TMA) object specified by the parameters
24102
+ * describing an im2col memory layout where the row is always loaded along the W dimension,
24103
+ * and returns it in \p tensorMap. This assumes the tensor layout in memory is either NDHWC,
24104
+ * NHWC, or NWC.
24105
+ *
24106
+ * This API is only supported on devices of compute capability 10.0 or higher.
24107
+ * Additionally, a tensor map object is an opaque value, and, as such, should only be
24108
+ * accessed through CUDA APIs and PTX.
24109
+ *
24110
+ * The parameters passed are bound to the following requirements:
24111
+ *
24112
+ * - \p tensorMap address must be aligned to 64 bytes.
24113
+ *
24114
+ * - \p tensorDataType has to be an enum from ::CUtensorMapDataType which is defined as:
24115
+ * \code
24116
+ typedef enum CUtensorMapDataType_enum {
24117
+ CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0, // 1 byte
24118
+ CU_TENSOR_MAP_DATA_TYPE_UINT16, // 2 bytes
24119
+ CU_TENSOR_MAP_DATA_TYPE_UINT32, // 4 bytes
24120
+ CU_TENSOR_MAP_DATA_TYPE_INT32, // 4 bytes
24121
+ CU_TENSOR_MAP_DATA_TYPE_UINT64, // 8 bytes
24122
+ CU_TENSOR_MAP_DATA_TYPE_INT64, // 8 bytes
24123
+ CU_TENSOR_MAP_DATA_TYPE_FLOAT16, // 2 bytes
24124
+ CU_TENSOR_MAP_DATA_TYPE_FLOAT32, // 4 bytes
24125
+ CU_TENSOR_MAP_DATA_TYPE_FLOAT64, // 8 bytes
24126
+ CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, // 2 bytes
24127
+ CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ, // 4 bytes
24128
+ CU_TENSOR_MAP_DATA_TYPE_TFLOAT32, // 4 bytes
24129
+ CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ, // 4 bytes
24130
+ CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, // 4 bits
24131
+ CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, // 4 bits
24132
+ CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B // 6 bits
24133
+ } CUtensorMapDataType;
24134
+ * \endcode
24135
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned as 8 bytes. There are no gaps between packed values.
24136
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned as 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values.
24137
+ * ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned as 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values.
24138
+ *
24139
+ * - \p tensorRank, which specifies the number of tensor dimensions, must be 3, 4, or 5.
24140
+ *
24141
+ * - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met:
24142
+ * - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned.
24143
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned.
24144
+ *
24145
+ * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
24146
+ * equal to 2^32. Additionally, the following requirements need to be met for the packed data types:
24147
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128.
24148
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2.
24149
+ * - Dimension for the packed data types must reflect the number of individual U# values.
24150
+ *
24151
+ * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
24152
+ * multiple of 16 and less than 2^40. Additionally, the following requirements need to be met:
24153
+ * - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32.
24154
+ * - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32.
24155
+ * Each subsequent dimension's stride builds on the previous dimension's stride:
24156
+ * \code
24157
+ globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
24158
+ for (i = 1; i < tensorRank - 1; i++)
24159
+ globalStrides[i] = globalStrides[i - 1] * (globalDim[i] + padding[i]);
24160
+ assert(globalStrides[i] >= globalDim[i]);
24161
+ * \endcode
24162
+ *
24163
+ * - \p pixelBoxLowerCornerWidth specifies the coordinate offset W of the bounding box from left corner. The offset must be
24164
+ * within range [-32768, 32767].
24165
+ *
24166
+ * - \p pixelBoxUpperCornerWidth specifies the coordinate offset W of the bounding box from right corner. The offset must be
24167
+ * within range [-32768, 32767].
24168
+ *
24169
+ * The bounding box specified by \p pixelBoxLowerCornerWidth and \p pixelBoxUpperCornerWidth must have non-zero area. Note
24170
+ * that the size of the box along D and H dimensions is always equal to one.
24171
+ *
24172
+ * - \p channelsPerPixel, which specifies the number of elements which must be accessed along C dimension, must be less than or equal to 256.
24173
+ * Additionally, when \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p channelsPerPixel must be 128.
24174
+ *
24175
+ * - \p pixelsPerColumn, which specifies the number of elements that must be accessed along the W dimension, must be less than or
24176
+ * equal to 1024. This field is ignored when \p mode is ::CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128.
24177
+ *
24178
+ * - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less
24179
+ * than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since
24180
+ * TMA doesn’t support the stride for dimension zero.
24181
+ * When all elements of the \p elementStrides array are one, \p boxDim specifies the number of elements to load. However, if \p elementStrides[i]
24182
+ * is not equal to one for some \p i, then TMA loads ceil( \p boxDim[i] / \p elementStrides[i]) number of elements along i-th dimension.
24183
+ * To load N elements along i-th dimension, \p boxDim[i] must be set to N * \p elementStrides[i].
24184
+ *
24185
+ * - \p interleave specifies the interleaved layout of type ::CUtensorMapInterleave, which is defined as:
24186
+ * \code
24187
+ typedef enum CUtensorMapInterleave_enum {
24188
+ CU_TENSOR_MAP_INTERLEAVE_NONE = 0,
24189
+ CU_TENSOR_MAP_INTERLEAVE_16B,
24190
+ CU_TENSOR_MAP_INTERLEAVE_32B
24191
+ } CUtensorMapInterleave;
24192
+ * \endcode
24193
+ * TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
24194
+ * uses 32 bytes.
24195
+ * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the bounding box inner dimension (computed as \p channelsPerPixel multiplied by
24196
+ * element size in bytes derived from \p tensorDataType) must be less than or equal to the swizzle size.
24197
+ * - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64.
24198
+ * - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128.
24199
+ * Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE.
24200
+ *
24201
+ * - \p mode, which describes how elements are loaded along the W dimension, has to be one of the following ::CUtensorMapIm2ColWideMode types:
24202
+ * \code
24203
+ * CU_TENSOR_MAP_IM2COL_WIDE_MODE_W,
24204
+ * CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128
24205
+ * \endcode
24206
+ * ::CU_TENSOR_MAP_IM2COL_WIDE_MODE_W allows the number of elements loaded along the W dimension to be specified
24207
+ * via the \p pixelsPerColumn field.
24208
+ *
24209
+ * - \p swizzle, which specifies the shared memory bank swizzling pattern, must be one of the following
24210
+ * ::CUtensorMapSwizzle modes (other swizzle modes are not supported):
24211
+ * \code
24212
+ typedef enum CUtensorMapSwizzle_enum {
24213
+ CU_TENSOR_MAP_SWIZZLE_64B, // Swizzle 16B chunks within 64B span
24214
+ CU_TENSOR_MAP_SWIZZLE_128B, // Swizzle 16B chunks within 128B span
24215
+ CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B, // Swizzle 32B chunks within 128B span
24216
+ } CUtensorMapSwizzle;
24217
+ * \endcode
24218
+ * Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
24219
+ * in shared memory. This difference in data organization may cause bank conflicts when shared memory is accessed. In order to avoid this
24220
+ * problem, data can be loaded to shared memory with shuffling across shared memory banks.
24221
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported:
24222
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
24223
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
24224
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported:
24225
+ * - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
24226
+ * - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
24227
+ *
24228
+ * - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
24229
+ * type ::CUtensorMapL2promotion, which is defined as:
24230
+ * \code
24231
+ typedef enum CUtensorMapL2promotion_enum {
24232
+ CU_TENSOR_MAP_L2_PROMOTION_NONE = 0,
24233
+ CU_TENSOR_MAP_L2_PROMOTION_L2_64B,
24234
+ CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
24235
+ CU_TENSOR_MAP_L2_PROMOTION_L2_256B
24236
+ } CUtensorMapL2promotion;
24237
+ * \endcode
24238
+ *
24239
+ * - \p oobFill, which indicates whether zero or a special NaN constant should be used to fill out-of-bound elements, must be of type
24240
+ * ::CUtensorMapFloatOOBfill which is defined as:
24241
+ * \code
24242
+ typedef enum CUtensorMapFloatOOBfill_enum {
24243
+ CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0,
24244
+ CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
24245
+ } CUtensorMapFloatOOBfill;
24246
+ * \endcode
24247
+ * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
24248
+ * and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, or ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.
24249
+ *
24250
+ * \param tensorMap - Tensor map object to create
24251
+ * \param tensorDataType - Tensor data type
24252
+ * \param tensorRank - Dimensionality of tensor; must be at least 3
24253
+ * \param globalAddress - Starting address of memory region described by tensor
24254
+ * \param globalDim - Array containing tensor size (number of elements) along each of the \p tensorRank dimensions
24255
+ * \param globalStrides - Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions
24256
+ * \param pixelBoxLowerCornerWidth - Width offset of left box corner
24257
+ * \param pixelBoxUpperCornerWidth - Width offset of right box corner
24258
+ * \param channelsPerPixel - Number of channels per pixel
24259
+ * \param pixelsPerColumn - Number of pixels per column
24260
+ * \param elementStrides - Array containing traversal stride in each of the \p tensorRank dimensions
24261
+ * \param interleave - Type of interleaved layout the tensor addresses
24262
+ * \param mode - W or W128 mode
24263
+ * \param swizzle - Bank swizzling pattern inside shared memory
24264
+ * \param l2Promotion - L2 promotion size
24265
+ * \param oobFill - Indicate whether zero or special NaN constant will be used to fill out-of-bound elements
24266
+ *
24267
+ * \return
24268
+ * ::CUDA_SUCCESS,
24269
+ * ::CUDA_ERROR_DEINITIALIZED,
24270
+ * ::CUDA_ERROR_NOT_INITIALIZED,
24271
+ * ::CUDA_ERROR_INVALID_CONTEXT,
24272
+ * ::CUDA_ERROR_INVALID_VALUE
24273
+ *
24274
+ * \sa
24275
+ * ::cuTensorMapEncodeTiled,
24276
+ * ::cuTensorMapEncodeIm2col,
24277
+ * ::cuTensorMapReplaceAddress
24278
+ */
24279
+ CUresult CUDAAPI cuTensorMapEncodeIm2colWide(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, int pixelBoxLowerCornerWidth, int pixelBoxUpperCornerWidth, cuuint32_t channelsPerPixel, cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapIm2ColWideMode mode, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
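A hedged sketch of one parameter combination that satisfies the constraints above: a rank-3 NWC float16 tensor in W mode, where devPtr and all extents are illustrative assumptions rather than values taken from this header. Note the inner dimension (64 channels x 2 bytes = 128) stays within the 128B swizzle:

CUtensorMap map;
cuuint64_t dim[3]        = {64, 1024, 16};       /* C, W, N extents */
cuuint64_t strides[2]    = {128, 128 * 1024};    /* bytes; multiples of 16 */
cuuint32_t elemStride[3] = {1, 1, 1};
CUresult rc = cuTensorMapEncodeIm2colWide(
    &map, CU_TENSOR_MAP_DATA_TYPE_FLOAT16, 3, devPtr, dim, strides,
    /*pixelBoxLowerCornerWidth=*/0, /*pixelBoxUpperCornerWidth=*/-2,
    /*channelsPerPixel=*/64, /*pixelsPerColumn=*/128, elemStride,
    CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_IM2COL_WIDE_MODE_W,
    CU_TENSOR_MAP_SWIZZLE_128B, CU_TENSOR_MAP_L2_PROMOTION_NONE,
    CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);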
24280
+
24281
+ /**
24282
+ * \brief Modify an existing tensor map descriptor with an updated global address
22709
24283
  *
22710
24284
  * Modifies the descriptor for Tensor Memory Access (TMA) object passed in \p tensorMap with
22711
24285
  * an updated \p globalAddress.
@@ -22727,6 +24301,7 @@ CUresult CUDAAPI cuTensorMapEncodeIm2col(CUtensorMap *tensorMap, CUtensorMapData
22727
24301
  * \sa
22728
24302
  * ::cuTensorMapEncodeTiled,
22729
24303
  * ::cuTensorMapEncodeIm2col
24304
+ * ::cuTensorMapEncodeIm2colWide
22730
24305
  */
22731
24306
  CUresult CUDAAPI cuTensorMapReplaceAddress(CUtensorMap *tensorMap, void *globalAddress);
22732
24307
 
@@ -23261,9 +24836,29 @@ typedef enum CUcoredumpSettings_enum {
23261
24836
  CU_COREDUMP_ENABLE_USER_TRIGGER,
23262
24837
  CU_COREDUMP_FILE,
23263
24838
  CU_COREDUMP_PIPE,
24839
+ CU_COREDUMP_GENERATION_FLAGS,
23264
24840
  CU_COREDUMP_MAX
23265
24841
  } CUcoredumpSettings;
23266
24842
 
24843
+ /**
24844
+ * Flags for controlling coredump contents
24845
+ */
24846
+ typedef enum CUCoredumpGenerationFlags {
24847
+ CU_COREDUMP_DEFAULT_FLAGS = 0,
24848
+ CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES = (1 << 0),
24849
+ CU_COREDUMP_SKIP_GLOBAL_MEMORY = (1 << 1),
24850
+ CU_COREDUMP_SKIP_SHARED_MEMORY = (1 << 2),
24851
+ CU_COREDUMP_SKIP_LOCAL_MEMORY = (1 << 3),
24852
+ CU_COREDUMP_SKIP_ABORT = (1 << 4),
24853
+ CU_COREDUMP_SKIP_CONSTBANK_MEMORY = (1 << 5),
24854
+
24855
+ CU_COREDUMP_LIGHTWEIGHT_FLAGS = CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES
24856
+ | CU_COREDUMP_SKIP_GLOBAL_MEMORY
24857
+ | CU_COREDUMP_SKIP_SHARED_MEMORY
24858
+ | CU_COREDUMP_SKIP_LOCAL_MEMORY
24859
+ | CU_COREDUMP_SKIP_CONSTBANK_MEMORY
24860
+ } CUCoredumpGenerationFlags;
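As a sketch of how these flags plug into the existing attribute setters (the combination of values here is illustrative; error checking elided):

int enable = 1;
size_t sz = sizeof(enable);
cuCoredumpSetAttribute(CU_COREDUMP_ENABLE_ON_EXCEPTION, &enable, &sz);

/* skip device-global memory in the dump and keep the host process alive */
int genFlags = CU_COREDUMP_SKIP_GLOBAL_MEMORY | CU_COREDUMP_SKIP_ABORT;
sz = sizeof(genFlags);
cuCoredumpSetAttribute(CU_COREDUMP_GENERATION_FLAGS, &genFlags, &sz);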
24861
+
23267
24862
  /**
23268
24863
  * \brief Allows caller to fetch a coredump attribute value for the current context
23269
24864
  *
@@ -23280,10 +24875,12 @@ typedef enum CUcoredumpSettings_enum {
23280
24875
  * CU_CTX_USER_COREDUMP_ENABLE flag was set during context creation.
23281
24876
  * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
23282
24877
  * also create a coredump. The default value is ::true unless set to ::false globally or
23283
- * or locally.
24878
+ * locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
24879
+ * flag to disable the host-side abort() if needed.
23284
24880
  * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
23285
24881
  * will not have a dump of GPU memory or non-reloc ELF images. The default value is
23286
- * ::false unless set to ::true globally or locally.
24882
+ * ::false unless set to ::true globally or locally. This attribute is deprecated as
24883
+ * of CUDA 12.5; please use ::CU_COREDUMP_GENERATION_FLAGS instead.
23287
24884
  * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
23288
24885
  * created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
23289
24886
  * value is ::false unless set to ::true globally or locally.
@@ -23295,6 +24892,22 @@ typedef enum CUcoredumpSettings_enum {
23295
24892
  * that will be monitored if user-triggered coredumps are enabled. The default value is
23296
24893
  * ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
23297
24894
  * the CUDA application and ::PID is the process ID of the CUDA application.
24895
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
24896
+ * contained in a coredump, specified as a bitwise OR combination of the following values:
24897
+ * + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
24898
+ * default settings of including all memory regions that it is able to access
24899
+ * + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
24900
+ * CUDA source modules that are not relocated at runtime.
24901
+ * + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
24902
+ * that does not belong to any context.
24903
+ * + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
24904
+ * for the warp that the dumped kernel belonged to.
24905
+ * + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
24906
+ * + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
24907
+ * the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
24908
+ * + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
24909
+ * process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
24910
+ * behavior.
23298
24911
  *
23299
24912
  * \param attrib - The enum defining which value to fetch.
23300
24913
  * \param value - void* containing the requested data.
@@ -23330,10 +24943,13 @@ CUresult CUDAAPI cuCoredumpGetAttribute(CUcoredumpSettings attrib, void* value,
23330
24943
  * this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
23331
24944
  * The default value is ::false.
23332
24945
  * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
23333
- * also create a coredump. The default value is ::true.
24946
+ * also create a coredump. The default value is ::true unless set to ::false globally
24947
+ * or locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
24948
+ * flag to disable the host-side abort() if needed.
23334
24949
  * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
23335
24950
  * will not have a dump of GPU memory or non-reloc ELF images. The default value is
23336
- * ::false.
24951
+ * ::false. This attribute is deprecated as of CUDA 12.5; please use ::CU_COREDUMP_GENERATION_FLAGS
24952
+ * instead.
23337
24953
  * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
23338
24954
  * created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
23339
24955
  * value is ::false.
@@ -23345,6 +24961,22 @@ CUresult CUDAAPI cuCoredumpGetAttribute(CUcoredumpSettings attrib, void* value,
23345
24961
  * that will be monitored if user-triggered coredumps are enabled. The default value is
23346
24962
  * ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
23347
24963
  * the CUDA application and ::PID is the process ID of the CUDA application.
24964
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
24965
+ * contained in a coredump, specified as a bitwise OR combination of the following values:
24966
+ * + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
24967
+ * default settings of including all memory regions that it is able to access
24968
+ * + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
24969
+ * CUDA source modules that are not relocated at runtime.
24970
+ * + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
24971
+ * that does not belong to any context.
24972
+ * + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
24973
+ * for the warp that the dumped kernel belonged to.
24974
+ * + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
24975
+ * + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
24976
+ * the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
24977
+ * + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
24978
+ * process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
24979
+ * behavior.
23348
24980
  *
23349
24981
  * \param attrib - The enum defining which value to fetch.
23350
24982
  * \param value - void* containing the requested data.
@@ -23369,7 +25001,7 @@ CUresult CUDAAPI cuCoredumpGetAttributeGlobal(CUcoredumpSettings attrib, void *v
23369
25001
  *
23370
25002
  * An important design decision to note is that any coredump environment variable values
23371
25003
  * set before CUDA initializes will take permanent precedence over any values set with this
23372
- * this function. This decision was made to ensure no change in behavior for any users that
25004
+ * function. This decision was made to ensure no change in behavior for any users that
23373
25005
  * may be currently using these variables to get coredumps.
23374
25006
  *
23375
25007
  * \p *value shall contain the requested value specified by \p set. It is up to the caller
@@ -23389,14 +25021,33 @@ CUresult CUDAAPI cuCoredumpGetAttributeGlobal(CUcoredumpSettings attrib, void *v
23389
25021
  * this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
23390
25022
  * The default value is ::false.
23391
25023
  * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
23392
- * also create a coredump. The default value is ::true.
25024
+ * also create a coredump. The default value is ::true unless set to ::false globally
25025
+ * or locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
25026
+ * flag to disable the host-side abort() if needed.
23393
25027
  * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
23394
25028
  * will not have a dump of GPU memory or non-reloc ELF images. The default value is
23395
- * ::false.
25029
+ * ::false. This attribute is deprecated as of CUDA 12.5; please use ::CU_COREDUMP_GENERATION_FLAGS
25030
+ * instead.
23396
25031
  * - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where
23397
25032
  * any coredumps generated by this context will be written. The default value is
23398
25033
  * ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
23399
25034
  * the CUDA applications and ::PID is the process ID of the CUDA application.
25035
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
25036
+ * contained in a coredump, specified as a bitwise OR combination of the following values:
25037
+ * + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
25038
+ * default settings of including all memory regions that it is able to access
25039
+ * + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
25040
+ * CUDA source modules that are not relocated at runtime.
25041
+ * + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
25042
+ * that does not belong to any context.
25043
+ * + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
25044
+ * for the warp that the dumped kernel belonged to.
25045
+ * + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
25046
+ * + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
25047
+ * the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
25048
+ * + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
25049
+ * process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
25050
+ * behavior.
23400
25051
  *
23401
25052
  * \param attrib - The enum defining which value to set.
23402
25053
  * \param value - void* containing the requested data.
@@ -23427,7 +25078,7 @@ CUresult CUDAAPI cuCoredumpSetAttribute(CUcoredumpSettings attrib, void* value,
23427
25078
  *
23428
25079
  * An important design decision to note is that any coredump environment variable values
23429
25080
  * set before CUDA initializes will take permanent precedence over any values set with this
23430
- * this function. This decision was made to ensure no change in behavior for any users that
25081
+ * function. This decision was made to ensure no change in behavior for any users that
23431
25082
  * may be currently using these variables to get coredumps.
23432
25083
  *
23433
25084
  * \p *value shall contain the requested value specified by \p set. It is up to the caller
@@ -23441,10 +25092,13 @@ CUresult CUDAAPI cuCoredumpSetAttribute(CUcoredumpSettings attrib, void* value,
23441
25092
  * this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
23442
25093
  * The default value is ::false.
23443
25094
  * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
23444
- * also create a coredump. The default value is ::true.
25095
+ * also create a coredump. The default value is ::true unless set to ::false globally
25096
+ * or locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
25097
+ * flag to disable the host-side abort() if needed.
23445
25098
  * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
23446
25099
  * will not have a dump of GPU memory or non-reloc ELF images. The default value is
23447
- * ::false.
25100
+ * ::false. This attribute is deprecated as of CUDA 12.5; please use ::CU_COREDUMP_GENERATION_FLAGS
25101
+ * instead.
23448
25102
  * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
23449
25103
  * created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
23450
25104
  * value is ::false.
@@ -23457,6 +25111,22 @@ CUresult CUDAAPI cuCoredumpSetAttribute(CUcoredumpSettings attrib, void* value,
23457
25111
  * changed after ::CU_COREDUMP_ENABLE_USER_TRIGGER is set to ::true. The default
23458
25112
  * value is ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine
23459
25113
  * running the CUDA application and ::PID is the process ID of the CUDA application.
25114
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
25115
+ * contained in a coredump, specified as a bitwise OR combination of the following values:
25116
+ * + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
25117
+ * default settings of including all memory regions that it is able to access
25118
+ * + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
25119
+ * CUDA source modules that are not relocated at runtime.
25120
+ * + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
25121
+ * that does not belong to any context.
25122
+ * + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
25123
+ * for the warp that the dumped kernel belonged to.
25124
+ * + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
25125
+ * + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
25126
+ * the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
25127
+ * + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
25128
+ * process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
25129
+ * behavior.
23460
25130
  *
23461
25131
  * \param attrib - The enum defining which value to set.
23462
25132
  * \param value - void* containing the requested data.
@@ -23523,13 +25193,6 @@ CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExp
23523
25193
  * @{
23524
25194
  */
23525
25195
 
23526
- /*!
23527
- * \typedef typedef struct CUgreenCtx_st* CUgreenCtx
23528
- * A green context handle. This handle can be used safely from only one CPU thread at a time.
23529
- * Created via ::cuGreenCtxCreate
23530
- */
23531
- typedef struct CUgreenCtx_st *CUgreenCtx;
23532
-
23533
25196
  /*!
23534
25197
  * \typedef struct CUdevResourceDesc_st* CUdevResourceDesc;
23535
25198
  * An opaque descriptor handle. The descriptor encapsulates multiple created and configured resources.
@@ -23541,6 +25204,11 @@ typedef enum {
23541
25204
  CU_GREEN_CTX_DEFAULT_STREAM = 0x1, /**< Required. Creates a default stream to use inside the green context */
23542
25205
  } CUgreenCtxCreate_flags;
23543
25206
 
25207
+ typedef enum {
25208
+ CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING = 0x1,
25209
+ CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE = 0x2,
25210
+ } CUdevSmResourceSplit_flags;
25211
+
23544
25212
  #define RESOURCE_ABI_VERSION 1
23545
25213
  #define RESOURCE_ABI_EXTERNAL_BYTES 48
23546
25214
 
@@ -23554,7 +25222,7 @@ typedef enum {
23554
25222
  typedef enum {
23555
25223
  CU_DEV_RESOURCE_TYPE_INVALID = 0,
23556
25224
  CU_DEV_RESOURCE_TYPE_SM = 1, /**< Streaming multiprocessors related information */
23557
- #ifdef __CUDA_API_VERSION_INTERNAL
25225
+ #if defined(__CUDA_API_VERSION_INTERNAL) && !defined(__CUDA_API_VERSION_INTERNAL_ODR)
23558
25226
  CU_DEV_RESOURCE_TYPE_MAX,
23559
25227
  #endif
23560
25228
  } CUdevResourceType;
@@ -23777,18 +25445,24 @@ CUresult CUDAAPI cuGreenCtxGetDevResource(CUgreenCtx hCtx, CUdevResource* resour
23777
25445
  * first creating a descriptor and a green context with that descriptor.
23778
25446
  *
23779
25447
  * When creating the groups, the API will take into account the performance and functional characteristics of the
23780
- * input resource, and guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to less groups created
25448
+ * input resource, and guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to fewer groups created
23781
25449
  * than purely dividing the total SM count by the \p minCount due to cluster requirements or
23782
25450
  * alignment and granularity requirements for the minCount.
23783
25451
  *
23784
- * The \p remainder set, might not have the same functional or performance guarantees as the groups in \p result.
25452
+ * The \p remainder set does not have the same functional or performance guarantees as the groups in \p result.
23785
25453
  * Its use should be carefully planned and future partitions of the \p remainder set are discouraged.
23786
25454
  *
25455
+ * The following flags are supported:
25456
+ * - \p CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING : Lower the minimum SM count and alignment, and treat each SM independently of its hierarchy.
25457
+ * This allows more fine-grained partitions, but at the cost of advanced features (such as large clusters on compute capability 9.0+).
25458
+ * - \p CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE : Compute Capability 9.0+ only. Attempt to create groups that may allow
25459
+ * for maximally sized thread clusters. This can be queried post green context creation using ::cuOccupancyMaxPotentialClusterSize.
25460
+ *
23787
25461
  * A successful API call must either have:
23788
- * - A valid array of \p result pointers of size passed in \p nbGroups, with \p Input of type \p CU_DEV_RESOURCE_TYPE_SM.
23789
- * Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining and \p useFlags are optional.
23790
- * - NULL passed in for \p result, with a valid integer pointer in \p nbGroups and \p Input of type \p CU_DEV_RESOURCE_TYPE_SM.
23791
- * Value of \p minCount must be between 0 and the SM count specified in \p input.
25462
+ * - A valid array of \p result pointers of size passed in \p nbGroups, with \p input of type \p CU_DEV_RESOURCE_TYPE_SM.
25463
+ * Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining may be NULL.
25464
+ * - NULL passed in for \p result, with a valid integer pointer in \p nbGroups and \p input of type \p CU_DEV_RESOURCE_TYPE_SM.
25465
+ * Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining may be NULL.
23792
25466
  * This queries the number of groups that would be created by the API.
23793
25467
  *
23794
25468
  * Note: The API is not supported on 32-bit platforms.
@@ -23798,7 +25472,7 @@ CUresult CUDAAPI cuGreenCtxGetDevResource(CUgreenCtx hCtx, CUdevResource* resour
23798
25472
  * \param input - Input SM resource to be split. Must be a valid \p CU_DEV_RESOURCE_TYPE_SM resource.
23799
25473
  * \param remaining - If the input resource cannot be cleanly split among \p nbGroups, the remainder is placed here.
23800
25474
  * Can be omitted (NULL) if the user does not need the remaining set.
23801
- * \param useFlags - Flags specifying how these partitions are used or which constraints to abide by when splitting the input.
25475
+ * \param useFlags - Flags specifying how these partitions are used or which constraints to abide by when splitting the input. Zero is valid for default behavior.
23802
25476
  * \param minCount - Minimum number of SMs required
23803
25477
  *
23804
25478
  * \return
@@ -23821,10 +25495,18 @@ CUresult CUDAAPI cuDevSmResourceSplitByCount(
23821
25495
  /**
23822
25496
  * \brief Generate a resource descriptor
23823
25497
  *
23824
- * Generates a resource descriptor with the set of resources specified in \p resources.
25498
+ * Generates a single resource descriptor with the set of resources specified in \p resources.
23825
25499
  * The generated resource descriptor is necessary for the creation of green contexts via the ::cuGreenCtxCreate API.
23826
- * The API expects \p nbResources == 1, as there is only one type of resource and merging the same
23827
- * types of resource is currently not supported.
25500
+ * Resources of the same type can be passed in, provided they meet the requirements as noted below.
25501
+ *
25502
+ * A successful API call must have:
25503
+ * - A valid output pointer for the \p phDesc descriptor as well as a valid array of \p resources pointers,
25504
+ * with the array size passed in \p nbResources.
25505
+ * If multiple resources are provided in \p resources, the device they came from must be the same,
25506
+ * otherwise CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned.
25507
+ * If multiple resources are provided in \p resources and they are of type ::CU_DEV_RESOURCE_TYPE_SM,
25508
+ * they must be outputs (whether \p result or \p remaining) from the same split API instance,
25509
+ * otherwise CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned.
23828
25510
  *
23829
25511
  * Note: The API is not supported on 32-bit platforms.
23830
25512
  *
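Putting the split and descriptor APIs together, a hedged sketch of the documented flow (assumes <cuda.h>, <stdlib.h>, an initialized driver, and a valid CUdevice dev; error checks elided):

CUdevResource input, remaining;
cuDeviceGetDevResource(dev, &input, CU_DEV_RESOURCE_TYPE_SM);

unsigned int nbGroups = 0;                /* NULL result: query group count only */
cuDevSmResourceSplitByCount(NULL, &nbGroups, &input, NULL, 0, /*minCount=*/8);

CUdevResource *groups = (CUdevResource *)malloc(nbGroups * sizeof(*groups));
cuDevSmResourceSplitByCount(groups, &nbGroups, &input, &remaining, 0, 8);

CUdevResourceDesc desc;                   /* one group -> one descriptor */
cuDevResourceGenerateDesc(&desc, &groups[0], 1);

CUgreenCtx gctx;
cuGreenCtxCreate(&gctx, desc, dev, CU_GREEN_CTX_DEFAULT_STREAM);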
@@ -23848,15 +25530,16 @@ CUresult CUDAAPI cuDevResourceGenerateDesc(CUdevResourceDesc *phDesc, CUdevResou
23848
25530
  /**
23849
25531
  * \brief Records an event.
23850
25532
  *
23851
- * Captures in \phEvent all the activities of the green context of \phCtx
23852
- * at the time of this call. \phEvent and \phCtx must be from the same
23853
- * CUDA context. Calls such as ::cuEventQuery() or ::cuGreenCtxWaitEvent() will
25533
+ * Captures in \p hEvent all the activities of the green context of \p hCtx
25534
+ * at the time of this call. \p hEvent and \p hCtx must be from the same
25535
+ * primary context otherwise ::CUDA_ERROR_INVALID_HANDLE is returned.
25536
+ * Calls such as ::cuEventQuery() or ::cuGreenCtxWaitEvent() will
23854
25537
  * then examine or wait for completion of the work that was captured. Uses of
23855
25538
  * \p hCtx after this call do not modify \p hEvent.
23856
25539
  *
23857
- * \note The API will return an error if the specified green context \p hCtx
23858
- * has a stream in the capture mode. In such a case, the call will invalidate
23859
- * all the conflicting captures.
25540
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED if the
25541
+ * specified green context \p hCtx has a stream in the capture mode. In such
25542
+ * a case, the call will invalidate all the conflicting captures.
23860
25543
  *
23861
25544
  * \param hCtx - Green context to record event for
23862
25545
  * \param hEvent - Event to record
@@ -23866,39 +25549,49 @@ CUresult CUDAAPI cuDevResourceGenerateDesc(CUdevResourceDesc *phDesc, CUdevResou
23866
25549
  * ::CUDA_ERROR_DEINITIALIZED,
23867
25550
  * ::CUDA_ERROR_NOT_INITIALIZED,
23868
25551
  * ::CUDA_ERROR_INVALID_CONTEXT,
23869
- * ::CUDA_ERROR_INVALID_HANDLE
25552
+ * ::CUDA_ERROR_INVALID_HANDLE,
25553
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
23870
25554
  *
23871
25555
  * \sa
23872
25556
  * ::cuGreenCtxWaitEvent,
23873
- * ::cuEventRecord
25557
+ * ::cuEventRecord,
25558
+ * ::cuCtxRecordEvent,
25559
+ * ::cuCtxWaitEvent
23874
25560
  */
23875
25561
  CUresult CUDAAPI cuGreenCtxRecordEvent(CUgreenCtx hCtx, CUevent hEvent);
23876
25562
 
23877
25563
  /**
23878
25564
  * \brief Make a green context wait on an event
23879
25565
  *
23880
- * Makes all future work submitted to green context \phCtx wait for all work
23881
- * captured in \phEvent. The synchronization will be performed on the device
25566
+ * Makes all future work submitted to green context \p hCtx wait for all work
25567
+ * captured in \p hEvent. The synchronization will be performed on the device
23882
25568
  * and will not block the calling CPU thread. See ::cuGreenCtxRecordEvent()
23883
- * for details on what is captured by an event.
25569
+ * or ::cuEventRecord(), for details on what is captured by an event.
25570
+ *
25571
+ * \note \p hEvent may be from a different context or device than \p hCtx.
25572
+ *
25573
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED and
25574
+ * invalidate the capture if the specified event \p hEvent is part of an
25575
+ * ongoing capture sequence or if the specified green context \p hCtx has
25576
+ * a stream in the capture mode.
23884
25577
  *
23885
- * \note The API will return an error and invalidate the capture if the specified
23886
- * event \p hEvent is part of an ongoing capture sequence.
23887
- *
23888
25578
  * \param hCtx - Green context to wait
23889
- * \param hEvent - Event to wait on (may not be NULL)
25579
+ * \param hEvent - Event to wait on
23890
25580
  *
23891
25581
  * \return
23892
25582
  * ::CUDA_SUCCESS,
23893
25583
  * ::CUDA_ERROR_DEINITIALIZED,
23894
25584
  * ::CUDA_ERROR_NOT_INITIALIZED,
23895
25585
  * ::CUDA_ERROR_INVALID_CONTEXT,
23896
- * ::CUDA_ERROR_INVALID_HANDLE
25586
+ * ::CUDA_ERROR_INVALID_HANDLE,
25587
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
23897
25588
  *
23898
25589
  * \sa
23899
25590
  * ::cuGreenCtxRecordEvent,
23900
25591
  * ::cuStreamWaitEvent,
23901
- */
25592
+ * ::cuCtxRecordEvent,
25593
+ * ::cuCtxWaitEvent
25594
+ */
23902
25595
  CUresult CUDAAPI cuGreenCtxWaitEvent(CUgreenCtx hCtx, CUevent hEvent);
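A brief sketch of the record/wait pairing described above, where gctxA and gctxB are assumed, previously created green contexts:

CUevent ev;
cuEventCreate(&ev, CU_EVENT_DISABLE_TIMING);
cuGreenCtxRecordEvent(gctxA, ev);   /* capture gctxA's activities so far */
cuGreenCtxWaitEvent(gctxB, ev);     /* future work in gctxB waits on them */
cuEventDestroy(ev);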
23903
25596
 
23904
25597
  /**
@@ -23910,7 +25603,9 @@ CUresult CUDAAPI cuGreenCtxWaitEvent(CUgreenCtx hCtx, CUevent hEvent);
23910
25603
  * The stream handle \p hStream can refer to any of the following:
23911
25604
  * <ul>
23912
25605
  * <li>
23913
- * a stream created via any of the CUDA driver APIs such as ::cuStreamCreate.
25606
+ * a stream created via any of the CUDA driver APIs such as ::cuStreamCreate, ::cuStreamCreateWithPriority
25607
+ * and ::cuGreenCtxStreamCreate, or their runtime API equivalents such as
25608
+ * ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
23914
25609
  * If during stream creation the context that was active in the calling thread was obtained
23915
25610
  * with cuCtxFromGreenCtx, that green context is returned in \p phCtx.
23916
25611
  * Otherwise, \p *phCtx is set to NULL instead.
@@ -23936,9 +25631,13 @@ CUresult CUDAAPI cuGreenCtxWaitEvent(CUgreenCtx hCtx, CUevent hEvent);
23936
25631
  * \notefnerr
23937
25632
  *
23938
25633
  * \sa ::cuStreamDestroy,
25634
+ * ::cuStreamCreate,
23939
25635
  * ::cuStreamCreateWithPriority,
25636
+ * ::cuStreamGetCtx_v2,
25637
+ * ::cuGreenCtxStreamCreate,
23940
25638
  * ::cuStreamGetPriority,
23941
25639
  * ::cuStreamGetFlags,
25640
+ * ::cuStreamGetDevice
23942
25641
  * ::cuStreamWaitEvent,
23943
25642
  * ::cuStreamQuery,
23944
25643
  * ::cuStreamSynchronize,
@@ -23948,6 +25647,62 @@ CUresult CUDAAPI cuGreenCtxWaitEvent(CUgreenCtx hCtx, CUevent hEvent);
23948
25647
  */
23949
25648
  CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
23950
25649
 
25650
+ /**
25651
+ * \brief Create a stream for use in the green context
25652
+ *
25653
+ * Creates a stream for use in the specified green context \p greenCtx and returns a handle in \p phStream.
25654
+ * The stream can be destroyed by calling ::cuStreamDestroy(). Note that the API ignores the context that
25655
+ * is current to the calling thread and creates a stream in the specified green context \p greenCtx.
25656
+ *
25657
+ * The supported values for \p flags are:
25658
+ * - ::CU_STREAM_NON_BLOCKING: This must be specified. It indicates that work running in the created
25659
+ * stream may run concurrently with work in the default stream, and that
25660
+ * the created stream should perform no implicit synchronization with the default stream.
25661
+ *
25662
+ * Specifying \p priority affects the scheduling priority of work in the stream. Priorities provide a
25663
+ * hint to preferentially run work with higher priority when possible, but do not preempt
25664
+ * already-running work or provide any other functional guarantee on execution order.
25665
+ * \p priority follows a convention where lower numbers represent higher priorities.
25666
+ * '0' represents default priority. The range of meaningful numerical priorities can
25667
+ * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is
25668
+ * outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
25669
+ * it will automatically be clamped to the lowest or the highest number in the range.
25670
+ *
25671
+ * \param phStream - Returned newly created stream
25672
+ * \param greenCtx - Green context in which to create the stream
25673
+ * \param flags - Flags for stream creation. \p CU_STREAM_NON_BLOCKING must be specified.
25674
+ * \param priority - Stream priority. Lower numbers represent higher priorities.
25675
+ * See ::cuCtxGetStreamPriorityRange for more information about
25676
+ * meaningful stream priorities that can be passed.
25677
+ *
25678
+ * \return
25679
+ * ::CUDA_SUCCESS,
25680
+ * ::CUDA_ERROR_DEINITIALIZED,
25681
+ * ::CUDA_ERROR_NOT_INITIALIZED,
25682
+ * ::CUDA_ERROR_INVALID_CONTEXT,
25683
+ * ::CUDA_ERROR_INVALID_VALUE,
25684
+ * ::CUDA_ERROR_OUT_OF_MEMORY
25685
+ * \notefnerr
25686
+ *
25687
+ * \note In the current implementation, only compute kernels launched in
25688
+ * priority streams are affected by the stream's priority. Stream priorities have
25689
+ * no effect on host-to-device and device-to-host memory operations.
25690
+ *
25691
+ * \sa ::cuStreamDestroy,
25692
+ * ::cuGreenCtxCreate
25693
+ * ::cuStreamCreate,
25694
+ * ::cuStreamGetPriority,
25695
+ * ::cuCtxGetStreamPriorityRange,
25696
+ * ::cuStreamGetFlags,
25697
+ * ::cuStreamGetDevice
25698
+ * ::cuStreamWaitEvent,
25699
+ * ::cuStreamQuery,
25700
+ * ::cuStreamSynchronize,
25701
+ * ::cuStreamAddCallback,
25702
+ * ::cudaStreamCreateWithPriority
25703
+ */
25704
+ CUresult CUDAAPI cuGreenCtxStreamCreate(CUstream* phStream, CUgreenCtx greenCtx, unsigned int flags, int priority);
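A minimal sketch of creating and querying such a stream (gctx is an assumed, previously created green context):

CUstream s;
cuGreenCtxStreamCreate(&s, gctx, CU_STREAM_NON_BLOCKING, /*priority=*/0);
CUgreenCtx owner;
cuStreamGetGreenCtx(s, &owner);     /* reports the green context bound to s */
cuStreamDestroy(s);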
25705
+
23951
25706
  /** @} */
23952
25707
 
23953
25708
  /*
@@ -23991,6 +25746,8 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
23991
25746
  #undef cuMemcpyDtoDAsync
23992
25747
  #undef cuMemcpy2DAsync
23993
25748
  #undef cuMemcpy3DAsync
25749
+ #undef cuMemcpyBatchAsync
25750
+ #undef cuMemcpy3DBatchAsync
23994
25751
  #undef cuMemsetD8
23995
25752
  #undef cuMemsetD16
23996
25753
  #undef cuMemsetD32
@@ -24025,6 +25782,7 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
24025
25782
  #undef cuStreamGetPriority
24026
25783
  #undef cuStreamGetId
24027
25784
  #undef cuStreamGetFlags
25785
+ #undef cuStreamGetDevice
24028
25786
  #undef cuStreamGetCtx
24029
25787
  #undef cuStreamWaitEvent
24030
25788
  #undef cuStreamAddCallback
@@ -24083,6 +25841,8 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
24083
25841
  #undef cuStreamUpdateCaptureDependencies
24084
25842
  #undef cuStreamUpdateCaptureDependencies_v2
24085
25843
  #undef cuGetProcAddress
25844
+ #undef cuStreamGetCtx_v2
25845
+ #undef cuMemBatchDecompressAsync
24086
25846
 
24087
25847
  CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
24088
25848
  CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
@@ -24250,7 +26010,11 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
24250
26010
  CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
24251
26011
  CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);
24252
26012
  CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
24253
-
26013
+ CUresult CUDAAPI cuMemcpyBatchAsync(CUdeviceptr *dsts, CUdeviceptr *srcs, size_t *sizes, size_t count,
26014
+ CUmemcpyAttributes *attrs, size_t *attrsIdxs, size_t numAttrs,
26015
+ size_t *failIdx, CUstream hStream);
26016
+ CUresult CUDAAPI cuMemcpy3DBatchAsync(size_t numOps, CUDA_MEMCPY3D_BATCH_OP *opList,
26017
+ size_t *failIdx, unsigned long long flags, CUstream hStream);
24254
26018
  CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
24255
26019
  CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream);
24256
26020
  CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
@@ -24261,7 +26025,9 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
24261
26025
  CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
24262
26026
  CUresult CUDAAPI cuStreamGetId(CUstream hStream, unsigned long long *streamId);
24263
26027
  CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags);
26028
+ CUresult CUDAAPI cuStreamGetDevice(CUstream hStream, CUdevice *device);
24264
26029
  CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
26030
+ CUresult CUDAAPI cuStreamGetCtx_v2(CUstream hStream, CUcontext *pCtx, CUgreenCtx *pGreenCtx);
24265
26031
  CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
24266
26032
  CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
24267
26033
  CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags);
@@ -24330,6 +26096,15 @@ CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
24330
26096
 
24331
26097
  CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
24332
26098
  CUresult CUDAAPI cuStreamUpdateCaptureDependencies_v2(CUstream hStream, CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags);
26099
+
26100
+ CUresult CUDAAPI cuMemBatchDecompressAsync(
26101
+ CUmemDecompressParams *paramsArray,
26102
+ size_t count,
26103
+ unsigned int flags,
26104
+ size_t *errorIndex,
26105
+ CUstream stream
26106
+ );
26107
+
24333
26108
  CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags);
24334
26109
 
24335
26110
  #elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
@@ -24344,6 +26119,152 @@ static inline CUresult cuGetProcAddress_v2_ptsz(const char *symbol, void **funcP
24344
26119
  #define cuGetProcAddress_v2 cuGetProcAddress_v2_ptsz
24345
26120
  #endif
24346
26121
 
26122
+ /**
26123
+ * \defgroup CUDA_CHECKPOINT CUDA Checkpointing
26124
+ *
26125
+ * ___MANBRIEF___ CUDA checkpoint and restore functionality of the low-level
26126
+ * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
26127
+ *
26128
+ * This section describes the checkpoint and restore functions of the low-level
26129
+ * CUDA driver application programming interface.
26130
+ *
26131
+ * The CUDA checkpoint and restore APIs provide a way to save and restore GPU
26132
+ * state for full process checkpoints when used with CPU side process
26133
+ * checkpointing solutions. They can also be used to pause GPU work and suspend
26134
+ * a CUDA process to allow other applications to make use of GPU resources.
26135
+ *
26136
+ * Checkpoint and restore capabilities are currently restricted to Linux.
26137
+ *
26138
+ * @{
26139
+ */
26140
+
26141
+ /**
26142
+ * \brief Returns the restore thread ID for a CUDA process
26143
+ *
26144
+ * Returns in \p *tid the thread ID of the CUDA restore thread for the process
26145
+ * specified by \p pid.
26146
+ *
26147
+ * \param pid - The process ID of the CUDA process
26148
+ * \param tid - Returned restore thread ID
26149
+ *
26150
+ * \return
26151
+ * ::CUDA_SUCCESS
26152
+ * ::CUDA_ERROR_INVALID_VALUE
26153
+ * ::CUDA_ERROR_NOT_INITIALIZED
26154
+ * ::CUDA_ERROR_NOT_SUPPORTED
26155
+ */
26156
+ CUresult CUDAAPI cuCheckpointProcessGetRestoreThreadId(int pid, int *tid);
26157
+
26158
+ /**
26159
+ * \brief Returns the process state of a CUDA process
26160
+ *
26161
+ * Returns in \p *state the current state of the CUDA process specified by \p pid.
26162
+ *
26163
+ * \param pid - The process ID of the CUDA process
26164
+ * \param state - Returned CUDA process state
26165
+ *
26166
+ * \return
26167
+ * ::CUDA_SUCCESS
26168
+ * ::CUDA_ERROR_INVALID_VALUE
26169
+ * ::CUDA_ERROR_NOT_INITIALIZED
26170
+ * ::CUDA_ERROR_NOT_SUPPORTED
26171
+ */
26172
+ CUresult CUDAAPI cuCheckpointProcessGetState(int pid, CUprocessState *state);
26173
+
26174
+ /**
26175
+ * \brief Lock a running CUDA process
26176
+ *
26177
+ * Lock the CUDA process specified by \p pid which will block further CUDA API
26178
+ * calls. Process must be in the RUNNING state in order to lock.
26179
+ *
26180
+ * Upon successful return the process will be in the LOCKED state.
26181
+ *
26182
+ * If timeoutMs is specified and the timeout is reached, the process will be left
26183
+ * in the RUNNING state upon return.
26184
+ *
26185
+ * \param pid - The process ID of the CUDA process
26186
+ * \param args - Optional lock operation arguments
26187
+ *
26188
+ * \return
26189
+ * ::CUDA_SUCCESS
26190
+ * ::CUDA_ERROR_INVALID_VALUE
26191
+ * ::CUDA_ERROR_NOT_INITIALIZED
26192
+ * ::CUDA_ERROR_ILLEGAL_STATE
26193
+ * ::CUDA_ERROR_NOT_SUPPORTED
26194
+ * ::CUDA_ERROR_NOT_READY
26195
+ */
26196
+ CUresult CUDAAPI cuCheckpointProcessLock(int pid, CUcheckpointLockArgs *args);
26197
+
26198
+ /**
26199
+ * \brief Checkpoint a CUDA process's GPU memory contents
26200
+ *
26201
+ * Checkpoints a CUDA process specified by \p pid that is in the LOCKED
26202
+ * state. The GPU memory contents will be brought into host memory and all
26203
+ * underlying references will be released. Process must be in the LOCKED state
26204
+ * to checkpoint.
26205
+ *
26206
+ * Upon successful return the process will be in the CHECKPOINTED state.
26207
+ *
26208
+ * \param pid - The process ID of the CUDA process
26209
+ * \param args - Optional checkpoint operation arguments
26210
+ *
26211
+ * \return
26212
+ * ::CUDA_SUCCESS
26213
+ * ::CUDA_ERROR_INVALID_VALUE
26214
+ * ::CUDA_ERROR_NOT_INITIALIZED
26215
+ * ::CUDA_ERROR_ILLEGAL_STATE
26216
+ * ::CUDA_ERROR_NOT_SUPPORTED
26217
+ */
26218
+ CUresult CUDAAPI cuCheckpointProcessCheckpoint(int pid, CUcheckpointCheckpointArgs *args);
26219
+
26220
+ /**
26221
+ * \brief Restore a CUDA process's GPU memory contents from its last checkpoint
26222
+ *
26223
+ * Restores a CUDA process specified by \p pid from its last checkpoint. Process
26224
+ * must be in the CHECKPOINTED state to restore.
26225
+ *
26226
+ * Upon successful return the process will be in the LOCKED state.
26227
+ *
26228
+ * CUDA process restore requires persistence mode to be enabled or ::cuInit to
26229
+ * have been called before execution.
26230
+ *
26231
+ * \param pid - The process ID of the CUDA process
26232
+ * \param args - Optional restore operation arguments
26233
+ *
26234
+ * \return
26235
+ * ::CUDA_SUCCESS
26236
+ * ::CUDA_ERROR_INVALID_VALUE
26237
+ * ::CUDA_ERROR_NOT_INITIALIZED
26238
+ * ::CUDA_ERROR_ILLEGAL_STATE
26239
+ * ::CUDA_ERROR_NOT_SUPPORTED
26240
+ *
26241
+ * \sa
26242
+ * ::cuInit
26243
+ */
26244
+ CUresult CUDAAPI cuCheckpointProcessRestore(int pid, CUcheckpointRestoreArgs *args);
26245
+
26246
+ /**
26247
+ * \brief Unlock a CUDA process to allow CUDA API calls
26248
+ *
26249
+ * Unlocks a process specified by \p pid allowing it to resume making CUDA API
26250
+ * calls. Process must be in the LOCKED state.
26251
+ *
26252
+ * Upon successful return the process will be in the RUNNING state.
26253
+ *
26254
+ * \param pid - The process ID of the CUDA process
26255
+ * \param args - Optional unlock operation arguments
26256
+ *
26257
+ * \return
26258
+ * ::CUDA_SUCCESS
26259
+ * ::CUDA_ERROR_INVALID_VALUE
26260
+ * ::CUDA_ERROR_NOT_INITIALIZED
26261
+ * ::CUDA_ERROR_ILLEGAL_STATE
26262
+ * ::CUDA_ERROR_NOT_SUPPORTED
26263
+ */
26264
+ CUresult CUDAAPI cuCheckpointProcessUnlock(int pid, CUcheckpointUnlockArgs *args);
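Taken together, the documented state machine suggests a flow like the following sketch; pid is the target CUDA process, and passing NULL for the optional args structs is an assumption based on the "optional" wording above:

cuCheckpointProcessLock(pid, NULL);        /* RUNNING -> LOCKED */
cuCheckpointProcessCheckpoint(pid, NULL);  /* LOCKED -> CHECKPOINTED; GPU state to host */
/* ... CPU-side checkpoint/restore tooling runs here ... */
cuCheckpointProcessRestore(pid, NULL);     /* CHECKPOINTED -> LOCKED */
cuCheckpointProcessUnlock(pid, NULL);      /* LOCKED -> RUNNING */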
26265
+
26266
+ /** @} */ /* End CUDA_CHECKPOINT */
26267
+
24347
26268
  #ifdef __cplusplus
24348
26269
  }
24349
26270
  #endif