PyPI - cuda-cccl - Versions diffs - 0.1.3.2.0.dev438__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.1.3.2.0.dev438__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (177) hide show

cuda/cccl/headers/include/cuda/__driver/driver_api.h CHANGED Viewed

@@ -21,11 +21,12 @@
 #  pragma system_header
 #endif // no system header
-#if _CCCL_HAS_CTK()
+#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 #  include <cuda/std/__exception/cuda_error.h>
 #  include <cuda/std/__internal/namespaces.h>
 #  include <cuda/std/__type_traits/always_false.h>
+#  include <cuda/std/__type_traits/is_same.h>
 #  include <cuda.h>
@@ -41,31 +42,45 @@ _CCCL_BEGIN_NAMESPACE_CUDA_DRIVER
     reinterpret_cast<decltype(::versioned_fn_name)*>(                                           \
       ::cuda::__driver::__get_driver_entry_point(#function_name, major, minor))
+// cudaGetDriverEntryPoint function is deprecated
 _CCCL_SUPPRESS_DEPRECATED_PUSH
-//! @brief Get a driver function pointer for a given API name and optionally specific CUDA version
-//!
-//! For minor version compatibility request the 12.0 version of everything for now, unless requested otherwise
-[[nodiscard]] _CCCL_HOST_API inline void*
-__get_driver_entry_point(const char* __name, [[maybe_unused]] int __major = 12, [[maybe_unused]] int __minor = 0)
+//! @brief Gets the cuGetProcAddress function pointer.
+[[nodiscard]] _CCCL_HOST_API inline auto __getProcAddressFn() -> decltype(cuGetProcAddress)*
 {
-  // TODO switch to dlopen of libcuda.so instead of the below and maybe pair it with cuInit to avoid checking for two
-  // initializations
-  static auto __get_driver_entry_point_fn = reinterpret_cast<decltype(cuGetProcAddress)*>([]() {
-    void* __fn;
-    ::cudaDriverEntryPointQueryResult __result;
-    ::cudaError_t __status = ::cudaGetDriverEntryPoint("cuGetProcAddress", &__fn, ::cudaEnableDefault, &__result);
-    if (__status != ::cudaSuccess || __result != ::cudaDriverEntryPointSuccess)
-    {
-      ::cuda::__throw_cuda_error(::cudaErrorUnknown, "Failed to get cuGetProcAddress");
-    }
-    return __fn;
-  }());
+  // TODO switch to dlopen of libcuda.so instead of the below
+  void* __fn;
+  ::cudaDriverEntryPointQueryResult __result;
+  ::cudaError_t __status = ::cudaGetDriverEntryPoint("cuGetProcAddress", &__fn, ::cudaEnableDefault, &__result);
+  if (__status != ::cudaSuccess || __result != ::cudaDriverEntryPointSuccess)
+  {
+    ::cuda::__throw_cuda_error(::cudaErrorUnknown, "Failed to get cuGetProcAddress");
+  }
+  return reinterpret_cast<decltype(cuGetProcAddress)*>(__fn);
+}
+_CCCL_SUPPRESS_DEPRECATED_POP
+//! @brief Gets the driver entry point.
+//!
+//! @param __get_proc_addr_fn Pointer to cuGetProcAddress function.
+//! @param __name Name of the symbol to get the driver entry point for.
+//! @param __major The major CTK version to get the symbol version for.
+//! @param __minor The minor CTK version to get the symbol version for.
+//!
+//! @return The address of the symbol.
+//!
+//! @throws @c cuda::cuda_error if the symbol cannot be obtained.
+[[nodiscard]] _CCCL_HOST_API inline void* __get_driver_entry_point_impl(
+  decltype(cuGetProcAddress)* __get_proc_addr_fn,
+  const char* __name,
+  [[maybe_unused]] int __major,
+  [[maybe_unused]] int __minor)
+{
   void* __fn;
   ::CUdriverProcAddressQueryResult __result;
   ::CUresult __status =
-    __get_driver_entry_point_fn(__name, &__fn, __major * 1000 + __minor * 10, ::CU_GET_PROC_ADDRESS_DEFAULT, &__result);
+    __get_proc_addr_fn(__name, &__fn, __major * 1000 + __minor * 10, ::CU_GET_PROC_ADDRESS_DEFAULT, &__result);
   if (__status != ::CUDA_SUCCESS || __result != ::CU_GET_PROC_ADDRESS_SUCCESS)
   {
     if (__status == ::CUDA_ERROR_INVALID_VALUE)
@@ -84,8 +99,13 @@ __get_driver_entry_point(const char* __name, [[maybe_unused]] int __major = 12,
   return __fn;
 }
-_CCCL_SUPPRESS_DEPRECATED_POP
+//! @brief CUDA Driver API call wrapper. Calls a given CUDA Driver API and checks the return value.
+//!
+//! @param __fn A CUDA Driver function.
+//! @param __err_msg Error message describing the call if the call fails.
+//! @param __args The arguments to the @c __fn call.
+//!
+//! @throws @c cuda::cuda_error if the function call doesn't return CUDA_SUCCESS.
 template <typename Fn, typename... Args>
 _CCCL_HOST_API inline void __call_driver_fn(Fn __fn, const char* __err_msg, Args... __args)
 {
@@ -96,6 +116,48 @@ _CCCL_HOST_API inline void __call_driver_fn(Fn __fn, const char* __err_msg, Args
   }
 }
+//! @brief Initializes the CUDA Driver.
+//!
+//! @param __get_proc_addr_fn The pointer to cuGetProcAddress function.
+//!
+//! @return A dummy bool value.
+//!
+//! @warning This function should be called only once from __get_driver_entry_point function.
+[[nodiscard]] _CCCL_HOST_API inline bool __init(decltype(cuGetProcAddress)* __get_proc_addr_fn)
+{
+  auto __driver_fn = reinterpret_cast<decltype(::cuInit)*>(
+    ::cuda::__driver::__get_driver_entry_point_impl(__get_proc_addr_fn, "cuInit", 12, 0));
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to initialize CUDA Driver", 0);
+  return true;
+}
+//! @brief Get a driver function pointer for a given API name and optionally specific CUDA version. This function also
+//!        initializes the CUDA Driver.
+//!
+//! @param __name Name of the symbol to get the driver entry point for.
+//! @param __major The major CTK version to get the symbol version for. Defaults to 12.
+//! @param __minor The minor CTK version to get the symbol version for. Defaults to 0.
+//!
+//! @return The address of the symbol.
+//!
+//! @throws @c cuda::cuda_error if the symbol cannot be obtained or the CUDA driver failed to initialize.
+[[nodiscard]] _CCCL_HOST_API inline void*
+__get_driver_entry_point(const char* __name, [[maybe_unused]] int __major = 12, [[maybe_unused]] int __minor = 0)
+{
+  // Get cuGetProcAddress function and call cuInit(0) only on the first call
+  static auto __get_proc_addr_fn      = ::cuda::__driver::__getProcAddressFn();
+  [[maybe_unused]] static auto __init = ::cuda::__driver::__init(__get_proc_addr_fn);
+  return ::cuda::__driver::__get_driver_entry_point_impl(__get_proc_addr_fn, __name, __major, __minor);
+}
+//! @brief Converts CUdevice to ordinal device id.
+//!
+//! @note Currently, CUdevice value is the same as the ordinal device id. But that might change in the future.
+[[nodiscard]] _CCCL_HOST_API inline int __cudevice_to_ordinal(::CUdevice __dev) noexcept
+{
+  return static_cast<int>(__dev);
+}
 // Version management
 [[nodiscard]] _CCCL_HOST_API inline int __getVersion()
@@ -119,6 +181,22 @@ _CCCL_HOST_API inline void __call_driver_fn(Fn __fn, const char* __err_msg, Args
   return __result;
 }
+[[nodiscard]] _CCCL_HOST_API inline ::CUdevice __deviceGetAttribute(::CUdevice_attribute __attr, ::CUdevice __device)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceGetAttribute);
+  int __result;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get device attribute", &__result, __attr, __device);
+  return __result;
+}
+[[nodiscard]] _CCCL_HOST_API inline int __deviceGetCount()
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceGetCount);
+  int __result;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get device count", &__result);
+  return __result;
+}
 _CCCL_HOST_API inline void __deviceGetName(char* __name_out, int __len, int __ordinal)
 {
   static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceGetName);
@@ -138,11 +216,10 @@ _CCCL_HOST_API inline void __deviceGetName(char* __name_out, int __len, int __or
   return __result;
 }
-_CCCL_HOST_API inline void __primaryCtxRelease(::CUdevice __dev)
+[[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __primaryCtxReleaseNoThrow(::CUdevice __dev)
 {
   static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDevicePrimaryCtxRelease);
-  // TODO we might need to ignore failure here
-  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to release context for a device", __dev);
+  return static_cast<::cudaError_t>(__driver_fn(__dev));
 }
 [[nodiscard]] _CCCL_HOST_API inline bool __isPrimaryCtxActive(::CUdevice __dev)
@@ -178,6 +255,14 @@ _CCCL_HOST_API inline ::CUcontext __ctxPop()
   return __result;
 }
+[[nodiscard]] _CCCL_HOST_API inline ::CUdevice __ctxGetDevice()
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuCtxGetDevice);
+  ::CUdevice __result{};
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get current context", &__result);
+  return __result;
+}
 // Memory management
 _CCCL_HOST_API inline void __memcpyAsync(void* __dst, const void* __src, size_t __count, ::CUstream __stream)
@@ -239,8 +324,174 @@ _CCCL_HOST_API void __memsetAsync(void* __dst, _Tp __value, size_t __count, ::CU
   }
 }
+_CCCL_HOST_API inline ::cudaError_t __mempoolCreateNoThrow(::CUmemoryPool* __pool, ::CUmemPoolProps* __props)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolCreate);
+  return static_cast<::cudaError_t>(__driver_fn(__pool, __props));
+}
+_CCCL_HOST_API inline void __mempoolSetAttribute(::CUmemoryPool __pool, ::CUmemPool_attribute __attr, void* __value)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolSetAttribute);
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to set attribute for a memory pool", __pool, __attr, __value);
+}
+_CCCL_HOST_API inline size_t __mempoolGetAttribute(::CUmemoryPool __pool, ::CUmemPool_attribute __attr)
+{
+  size_t __value          = 0;
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolGetAttribute);
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get attribute for a memory pool", __pool, __attr, &__value);
+  return __value;
+}
+_CCCL_HOST_API inline void __mempoolDestroy(::CUmemoryPool __pool)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolDestroy);
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to destroy a memory pool", __pool);
+}
+_CCCL_HOST_API inline ::CUdeviceptr
+__mallocFromPoolAsync(::cuda::std::size_t __bytes, ::CUmemoryPool __pool, ::CUstream __stream)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemAllocFromPoolAsync);
+  ::CUdeviceptr __result  = 0;
+  ::cuda::__driver::__call_driver_fn(
+    __driver_fn, "Failed to allocate memory from a memory pool", &__result, __bytes, __pool, __stream);
+  return __result;
+}
+_CCCL_HOST_API inline void __mempoolTrimTo(::CUmemoryPool __pool, ::cuda::std::size_t __min_bytes_to_keep)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolTrimTo);
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to trim a memory pool", __pool, __min_bytes_to_keep);
+}
+_CCCL_HOST_API inline ::cudaError_t __freeAsyncNoThrow(::CUdeviceptr __dptr, ::CUstream __stream)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemFreeAsync);
+  return static_cast<::cudaError_t>(__driver_fn(__dptr, __stream));
+}
+_CCCL_HOST_API inline void __mempoolSetAccess(::CUmemoryPool __pool, ::CUmemAccessDesc* __descs, ::size_t __count)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolSetAccess);
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to set access of a memory pool", __pool, __descs, __count);
+}
+_CCCL_HOST_API inline ::CUmemAccess_flags __mempoolGetAccess(::CUmemoryPool __pool, ::CUmemLocation* __location)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolGetAccess);
+  ::CUmemAccess_flags __flags;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get access of a memory pool", &__flags, __pool, __location);
+  return __flags;
+}
+#  if _CCCL_CTK_AT_LEAST(13, 0)
+_CCCL_HOST_API inline ::CUmemoryPool
+__getDefaultMemPool(CUmemLocation __location, CUmemAllocationType_enum __allocation_type)
+{
+  static auto __driver_fn =
+    _CCCLRT_GET_DRIVER_FUNCTION_VERSIONED(cuMemGetDefaultMemPool, cuMemGetDefaultMemPool, 13, 0);
+  ::CUmemoryPool __result = nullptr;
+  ::cuda::__driver::__call_driver_fn(
+    __driver_fn, "Failed to get default memory pool", &__result, &__location, __allocation_type);
+  return __result;
+}
+#  endif // _CCCL_CTK_AT_LEAST(13, 0)
+_CCCL_HOST_API inline ::CUdeviceptr __mallocManaged(::cuda::std::size_t __bytes, unsigned int __flags)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemAllocManaged);
+  ::CUdeviceptr __result  = 0;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to allocate managed memory", &__result, __bytes, __flags);
+  return __result;
+}
+_CCCL_HOST_API inline void* __mallocHost(::cuda::std::size_t __bytes)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemAllocHost);
+  void* __result          = nullptr;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to allocate host memory", &__result, __bytes);
+  return __result;
+}
+_CCCL_HOST_API inline ::cudaError_t __freeNoThrow(::CUdeviceptr __dptr)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemFree);
+  return static_cast<::cudaError_t>(__driver_fn(__dptr));
+}
+_CCCL_HOST_API inline ::cudaError_t __freeHostNoThrow(void* __dptr)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemFreeHost);
+  return static_cast<::cudaError_t>(__driver_fn(__dptr));
+}
+// Unified Addressing
+// TODO: we don't want to have these functions here, refactoring expected
+template <::CUpointer_attribute _Attr>
+[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto __pointer_attribute_value_type_t_impl() noexcept
+{
+  if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_CONTEXT)
+  {
+    return ::CUcontext{};
+  }
+  else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE)
+  {
+    return ::CUmemorytype{};
+  }
+  else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER || _Attr == ::CU_POINTER_ATTRIBUTE_HOST_POINTER)
+  {
+    return static_cast<void*>(nullptr);
+  }
+  else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_IS_MANAGED || _Attr == ::CU_POINTER_ATTRIBUTE_MAPPED)
+  {
+    return bool{};
+  }
+  else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL)
+  {
+    return int{};
+  }
+  else
+  {
+    static_assert(::cuda::std::__always_false_v<decltype(_Attr)>, "not implemented attribute");
+  }
+}
+template <::CUpointer_attribute _Attr>
+using __pointer_attribute_value_type_t = decltype(::cuda::__driver::__pointer_attribute_value_type_t_impl<_Attr>());
+template <::CUpointer_attribute _Attr>
+[[nodiscard]] _CCCL_HOST_API inline ::cudaError_t
+__pointerGetAttributeNoThrow(__pointer_attribute_value_type_t<_Attr>& __result, const void* __ptr)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuPointerGetAttribute);
+  ::cudaError_t __status{};
+  if constexpr (::cuda::std::is_same_v<__pointer_attribute_value_type_t<_Attr>, bool>)
+  {
+    int __result2{};
+    __status = static_cast<::cudaError_t>(__driver_fn(&__result2, _Attr, reinterpret_cast<::CUdeviceptr>(__ptr)));
+    __result = static_cast<bool>(__result2);
+  }
+  else
+  {
+    __status =
+      static_cast<::cudaError_t>(__driver_fn((void*) &__result, _Attr, reinterpret_cast<::CUdeviceptr>(__ptr)));
+  }
+  return __status;
+}
 // Stream management
+[[nodiscard]] _CCCL_HOST_API inline ::CUstream __streamCreateWithPriority(unsigned __flags, int __priority)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamCreateWithPriority);
+  ::CUstream __stream;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to create a stream", &__stream, __flags, __priority);
+  return __stream;
+}
 _CCCL_HOST_API inline void __streamSynchronize(::CUstream __stream)
 {
   static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamSynchronize);
@@ -294,6 +545,17 @@ struct __ctx_from_stream
 }
 #  endif // _CCCL_CTK_AT_LEAST(12, 5)
+// TODO: make this available since CUDA 12.8
+#  if _CCCL_CTK_AT_LEAST(13, 0)
+[[nodiscard]] _CCCL_HOST_API inline ::CUdevice __streamGetDevice(::CUstream __stream)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION_VERSIONED(cuStreamGetDevice, cuStreamGetDevice, 12, 8);
+  ::CUdevice __result{};
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get the device of the stream", __stream, &__result);
+  return __result;
+}
+#  endif // _CCCL_CTK_AT_LEAST(13, 0)
 _CCCL_HOST_API inline void __streamWaitEvent(::CUstream __stream, ::CUevent __evnt)
 {
   static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamWaitEvent);
@@ -323,31 +585,52 @@ _CCCL_HOST_API inline void __streamWaitEvent(::CUstream __stream, ::CUevent __ev
   return __id;
 }
-// Event management
-_CCCL_HOST_API inline void __eventRecord(::CUevent __evnt, ::CUstream __stream)
-{
-  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventRecord);
-  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to record CUDA event", __evnt, __stream);
-}
-// Destroy calls return error codes to let the calling code decide if the error should be ignored
 [[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __streamDestroyNoThrow(::CUstream __stream)
 {
   static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamDestroy);
   return static_cast<::cudaError_t>(__driver_fn(__stream));
 }
+// Event management
+[[nodiscard]] _CCCL_HOST_API inline ::CUevent __eventCreate(unsigned __flags)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventCreate);
+  ::CUevent __evnt;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to create a CUDA event", &__evnt, __flags);
+  return __evnt;
+}
 [[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __eventDestroyNoThrow(::CUevent __evnt)
 {
   static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventDestroy);
   return static_cast<::cudaError_t>(__driver_fn(__evnt));
 }
-_CCCL_HOST_API inline void __eventElapsedTime(::CUevent __start, ::CUevent __end, float* __ms)
+[[nodiscard]] _CCCL_HOST_API inline float __eventElapsedTime(::CUevent __start, ::CUevent __end)
 {
   static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventElapsedTime);
-  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get CUDA event elapsed time", __ms, __start, __end);
+  float __result;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get event elapsed time", &__result, __start, __end);
+  return __result;
+}
+[[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __eventQueryNoThrow(::CUevent __evnt)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventQuery);
+  return static_cast<::cudaError_t>(__driver_fn(__evnt));
+}
+_CCCL_HOST_API inline void __eventRecord(::CUevent __evnt, ::CUstream __stream)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventRecord);
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to record an event", __evnt, __stream);
+}
+_CCCL_HOST_API inline void __eventSynchronize(::CUevent __evnt)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventSynchronize);
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to synchronize an event", __evnt);
 }
 // Library management
@@ -491,6 +774,17 @@ __graphKernelNodeSetAttribute(::CUgraphNode __node, ::CUkernelNodeAttrID __id, c
   _CUDA_DRIVER::__call_driver_fn(__driver_fn, "Failed to set kernel node parameters", __node, __id, &__value);
 }
+// Peer Context Memory Access
+[[nodiscard]] _CCCL_HOST_API inline bool __deviceCanAccessPeer(::CUdevice __dev, ::CUdevice __peer_dev)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceCanAccessPeer);
+  int __result;
+  _CUDA_DRIVER::__call_driver_fn(
+    __driver_fn, "Failed to query if device can access peer's memory", &__result, __dev, __peer_dev);
+  return static_cast<bool>(__result);
+}
 // Green contexts
 #  if _CCCL_CTK_AT_LEAST(12, 5)
@@ -536,6 +830,6 @@ _CCCL_END_NAMESPACE_CUDA_DRIVER
 #  include <cuda/std/__cccl/epilogue.h>
-#endif // _CCCL_HAS_CTK()
+#endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 #endif // _CUDA___DRIVER_DRIVER_API_H

cuda/cccl/headers/include/cuda/__event/event.h CHANGED Viewed

@@ -23,10 +23,11 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+#  include <cuda/__device/device_ref.h>
+#  include <cuda/__driver/driver_api.h>
 #  include <cuda/__event/event_ref.h>
 #  include <cuda/__runtime/ensure_current_context.h>
 #  include <cuda/__utility/no_init.h>
-#  include <cuda/std/__cuda/api_wrapper.h>
 #  include <cuda/std/cstddef>
 #  include <cuda/std/utility>
@@ -43,11 +44,11 @@ class event : public event_ref
 public:
   //! @brief Flags to use when creating the event.
-  enum class flags : unsigned int
+  enum class flags : unsigned
   {
     none          = cudaEventDefault,
     blocking_sync = cudaEventBlockingSync,
-    interprocess  = cudaEventInterprocess
+    interprocess  = cudaEventInterprocess,
   };
   //! @brief Construct a new `event` object with timing disabled, and record
@@ -141,7 +142,7 @@ public:
   [[nodiscard]] friend constexpr flags operator|(flags __lhs, flags __rhs) noexcept
   {
-    return static_cast<flags>(static_cast<unsigned int>(__lhs) | static_cast<unsigned int>(__rhs));
+    return static_cast<flags>(static_cast<unsigned>(__lhs) | static_cast<unsigned>(__rhs));
   }
 private:
@@ -151,14 +152,13 @@ private:
       : event_ref(__evnt)
   {}
-  explicit event(stream_ref __stream, unsigned int __flags);
+  explicit event(stream_ref __stream, unsigned __flags);
-  explicit event(device_ref __device, unsigned int __flags)
+  explicit event(device_ref __device, unsigned __flags)
       : event_ref(::cudaEvent_t{})
   {
     [[maybe_unused]] __ensure_current_context __ctx_setter(__device);
-    _CCCL_TRY_CUDA_API(
-      ::cudaEventCreateWithFlags, "Failed to create CUDA event", &__event_, static_cast<unsigned int>(__flags));
+    __event_ = ::cuda::__driver::__eventCreate(static_cast<unsigned>(__flags));
   }
 };

cuda/cccl/headers/include/cuda/__event/event_ref.h CHANGED Viewed

@@ -24,7 +24,6 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 #  include <cuda/__driver/driver_api.h>
-#  include <cuda/std/__cuda/api_wrapper.h>
 #  include <cuda/std/cassert>
 #  include <cuda/std/cstddef>
 #  include <cuda/std/utility>
@@ -80,7 +79,7 @@ public:
   _CCCL_HOST_API void sync() const
   {
     _CCCL_ASSERT(__event_ != nullptr, "cuda::event_ref::sync no event set");
-    _CCCL_TRY_CUDA_API(::cudaEventSynchronize, "Failed to wait for CUDA event", __event_);
+    ::cuda::__driver::__eventSynchronize(__event_);
   }
   //! @brief Checks if all the work in the stream prior to the record of the event has completed.
@@ -91,12 +90,12 @@ public:
   [[nodiscard]] _CCCL_HOST_API bool is_done() const
   {
     _CCCL_ASSERT(__event_ != nullptr, "cuda::event_ref::sync no event set");
-    cudaError_t __status = ::cudaEventQuery(__event_);
-    if (__status == cudaSuccess)
+    ::cudaError_t __status = ::cuda::__driver::__eventQueryNoThrow(__event_);
+    if (__status == ::cudaSuccess)
     {
       return true;
     }
-    else if (__status == cudaErrorNotReady)
+    else if (__status == ::cudaErrorNotReady)
     {
       return false;
     }

cuda/cccl/headers/include/cuda/__event/timed_event.h CHANGED Viewed

@@ -26,10 +26,11 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+#  include <cuda/__device/device_ref.h>
+#  include <cuda/__driver/driver_api.h>
 #  include <cuda/__event/event.h>
 #  include <cuda/__utility/no_init.h>
 #  include <cuda/std/__chrono/duration.h>
-#  include <cuda/std/__cuda/api_wrapper.h>
 #  include <cuda/std/cstddef>
 #  include <cuda/std/__cccl/prologue.h>
@@ -51,7 +52,7 @@ public:
   //!
   //! @throws cuda_error if the event creation fails.
   explicit timed_event(device_ref __device, flags __flags = flags::none)
-      : event(__device, static_cast<unsigned int>(__flags))
+      : event(__device, static_cast<unsigned>(__flags))
   {}
   //! @brief Construct a new `timed_event` object into the moved-from state.
@@ -96,8 +97,7 @@ public:
   //! @note The elapsed time has a resolution of approximately 0.5 microseconds.
   [[nodiscard]] friend ::cuda::std::chrono::nanoseconds operator-(const timed_event& __end, const timed_event& __start)
   {
-    float __ms = 0.0f;
-    ::cuda::__driver::__eventElapsedTime(__start.get(), __end.get(), &__ms);
+    const auto __ms = ::cuda::__driver::__eventElapsedTime(__start.get(), __end.get());
     return ::cuda::std::chrono::nanoseconds(static_cast<::cuda::std::chrono::nanoseconds::rep>(__ms * 1'000'000.0));
   }

cuda/cccl/headers/include/cuda/__fwd/devices.h ADDED Viewed

@@ -0,0 +1,44 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+#ifndef _CUDA___FWD_DEVICES_H
+#define _CUDA___FWD_DEVICES_H
+#include <cuda/std/detail/__config>
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+#include <cuda/std/__fwd/span.h>
+#include <cuda/std/__cccl/prologue.h>
+_CCCL_BEGIN_NAMESPACE_CUDA
+class __physical_device;
+class device_ref;
+template <::cudaDeviceAttr _Attr>
+struct __dev_attr;
+namespace arch
+{
+struct traits_t;
+} // namespace arch
+_CCCL_END_NAMESPACE_CUDA
+#include <cuda/std/__cccl/epilogue.h>
+#endif // _CUDA___FWD_DEVICES_H

cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h CHANGED Viewed

@@ -42,6 +42,15 @@ inline constexpr bool __is_zip_function = false;
 template <class _Fn>
 inline constexpr bool __is_zip_function<zip_function<_Fn>> = true;
+template <class _Fn, class... _Iterators>
+class zip_transform_iterator;
+template <class>
+inline constexpr bool __is_zip_transform_iterator = false;
+template <class _Fn, class... _Iterators>
+inline constexpr bool __is_zip_transform_iterator<zip_transform_iterator<_Fn, _Iterators...>> = true;
 _CCCL_END_NAMESPACE_CUDA
 #include <cuda/std/__cccl/epilogue.h>

cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h CHANGED Viewed

@@ -159,11 +159,11 @@ public:
   ::cuda::std::ranges::__movable_box<_OutputFn> __output_func_{};
   using iterator_concept = ::cuda::std::conditional_t<
-    ::cuda::std::random_access_iterator<_Iter>,
+    ::cuda::std::__has_random_access_traversal<_Iter>,
     ::cuda::std::random_access_iterator_tag,
-    ::cuda::std::conditional_t<::cuda::std::bidirectional_iterator<_Iter>,
+    ::cuda::std::conditional_t<::cuda::std::__has_bidirectional_traversal<_Iter>,
                                ::cuda::std::bidirectional_iterator_tag,
-                               ::cuda::std::conditional_t<::cuda::std::forward_iterator<_Iter>,
+                               ::cuda::std::conditional_t<::cuda::std::__has_forward_traversal<_Iter>,
                                                           ::cuda::std::forward_iterator_tag,
                                                           ::cuda::std::output_iterator_tag>>>;
   using iterator_category = ::cuda::std::output_iterator_tag;