PyPI - cuda-cccl - Versions diffs - 0.1.3.2.0.dev438__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.1.3.2.0.dev438__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (60) hide show

cuda/cccl/headers/include/cub/util_device.cuh CHANGED Viewed

@@ -47,7 +47,6 @@
 // for backward compatibility
 #include <cub/util_temporary_storage.cuh>
-#include <cuda/std/__cuda/ensure_current_device.h> // IWYU pragma: export
 #include <cuda/std/__type_traits/conditional.h>
 #include <cuda/std/__utility/forward.h>
 #include <cuda/std/array>
@@ -104,7 +103,34 @@ CUB_RUNTIME_FUNCTION inline int CurrentDevice()
 //! @brief RAII helper which saves the current device and switches to the specified device on construction and switches
 //! to the saved device on destruction.
-using SwitchDevice = ::cuda::__ensure_current_device;
+class SwitchDevice
+{
+  int target_device_;
+  int original_device_;
+public:
+  //! @brief Queries the current device and if that is different than @p target_device sets the current device to
+  //! @p target_device
+  SwitchDevice(const int target_device)
+      : target_device_(target_device)
+  {
+    CubDebug(cudaGetDevice(&original_device_));
+    if (original_device_ != target_device_)
+    {
+      CubDebug(cudaSetDevice(target_device_));
+    }
+  }
+  //! @brief If the @p original_device was not equal to @p target_device sets the current device back to
+  //! @p original_device
+  ~SwitchDevice()
+  {
+    if (original_device_ != target_device_)
+    {
+      CubDebug(cudaSetDevice(original_device_));
+    }
+  }
+};
 #  endif // _CCCL_DOXYGEN_INVOKED
@@ -684,16 +710,31 @@ struct KernelConfig
     return launcher_factory.MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
   }
 };
 } // namespace detail
 #endif // !_CCCL_COMPILER(NVRTC)
+namespace detail
+{
+template <typename T>
+struct get_active_policy
+{
+  using type = typename T::ActivePolicy;
+};
+} // namespace detail
 /// Helper for dispatching into a policy chain
 template <int PolicyPtxVersion, typename PolicyT, typename PrevPolicyT>
 struct ChainedPolicy
 {
+private:
+  static constexpr bool have_previous_policy = !::cuda::std::is_same_v<PolicyT, PrevPolicyT>;
+public:
   /// The policy for the active compiler pass
-  using ActivePolicy = ::cuda::std::_If<(CUB_PTX_ARCH < PolicyPtxVersion), typename PrevPolicyT::ActivePolicy, PolicyT>;
+  using ActivePolicy =
+    typename ::cuda::std::_If<(CUB_PTX_ARCH < PolicyPtxVersion && have_previous_policy),
+                              detail::get_active_policy<PrevPolicyT>,
+                              ::cuda::std::type_identity<PolicyT>>::type;
 #if !_CCCL_COMPILER(NVRTC)
   /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
@@ -708,9 +749,12 @@ struct ChainedPolicy
 #  elif defined(NV_TARGET_SM_INTEGER_LIST)
     return runtime_to_compiletime<10, NV_TARGET_SM_INTEGER_LIST>(device_ptx_version, op);
 #  else
-    if (device_ptx_version < PolicyPtxVersion)
+    if constexpr (have_previous_policy)
     {
-      return PrevPolicyT::Invoke(device_ptx_version, op);
+      if (device_ptx_version < PolicyPtxVersion)
+      {
+        return PrevPolicyT::Invoke(device_ptx_version, op);
+      }
     }
     return op.template Invoke<PolicyT>();
 #  endif
@@ -738,7 +782,7 @@ private:
   template <int DevicePtxVersion, typename FunctorT>
   CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT& op)
   {
-    if constexpr (DevicePtxVersion < PolicyPtxVersion)
+    if constexpr (DevicePtxVersion < PolicyPtxVersion && have_previous_policy)
     {
       return PrevPolicyT::template invoke_static<DevicePtxVersion>(op);
     }
@@ -749,34 +793,6 @@ private:
   }
 #endif // !_CCCL_COMPILER(NVRTC)
 };
-/// Helper for dispatching into a policy chain (end-of-chain specialization)
-template <int PolicyPtxVersion, typename PolicyT>
-struct ChainedPolicy<PolicyPtxVersion, PolicyT, PolicyT>
-{
-  template <int, typename, typename>
-  friend struct ChainedPolicy; // befriend primary template, so it can call invoke_static
-  /// The policy for the active compiler pass
-  using ActivePolicy = PolicyT;
-#if !_CCCL_COMPILER(NVRTC)
-  /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
-  template <typename FunctorT>
-  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Invoke(int /*ptx_version*/, FunctorT& op)
-  {
-    return op.template Invoke<PolicyT>();
-  }
-private:
-  template <int, typename FunctorT>
-  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT& op)
-  {
-    return op.template Invoke<PolicyT>();
-  }
-#endif // !_CCCL_COMPILER(NVRTC)
-};
 CUB_NAMESPACE_END
 #if _CCCL_HAS_CUDA_COMPILER() && !_CCCL_COMPILER(NVRTC)

cuda/cccl/headers/include/cuda/__algorithm/copy.h CHANGED Viewed

@@ -38,11 +38,11 @@ enum class source_access_order
 {
 #  if _CCCL_CTK_AT_LEAST(13, 0)
   //! @brief Access source in stream order
-  stream = cudaMemcpySrcAccessOrderStream,
+  stream = ::cudaMemcpySrcAccessOrderStream,
   //! @brief Access source during the copy call, source can be destroyed after the API returns
-  during_api_call = cudaMemcpySrcAccessOrderDuringApiCall,
+  during_api_call = ::cudaMemcpySrcAccessOrderDuringApiCall,
   //! @brief Access source in any order, the order can change across CUDA releases
-  any = cudaMemcpySrcAccessOrderAny,
+  any = ::cudaMemcpySrcAccessOrderAny,
 #  else
   any = 0x3,
 #  endif // _CCCL_CTK_BELOW(13, 0)

cuda/cccl/headers/include/cuda/__device/all_devices.h CHANGED Viewed

@@ -23,7 +23,7 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 #  include <cuda/__device/physical_device.h>
-#  include <cuda/std/__cuda/api_wrapper.h>
+#  include <cuda/__driver/driver_api.h>
 #  include <cuda/std/cassert>
 #  include <cuda/std/detail/libcxx/include/stdexcept>
 #  include <cuda/std/span>
@@ -151,11 +151,8 @@ inline all_devices::operator ::cuda::std::span<const device_ref>() const
 inline const ::std::vector<physical_device>& all_devices::__devices()
 {
-  static const ::std::vector<physical_device> __devices = [] {
-    int __count = 0;
-    _CCCL_TRY_CUDA_API(::cudaGetDeviceCount, "failed to get the count of CUDA devices", &__count);
-    return ::std::vector<physical_device>{__initializer_iterator{0}, __initializer_iterator{__count}};
-  }();
+  static const ::std::vector<physical_device> __devices{
+    __initializer_iterator{0}, __initializer_iterator{::cuda::__driver::__deviceGetCount()}};
   return __devices;
 }
 } // namespace __detail

cuda/cccl/headers/include/cuda/__device/arch_traits.h CHANGED Viewed

@@ -516,7 +516,7 @@ inline constexpr int __highest_known_arch = 120;
     case id::sm_120a:
       return ::cuda::arch::traits<id::sm_120a>();
     default:
-      ::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Traits requested for an unknown architecture");
+      ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Traits requested for an unknown architecture");
       break;
   }
 }
@@ -525,7 +525,7 @@ inline constexpr int __highest_known_arch = 120;
 {
   if (compute_capability < 60 || compute_capability > __highest_known_arch)
   {
-    ::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Compute capability out of range");
+    ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
   }
   return static_cast<id>(compute_capability);
 }
@@ -550,7 +550,7 @@ _CCCL_API inline constexpr id __special_id_for_compute_capability(int value)
     case 120:
       return id::sm_120a;
     default:
-      ::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Compute capability out of range");
+      ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
       break;
   }
 }

cuda/cccl/headers/include/cuda/__device/attributes.h CHANGED Viewed

@@ -24,6 +24,7 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 #  include <cuda/__device/device_ref.h>
+#  include <cuda/__driver/driver_api.h>
 #  include <cuda/std/__cccl/attributes.h>
 #  include <cuda/std/__cuda/api_wrapper.h>
@@ -44,11 +45,10 @@ struct __dev_attr_impl
     return _Attr;
   }
-  [[nodiscard]] type operator()(device_ref __dev_id) const
+  [[nodiscard]] type operator()(device_ref __dev) const
   {
-    int __value = 0;
-    _CCCL_TRY_CUDA_API(::cudaDeviceGetAttribute, "failed to get device attribute", &__value, _Attr, __dev_id.get());
-    return static_cast<type>(__value);
+    return static_cast<type>(::cuda::__driver::__deviceGetAttribute(
+      static_cast<::CUdevice_attribute>(_Attr), ::cuda::__driver::__deviceGet(__dev.get())));
   }
 };
@@ -81,9 +81,9 @@ template <>
 struct __dev_attr<::cudaDevAttrComputeMode> //
     : __dev_attr_impl<::cudaDevAttrComputeMode, ::cudaComputeMode>
 {
-  static constexpr type default_mode           = cudaComputeModeDefault;
-  static constexpr type prohibited_mode        = cudaComputeModeProhibited;
-  static constexpr type exclusive_process_mode = cudaComputeModeExclusiveProcess;
+  static constexpr type default_mode           = ::cudaComputeModeDefault;
+  static constexpr type prohibited_mode        = ::cudaComputeModeProhibited;
+  static constexpr type exclusive_process_mode = ::cudaComputeModeExclusiveProcess;
 };
 template <>
 struct __dev_attr<::cudaDevAttrConcurrentKernels> //

cuda/cccl/headers/include/cuda/__device/device_ref.h CHANGED Viewed

@@ -24,7 +24,6 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 #  include <cuda/__driver/driver_api.h>
 #  include <cuda/__runtime/types.h>
-#  include <cuda/std/__cuda/api_wrapper.h>
 #  include <string>
 #  include <vector>
@@ -143,16 +142,10 @@ public:
   //!
   //! @param __other_dev Device to query the peer access
   //! @return true if it's possible for this device to access the specified device's memory
-  bool has_peer_access_to(device_ref __other_dev) const
+  [[nodiscard]] bool has_peer_access_to(device_ref __other_dev) const
   {
-    int __can_access;
-    _CCCL_TRY_CUDA_API(
-      ::cudaDeviceCanAccessPeer,
-      "Could not query if device can be peer accessed",
-      &__can_access,
-      get(),
-      __other_dev.get());
-    return __can_access;
+    return ::cuda::__driver::__deviceCanAccessPeer(
+      ::cuda::__driver::__deviceGet(get()), ::cuda::__driver::__deviceGet(__other_dev.get()));
   }
   //! @brief Retrieve architecture traits of this device.

cuda/cccl/headers/include/cuda/__driver/driver_api.h CHANGED Viewed

@@ -21,11 +21,12 @@
 #  pragma system_header
 #endif // no system header
-#if _CCCL_HAS_CTK()
+#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 #  include <cuda/std/__exception/cuda_error.h>
 #  include <cuda/std/__internal/namespaces.h>
 #  include <cuda/std/__type_traits/always_false.h>
+#  include <cuda/std/__type_traits/is_same.h>
 #  include <cuda.h>
@@ -41,31 +42,45 @@ _CCCL_BEGIN_NAMESPACE_CUDA_DRIVER
     reinterpret_cast<decltype(::versioned_fn_name)*>(                                           \
       ::cuda::__driver::__get_driver_entry_point(#function_name, major, minor))
+// cudaGetDriverEntryPoint function is deprecated
 _CCCL_SUPPRESS_DEPRECATED_PUSH
-//! @brief Get a driver function pointer for a given API name and optionally specific CUDA version
-//!
-//! For minor version compatibility request the 12.0 version of everything for now, unless requested otherwise
-[[nodiscard]] _CCCL_HOST_API inline void*
-__get_driver_entry_point(const char* __name, [[maybe_unused]] int __major = 12, [[maybe_unused]] int __minor = 0)
+//! @brief Gets the cuGetProcAddress function pointer.
+[[nodiscard]] _CCCL_HOST_API inline auto __getProcAddressFn() -> decltype(cuGetProcAddress)*
 {
-  // TODO switch to dlopen of libcuda.so instead of the below and maybe pair it with cuInit to avoid checking for two
-  // initializations
-  static auto __get_driver_entry_point_fn = reinterpret_cast<decltype(cuGetProcAddress)*>([]() {
-    void* __fn;
-    ::cudaDriverEntryPointQueryResult __result;
-    ::cudaError_t __status = ::cudaGetDriverEntryPoint("cuGetProcAddress", &__fn, ::cudaEnableDefault, &__result);
-    if (__status != ::cudaSuccess || __result != ::cudaDriverEntryPointSuccess)
-    {
-      ::cuda::__throw_cuda_error(::cudaErrorUnknown, "Failed to get cuGetProcAddress");
-    }
-    return __fn;
-  }());
+  // TODO switch to dlopen of libcuda.so instead of the below
+  void* __fn;
+  ::cudaDriverEntryPointQueryResult __result;
+  ::cudaError_t __status = ::cudaGetDriverEntryPoint("cuGetProcAddress", &__fn, ::cudaEnableDefault, &__result);
+  if (__status != ::cudaSuccess || __result != ::cudaDriverEntryPointSuccess)
+  {
+    ::cuda::__throw_cuda_error(::cudaErrorUnknown, "Failed to get cuGetProcAddress");
+  }
+  return reinterpret_cast<decltype(cuGetProcAddress)*>(__fn);
+}
+_CCCL_SUPPRESS_DEPRECATED_POP
+//! @brief Gets the driver entry point.
+//!
+//! @param __get_proc_addr_fn Pointer to cuGetProcAddress function.
+//! @param __name Name of the symbol to get the driver entry point for.
+//! @param __major The major CTK version to get the symbol version for.
+//! @param __minor The minor CTK version to get the symbol version for.
+//!
+//! @return The address of the symbol.
+//!
+//! @throws @c cuda::cuda_error if the symbol cannot be obtained.
+[[nodiscard]] _CCCL_HOST_API inline void* __get_driver_entry_point_impl(
+  decltype(cuGetProcAddress)* __get_proc_addr_fn,
+  const char* __name,
+  [[maybe_unused]] int __major,
+  [[maybe_unused]] int __minor)
+{
   void* __fn;
   ::CUdriverProcAddressQueryResult __result;
   ::CUresult __status =
-    __get_driver_entry_point_fn(__name, &__fn, __major * 1000 + __minor * 10, ::CU_GET_PROC_ADDRESS_DEFAULT, &__result);
+    __get_proc_addr_fn(__name, &__fn, __major * 1000 + __minor * 10, ::CU_GET_PROC_ADDRESS_DEFAULT, &__result);
   if (__status != ::CUDA_SUCCESS || __result != ::CU_GET_PROC_ADDRESS_SUCCESS)
   {
     if (__status == ::CUDA_ERROR_INVALID_VALUE)
@@ -84,8 +99,13 @@ __get_driver_entry_point(const char* __name, [[maybe_unused]] int __major = 12,
   return __fn;
 }
-_CCCL_SUPPRESS_DEPRECATED_POP
+//! @brief CUDA Driver API call wrapper. Calls a given CUDA Driver API and checks the return value.
+//!
+//! @param __fn A CUDA Driver function.
+//! @param __err_msg Error message describing the call if the call fails.
+//! @param __args The arguments to the @c __fn call.
+//!
+//! @throws @c cuda::cuda_error if the function call doesn't return CUDA_SUCCESS.
 template <typename Fn, typename... Args>
 _CCCL_HOST_API inline void __call_driver_fn(Fn __fn, const char* __err_msg, Args... __args)
 {
@@ -96,6 +116,48 @@ _CCCL_HOST_API inline void __call_driver_fn(Fn __fn, const char* __err_msg, Args
   }
 }
+//! @brief Initializes the CUDA Driver.
+//!
+//! @param __get_proc_addr_fn The pointer to cuGetProcAddress function.
+//!
+//! @return A dummy bool value.
+//!
+//! @warning This function should be called only once from __get_driver_entry_point function.
+[[nodiscard]] _CCCL_HOST_API inline bool __init(decltype(cuGetProcAddress)* __get_proc_addr_fn)
+{
+  auto __driver_fn = reinterpret_cast<decltype(::cuInit)*>(
+    ::cuda::__driver::__get_driver_entry_point_impl(__get_proc_addr_fn, "cuInit", 12, 0));
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to initialize CUDA Driver", 0);
+  return true;
+}
+//! @brief Get a driver function pointer for a given API name and optionally specific CUDA version. This function also
+//!        initializes the CUDA Driver.
+//!
+//! @param __name Name of the symbol to get the driver entry point for.
+//! @param __major The major CTK version to get the symbol version for. Defaults to 12.
+//! @param __minor The minor CTK version to get the symbol version for. Defaults to 0.
+//!
+//! @return The address of the symbol.
+//!
+//! @throws @c cuda::cuda_error if the symbol cannot be obtained or the CUDA driver failed to initialize.
+[[nodiscard]] _CCCL_HOST_API inline void*
+__get_driver_entry_point(const char* __name, [[maybe_unused]] int __major = 12, [[maybe_unused]] int __minor = 0)
+{
+  // Get cuGetProcAddress function and call cuInit(0) only on the first call
+  static auto __get_proc_addr_fn      = ::cuda::__driver::__getProcAddressFn();
+  [[maybe_unused]] static auto __init = ::cuda::__driver::__init(__get_proc_addr_fn);
+  return ::cuda::__driver::__get_driver_entry_point_impl(__get_proc_addr_fn, __name, __major, __minor);
+}
+//! @brief Converts CUdevice to ordinal device id.
+//!
+//! @note Currently, CUdevice value is the same as the ordinal device id. But that might change in the future.
+[[nodiscard]] _CCCL_HOST_API inline int __cudevice_to_ordinal(::CUdevice __dev) noexcept
+{
+  return static_cast<int>(__dev);
+}
 // Version management
 [[nodiscard]] _CCCL_HOST_API inline int __getVersion()
@@ -119,6 +181,22 @@ _CCCL_HOST_API inline void __call_driver_fn(Fn __fn, const char* __err_msg, Args
   return __result;
 }
+[[nodiscard]] _CCCL_HOST_API inline ::CUdevice __deviceGetAttribute(::CUdevice_attribute __attr, ::CUdevice __device)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceGetAttribute);
+  int __result;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get device attribute", &__result, __attr, __device);
+  return __result;
+}
+[[nodiscard]] _CCCL_HOST_API inline int __deviceGetCount()
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceGetCount);
+  int __result;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get device count", &__result);
+  return __result;
+}
 _CCCL_HOST_API inline void __deviceGetName(char* __name_out, int __len, int __ordinal)
 {
   static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceGetName);
@@ -178,6 +256,14 @@ _CCCL_HOST_API inline ::CUcontext __ctxPop()
   return __result;
 }
+[[nodiscard]] _CCCL_HOST_API inline ::CUdevice __ctxGetDevice()
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuCtxGetDevice);
+  ::CUdevice __result{};
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get current context", &__result);
+  return __result;
+}
 // Memory management
 _CCCL_HOST_API inline void __memcpyAsync(void* __dst, const void* __src, size_t __count, ::CUstream __stream)
@@ -239,8 +325,71 @@ _CCCL_HOST_API void __memsetAsync(void* __dst, _Tp __value, size_t __count, ::CU
   }
 }
+// Unified Addressing
+// TODO: we don't want to have these functions here, refactoring expected
+template <::CUpointer_attribute _Attr>
+[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto __pointer_attribute_value_type_t_impl() noexcept
+{
+  if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_CONTEXT)
+  {
+    return ::CUcontext{};
+  }
+  else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE)
+  {
+    return ::CUmemorytype{};
+  }
+  else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER || _Attr == ::CU_POINTER_ATTRIBUTE_HOST_POINTER)
+  {
+    return static_cast<void*>(nullptr);
+  }
+  else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_IS_MANAGED || _Attr == ::CU_POINTER_ATTRIBUTE_MAPPED)
+  {
+    return bool{};
+  }
+  else if constexpr (_Attr == ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL)
+  {
+    return int{};
+  }
+  else
+  {
+    static_assert(::cuda::std::__always_false_v<decltype(_Attr)>, "not implemented attribute");
+  }
+}
+template <::CUpointer_attribute _Attr>
+using __pointer_attribute_value_type_t = decltype(::cuda::__driver::__pointer_attribute_value_type_t_impl<_Attr>());
+template <::CUpointer_attribute _Attr>
+[[nodiscard]] _CCCL_HOST_API inline ::cudaError_t
+__pointerGetAttributeNoThrow(__pointer_attribute_value_type_t<_Attr>& __result, const void* __ptr)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuPointerGetAttribute);
+  ::cudaError_t __status{};
+  if constexpr (::cuda::std::is_same_v<__pointer_attribute_value_type_t<_Attr>, bool>)
+  {
+    int __result2{};
+    __status = static_cast<::cudaError_t>(__driver_fn(&__result2, _Attr, reinterpret_cast<::CUdeviceptr>(__ptr)));
+    __result = static_cast<bool>(__result2);
+  }
+  else
+  {
+    __status =
+      static_cast<::cudaError_t>(__driver_fn((void*) &__result, _Attr, reinterpret_cast<::CUdeviceptr>(__ptr)));
+  }
+  return __status;
+}
 // Stream management
+[[nodiscard]] _CCCL_HOST_API inline ::CUstream __streamCreateWithPriority(unsigned __flags, int __priority)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamCreateWithPriority);
+  ::CUstream __stream;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to create a stream", &__stream, __flags, __priority);
+  return __stream;
+}
 _CCCL_HOST_API inline void __streamSynchronize(::CUstream __stream)
 {
   static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamSynchronize);
@@ -294,6 +443,17 @@ struct __ctx_from_stream
 }
 #  endif // _CCCL_CTK_AT_LEAST(12, 5)
+// TODO: make this available since CUDA 12.8
+#  if _CCCL_CTK_AT_LEAST(13, 0)
+[[nodiscard]] _CCCL_HOST_API inline ::CUdevice __streamGetDevice(::CUstream __stream)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION_VERSIONED(cuStreamGetDevice, cuStreamGetDevice, 12, 8);
+  ::CUdevice __result{};
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get the device of the stream", __stream, &__result);
+  return __result;
+}
+#  endif // _CCCL_CTK_AT_LEAST(13, 0)
 _CCCL_HOST_API inline void __streamWaitEvent(::CUstream __stream, ::CUevent __evnt)
 {
   static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamWaitEvent);
@@ -323,31 +483,52 @@ _CCCL_HOST_API inline void __streamWaitEvent(::CUstream __stream, ::CUevent __ev
   return __id;
 }
-// Event management
-_CCCL_HOST_API inline void __eventRecord(::CUevent __evnt, ::CUstream __stream)
-{
-  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventRecord);
-  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to record CUDA event", __evnt, __stream);
-}
-// Destroy calls return error codes to let the calling code decide if the error should be ignored
 [[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __streamDestroyNoThrow(::CUstream __stream)
 {
   static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuStreamDestroy);
   return static_cast<::cudaError_t>(__driver_fn(__stream));
 }
+// Event management
+[[nodiscard]] _CCCL_HOST_API inline ::CUevent __eventCreate(unsigned __flags)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventCreate);
+  ::CUevent __evnt;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to create a CUDA event", &__evnt, __flags);
+  return __evnt;
+}
 [[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __eventDestroyNoThrow(::CUevent __evnt)
 {
   static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventDestroy);
   return static_cast<::cudaError_t>(__driver_fn(__evnt));
 }
-_CCCL_HOST_API inline void __eventElapsedTime(::CUevent __start, ::CUevent __end, float* __ms)
+[[nodiscard]] _CCCL_HOST_API inline float __eventElapsedTime(::CUevent __start, ::CUevent __end)
 {
   static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventElapsedTime);
-  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get CUDA event elapsed time", __ms, __start, __end);
+  float __result;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get event elapsed time", &__result, __start, __end);
+  return __result;
+}
+[[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __eventQueryNoThrow(::CUevent __evnt)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventQuery);
+  return static_cast<::cudaError_t>(__driver_fn(__evnt));
+}
+_CCCL_HOST_API inline void __eventRecord(::CUevent __evnt, ::CUstream __stream)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventRecord);
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to record an event", __evnt, __stream);
+}
+_CCCL_HOST_API inline void __eventSynchronize(::CUevent __evnt)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuEventSynchronize);
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to synchronize an event", __evnt);
 }
 // Library management
@@ -491,6 +672,17 @@ __graphKernelNodeSetAttribute(::CUgraphNode __node, ::CUkernelNodeAttrID __id, c
   _CUDA_DRIVER::__call_driver_fn(__driver_fn, "Failed to set kernel node parameters", __node, __id, &__value);
 }
+// Peer Context Memory Access
+[[nodiscard]] _CCCL_HOST_API inline bool __deviceCanAccessPeer(::CUdevice __dev, ::CUdevice __peer_dev)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceCanAccessPeer);
+  int __result;
+  _CUDA_DRIVER::__call_driver_fn(
+    __driver_fn, "Failed to query if device can access peer's memory", &__result, __dev, __peer_dev);
+  return static_cast<bool>(__result);
+}
 // Green contexts
 #  if _CCCL_CTK_AT_LEAST(12, 5)
@@ -536,6 +728,6 @@ _CCCL_END_NAMESPACE_CUDA_DRIVER
 #  include <cuda/std/__cccl/epilogue.h>
-#endif // _CCCL_HAS_CTK()
+#endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 #endif // _CUDA___DRIVER_DRIVER_API_H

cuda/cccl/headers/include/cuda/__event/event.h CHANGED Viewed

@@ -23,10 +23,10 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+#  include <cuda/__driver/driver_api.h>
 #  include <cuda/__event/event_ref.h>
 #  include <cuda/__runtime/ensure_current_context.h>
 #  include <cuda/__utility/no_init.h>
-#  include <cuda/std/__cuda/api_wrapper.h>
 #  include <cuda/std/cstddef>
 #  include <cuda/std/utility>
@@ -43,11 +43,11 @@ class event : public event_ref
 public:
   //! @brief Flags to use when creating the event.
-  enum class flags : unsigned int
+  enum class flags : unsigned
   {
     none          = cudaEventDefault,
     blocking_sync = cudaEventBlockingSync,
-    interprocess  = cudaEventInterprocess
+    interprocess  = cudaEventInterprocess,
   };
   //! @brief Construct a new `event` object with timing disabled, and record
@@ -141,7 +141,7 @@ public:
   [[nodiscard]] friend constexpr flags operator|(flags __lhs, flags __rhs) noexcept
   {
-    return static_cast<flags>(static_cast<unsigned int>(__lhs) | static_cast<unsigned int>(__rhs));
+    return static_cast<flags>(static_cast<unsigned>(__lhs) | static_cast<unsigned>(__rhs));
   }
 private:
@@ -151,14 +151,13 @@ private:
       : event_ref(__evnt)
   {}
-  explicit event(stream_ref __stream, unsigned int __flags);
+  explicit event(stream_ref __stream, unsigned __flags);
-  explicit event(device_ref __device, unsigned int __flags)
+  explicit event(device_ref __device, unsigned __flags)
       : event_ref(::cudaEvent_t{})
   {
     [[maybe_unused]] __ensure_current_context __ctx_setter(__device);
-    _CCCL_TRY_CUDA_API(
-      ::cudaEventCreateWithFlags, "Failed to create CUDA event", &__event_, static_cast<unsigned int>(__flags));
+    __event_ = ::cuda::__driver::__eventCreate(static_cast<unsigned>(__flags));
   }
 };