PyPI - cuda-cccl - Versions diffs - 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (144) hide show

cuda/cccl/headers/include/cuda/__device/device_ref.h CHANGED Viewed

@@ -11,7 +11,7 @@
 #ifndef _CUDA___DEVICE_DEVICE_REF_H
 #define _CUDA___DEVICE_DEVICE_REF_H
-#include <cuda/__cccl_config>
+#include <cuda/std/detail/__config>
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
@@ -22,44 +22,32 @@
 #endif // no system header
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 #  include <cuda/__driver/driver_api.h>
+#  include <cuda/__fwd/devices.h>
 #  include <cuda/__runtime/types.h>
-#  include <string>
-#  include <vector>
+#  include <cuda/std/span>
+#  include <cuda/std/string_view>
 #  include <cuda/std/__cccl/prologue.h>
 _CCCL_BEGIN_NAMESPACE_CUDA
-class physical_device;
-namespace arch
-{
-struct traits_t;
-} // namespace arch
-namespace __detail
-{
-template <::cudaDeviceAttr _Attr>
-struct __dev_attr;
-} // namespace __detail
 //! @brief A non-owning representation of a CUDA device
 class device_ref
 {
-  friend class physical_device;
   int __id_ = 0;
 public:
   //! @brief Create a `device_ref` object from a native device ordinal.
-  /*implicit*/ constexpr device_ref(int __id) noexcept
+  /*implicit*/ _CCCL_HOST_API constexpr device_ref(int __id) noexcept
       : __id_(__id)
   {}
   //! @brief Retrieve the native ordinal of the `device_ref`
   //!
   //! @return int The native device ordinal held by the `device_ref` object
-  [[nodiscard]] constexpr int get() const noexcept
+  [[nodiscard]] _CCCL_HOST_API constexpr int get() const noexcept
   {
     return __id_;
   }
@@ -72,7 +60,7 @@ public:
   //! @param __lhs The first `device_ref` to compare
   //! @param __rhs The second `device_ref` to compare
   //! @return true if `lhs` and `rhs` refer to the same device ordinal
-  [[nodiscard]] friend constexpr bool operator==(device_ref __lhs, device_ref __rhs) noexcept
+  [[nodiscard]] friend _CCCL_HOST_API constexpr bool operator==(device_ref __lhs, device_ref __rhs) noexcept
   {
     return __lhs.__id_ == __rhs.__id_;
   }
@@ -86,7 +74,7 @@ public:
   //! @param __lhs The first `device_ref` to compare
   //! @param __rhs The second `device_ref` to compare
   //! @return true if `lhs` and `rhs` refer to different device ordinal
-  [[nodiscard]] constexpr friend bool operator!=(device_ref __lhs, device_ref __rhs) noexcept
+  [[nodiscard]] friend _CCCL_HOST_API constexpr bool operator!=(device_ref __lhs, device_ref __rhs) noexcept
   {
     return __lhs.__id_ != __rhs.__id_;
   }
@@ -101,38 +89,35 @@ public:
   //!
   //! @sa device::attrs
   template <typename _Attr>
-  [[nodiscard]] auto attribute(_Attr __attr) const
+  [[nodiscard]] _CCCL_HOST_API auto attribute(_Attr __attr) const
   {
     return __attr(*this);
   }
   //! @overload
   template <::cudaDeviceAttr _Attr>
-  [[nodiscard]] auto attribute() const
+  [[nodiscard]] _CCCL_HOST_API auto attribute() const
   {
-    return attribute(__detail::__dev_attr<_Attr>());
+    return attribute(__dev_attr<_Attr>());
   }
   //! @brief Retrieve the memory location of this device
   //!
   //! @return The memory location of this device
-  [[nodiscard]] operator memory_location() const noexcept
+  [[nodiscard]] _CCCL_HOST_API operator memory_location() const noexcept
   {
     return memory_location{::cudaMemLocationTypeDevice, get()};
   }
-  //! @brief Retrieve string with the name of this device.
-  //!
-  //! @return String containing the name of this device.
-  [[nodiscard]] ::std::string name() const
-  {
-    constexpr int __max_name_length = 256;
-    ::std::string __name(256, 0);
+  //! @brief Initializes the primary context of the device.
+  _CCCL_HOST_API void init() const; // implemented in <cuda/__device/physical_device.h> to avoid circular dependency
-    // For some reason there is no separate name query in CUDA runtime
-    ::cuda::__driver::__deviceGetName(__name.data(), __max_name_length, get());
-    return __name;
-  }
+  //! @brief Retrieve the name of this device.
+  //!
+  //! @return String view containing the name of this device.
+  [[nodiscard]] _CCCL_HOST_API ::cuda::std::string_view name() const; // implemented in
+                                                                      // <cuda/__device/physical_device.h> to avoid
+                                                                      // circular dependency
   //! @brief Queries if it's possible for this device to directly access specified device's memory.
   //!
@@ -142,7 +127,7 @@ public:
   //!
   //! @param __other_dev Device to query the peer access
   //! @return true if it's possible for this device to access the specified device's memory
-  [[nodiscard]] bool has_peer_access_to(device_ref __other_dev) const
+  [[nodiscard]] _CCCL_HOST_API bool has_peer_access_to(device_ref __other_dev) const
   {
     return ::cuda::__driver::__deviceCanAccessPeer(
       ::cuda::__driver::__deviceGet(get()), ::cuda::__driver::__deviceGet(__other_dev.get()));
@@ -154,19 +139,22 @@ public:
   //! that are shared by all devices belonging to given architecture.
   //!
   //! @return A reference to the `arch::traits_t` object containing architecture traits of this device
-  const arch::traits_t& arch_traits() const;
+  [[nodiscard]] _CCCL_HOST_API const arch::traits_t& arch_traits() const; // implemented in
+                                                                          // <cuda/__device/physical_device.h> to avoid
+                                                                          // circular dependency
   // TODO this might return some more complex type in the future
   // TODO we might want to include the calling device, depends on what we decide
   // peer access APIs
-  //! @brief Retrieve a vector of `device_ref`s that are peers of this device
+  //! @brief Retrieve `device_ref`s that are peers of this device
   //!
-  //! The device on which this API is called is not included in the vector,
-  //! if a full group of peer devices is needed, it needs to be pushed_back separately.
+  //! The device on which this API is called is not included in the returned span.
   //!
   //! @throws cuda_error if any peer access query fails
-  ::std::vector<device_ref> peer_devices() const;
+  [[nodiscard]] _CCCL_HOST_API ::cuda::std::span<const device_ref> peers() const; // implemented in
+                                                                                  // <cuda/__device/physical_device.h>
+                                                                                  // to avoid circular dependency
 };
 _CCCL_END_NAMESPACE_CUDA

cuda/cccl/headers/include/cuda/__device/physical_device.h CHANGED Viewed

@@ -11,7 +11,7 @@
 #ifndef _CUDA___DEVICE_PHYSICAL_DEVICE_H
 #define _CUDA___DEVICE_PHYSICAL_DEVICE_H
-#include <cuda/__cccl_config>
+#include <cuda/std/detail/__config>
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
@@ -24,58 +24,71 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 #  include <cuda/__device/arch_traits.h>
-#  include <cuda/__device/attributes.h>
 #  include <cuda/__device/device_ref.h>
 #  include <cuda/__driver/driver_api.h>
+#  include <cuda/__fwd/devices.h>
+#  include <cuda/std/__cstddef/types.h>
+#  include <cuda/std/span>
+#  include <cuda/std/string_view>
 #  include <cassert>
+#  include <memory>
 #  include <mutex>
+#  include <vector>
 #  include <cuda/std/__cccl/prologue.h>
 _CCCL_BEGIN_NAMESPACE_CUDA
-namespace __detail
-{
-//! @brief A proxy object used to in-place construct a `device` object from an
-//! integer ID. Used in __detail/all_devices.cuh.
-struct __emplace_device
-{
-  int __id_;
-  [[nodiscard]] operator physical_device() const;
-  [[nodiscard]] constexpr const __emplace_device* operator->() const;
-};
-} // namespace __detail
-//! @brief For a given attribute, type of the attribute value.
-//!
-//! @par Example
-//! @code
-//! using threads_per_block_t = device::attr_result_t<device_attributes::max_threads_per_block>;
-//! static_assert(std::is_same_v<threads_per_block_t, int>);
-//! @endcode
-//!
-//! @sa device_attributes
-template <::cudaDeviceAttr _Attr>
-using device_attribute_result_t = typename __detail::__dev_attr<_Attr>::type;
+[[nodiscard]] inline ::cuda::std::span<__physical_device> __physical_devices();
 // This is the element type of the global `devices` array. In the future, we
 // can cache device properties here.
 //
 //! @brief An immovable "owning" representation of a CUDA device.
-class physical_device : public device_ref
+class __physical_device
 {
+  friend _CCCL_HOST_API inline ::std::unique_ptr<__physical_device[]>
+  __make_physical_devices(::cuda::std::size_t __device_count);
+  ::CUdevice __device_{};
+  // TODO We should have some of the attributes just return from the arch traits
+  ::std::once_flag __traits_once_flag_{};
+  arch::traits_t __traits_{};
+  ::std::once_flag __primary_ctx_once_flag_{};
+  ::CUcontext __primary_ctx_{};
+  static constexpr ::cuda::std::size_t __max_name_length{256};
+  ::std::once_flag __name_once_flag_{};
+  char __name_[__max_name_length]{};
+  ::cuda::std::size_t __name_length_{};
+  ::std::once_flag __peers_once_flag_{};
+  ::std::vector<device_ref> __peers_{};
 public:
-#  ifndef _CCCL_DOXYGEN_INVOKED // Do not document
-#    if _CCCL_COMPILER(MSVC)
-  // When __EDG__ is defined, std::construct_at will not permit constructing
-  // a device object from an __emplace_device object. This is a workaround.
-  physical_device(__detail::__emplace_device __ed)
-      : physical_device(__ed.__id_)
-  {}
-#    endif // _CCCL_COMPILER(MSVC)
-#  endif // _CCCL_COMPILER(MSVC)
+  _CCCL_HIDE_FROM_ABI __physical_device() = default;
+  _CCCL_HOST_API ~__physical_device()
+  {
+    if (__primary_ctx_ != nullptr)
+    {
+      [[maybe_unused]] const auto __ignore = ::cuda::__driver::__primaryCtxReleaseNoThrow(__device_);
+    }
+  }
+  //! @brief Retrieve the primary context for this device.
+  //!
+  //! @return The primary context handle (`CUcontext`) for this device.
+  [[nodiscard]] _CCCL_HOST_API ::CUcontext __primary_context()
+  {
+    ::std::call_once(__primary_ctx_once_flag_, [this]() {
+      __primary_ctx_ = ::cuda::__driver::__primaryCtxRetain(__device_);
+    });
+    return __primary_ctx_;
+  }
   //! @brief Retrieve architecture traits of this device.
   //!
@@ -83,81 +96,97 @@ public:
   //! that are shared by all devices belonging to given architecture.
   //!
   //! @return A reference to the `arch::traits_t` object containing architecture traits of this device
-  const arch::traits_t& arch_traits() const noexcept
+  [[nodiscard]] _CCCL_HOST_API const arch::traits_t& __arch_traits()
   {
-    return __traits;
+    ::std::call_once(__traits_once_flag_, [this]() {
+      const auto __id = ::cuda::__driver::__cudevice_to_ordinal(__device_);
+      __traits_       = ::cuda::arch::__arch_traits_might_be_unknown(__id, device_attributes::compute_capability(__id));
+    });
+    return __traits_;
   }
-  //! @brief Retrieve the primary context for this device.
-  //!
-  //! @return A reference to the primary context for this device.
-  ::CUcontext primary_context() const
+  [[nodiscard]] _CCCL_HOST_API ::cuda::std::string_view __name()
   {
-    ::std::call_once(__init_once, [this]() {
-      __device      = ::cuda::__driver::__deviceGet(__id_);
-      __primary_ctx = ::cuda::__driver::__primaryCtxRetain(__device);
+    ::std::call_once(__name_once_flag_, [this]() {
+      const auto __id = ::cuda::__driver::__cudevice_to_ordinal(__device_);
+      ::cuda::__driver::__deviceGetName(__name_, __max_name_length, __id);
+      __name_length_ = ::cuda::std::char_traits<char>::length(__name_);
     });
-    _CCCL_ASSERT(__primary_ctx != nullptr, "cuda::primary_context failed to get context");
-    return __primary_ctx;
+    return ::cuda::std::string_view{__name_, __name_length_};
   }
-  ~physical_device()
+  [[nodiscard]] _CCCL_HOST_API ::cuda::std::span<const device_ref> __peers()
   {
-    if (__primary_ctx)
-    {
-      ::cuda::__driver::__primaryCtxRelease(__device);
-    }
+    ::std::call_once(__peers_once_flag_, [this]() {
+      const auto __count = static_cast<int>(::cuda::__physical_devices().size());
+      const auto __id    = ::cuda::__driver::__cudevice_to_ordinal(__device_);
+      __peers_.reserve(__count);
+      for (int __other_id = 0; __other_id < __count; ++__other_id)
+      {
+        // Exclude the device this API is called on. The main use case for this API
+        // is enable/disable peer access. While enable peer access can be called on
+        // device on which memory resides, disable peer access will error-out.
+        // Usage of the peer access control is smoother when *this is excluded,
+        // while the caller can easily add it to its own copy of the list if a full
+        // group of peers is needed (for cases other than peer access control)
+        if (__other_id != __id)
+        {
+          device_ref __dev{__id};
+          device_ref __other_dev{__other_id};
+          // While in almost all practical applications peer access should be symmetrical,
+          // it is possible to build a system with one directional peer access, check
+          // both ways here just to be safe
+          if (__dev.has_peer_access_to(__other_dev) && __other_dev.has_peer_access_to(__dev))
+          {
+            __peers_.push_back(__other_dev);
+          }
+        }
+      }
+    });
+    return ::cuda::std::span<const device_ref>{__peers_};
   }
+};
-private:
-  // TODO: put a mutable thread-safe (or thread_local) cache of device
-  // properties here.
-  friend class device_ref;
-  friend struct __detail::__emplace_device;
-  mutable ::CUcontext __primary_ctx = nullptr;
-  mutable ::CUdevice __device{};
-  mutable ::std::once_flag __init_once;
-  // TODO should this be a reference/pointer to the constexpr traits instances?
-  //  Do we care about lazy init?
-  //  We should have some of the attributes just return from the arch traits
-  arch::traits_t __traits;
-  explicit physical_device(int __id)
-      : device_ref(__id)
-      , __traits(arch::__arch_traits_might_be_unknown(__id, device_attributes::compute_capability(__id)))
-  {}
+[[nodiscard]] _CCCL_HOST_API inline ::std::unique_ptr<__physical_device[]>
+__make_physical_devices(::cuda::std::size_t __device_count)
+{
+  ::std::unique_ptr<__physical_device[]> __devices{::new __physical_device[__device_count]};
+  for (::cuda::std::size_t __i = 0; __i < __device_count; ++__i)
+  {
+    __devices[__i].__device_ = static_cast<int>(__i);
+  }
+  return __devices;
+}
-  // `device` objects are not movable or copyable.
-  physical_device(physical_device&&)                 = delete;
-  physical_device(const physical_device&)            = delete;
-  physical_device& operator=(physical_device&&)      = delete;
-  physical_device& operator=(const physical_device&) = delete;
+[[nodiscard]] inline ::cuda::std::span<__physical_device> __physical_devices()
+{
+  static const auto __device_count = static_cast<::cuda::std::size_t>(::cuda::__driver::__deviceGetCount());
+  static const auto __devices      = ::cuda::__make_physical_devices(__device_count);
+  return ::cuda::std::span<__physical_device>{__devices.get(), __device_count};
+}
-  friend bool operator==(const physical_device& __lhs, int __rhs) = delete;
-  friend bool operator==(int __lhs, const physical_device& __rhs) = delete;
+// device_ref methods dependent on __physical_device
-#  if _CCCL_STD_VER <= 2017
-  friend bool operator!=(const physical_device& __lhs, int __rhs) = delete;
-  friend bool operator!=(int __lhs, const physical_device& __rhs) = delete;
-#  endif // _CCCL_STD_VER <= 2017
-};
+_CCCL_HOST_API inline void device_ref::init() const
+{
+  (void) ::cuda::__physical_devices()[__id_].__primary_context();
+}
-namespace __detail
+[[nodiscard]] _CCCL_HOST_API inline ::cuda::std::string_view device_ref::name() const
 {
-[[nodiscard]] inline __emplace_device::operator physical_device() const
+  return ::cuda::__physical_devices()[__id_].__name();
+}
+[[nodiscard]] _CCCL_HOST_API inline const arch::traits_t& device_ref::arch_traits() const
 {
-  return physical_device(__id_);
+  return ::cuda::__physical_devices()[__id_].__arch_traits();
 }
-[[nodiscard]] inline constexpr const __emplace_device* __emplace_device::operator->() const
+[[nodiscard]] _CCCL_HOST_API inline ::cuda::std::span<const device_ref> device_ref::peers() const
 {
-  return this;
+  return ::cuda::__physical_devices()[__id_].__peers();
 }
-} // namespace __detail
 _CCCL_END_NAMESPACE_CUDA

cuda/cccl/headers/include/cuda/__driver/driver_api.h CHANGED Viewed

@@ -216,11 +216,10 @@ _CCCL_HOST_API inline void __deviceGetName(char* __name_out, int __len, int __or
   return __result;
 }
-_CCCL_HOST_API inline void __primaryCtxRelease(::CUdevice __dev)
+[[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __primaryCtxReleaseNoThrow(::CUdevice __dev)
 {
   static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDevicePrimaryCtxRelease);
-  // TODO we might need to ignore failure here
-  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to release context for a device", __dev);
+  return static_cast<::cudaError_t>(__driver_fn(__dev));
 }
 [[nodiscard]] _CCCL_HOST_API inline bool __isPrimaryCtxActive(::CUdevice __dev)
@@ -325,6 +324,109 @@ _CCCL_HOST_API void __memsetAsync(void* __dst, _Tp __value, size_t __count, ::CU
   }
 }
+_CCCL_HOST_API inline ::cudaError_t __mempoolCreateNoThrow(::CUmemoryPool* __pool, ::CUmemPoolProps* __props)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolCreate);
+  return static_cast<::cudaError_t>(__driver_fn(__pool, __props));
+}
+_CCCL_HOST_API inline void __mempoolSetAttribute(::CUmemoryPool __pool, ::CUmemPool_attribute __attr, void* __value)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolSetAttribute);
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to set attribute for a memory pool", __pool, __attr, __value);
+}
+_CCCL_HOST_API inline size_t __mempoolGetAttribute(::CUmemoryPool __pool, ::CUmemPool_attribute __attr)
+{
+  size_t __value          = 0;
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolGetAttribute);
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get attribute for a memory pool", __pool, __attr, &__value);
+  return __value;
+}
+_CCCL_HOST_API inline void __mempoolDestroy(::CUmemoryPool __pool)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolDestroy);
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to destroy a memory pool", __pool);
+}
+_CCCL_HOST_API inline ::CUdeviceptr
+__mallocFromPoolAsync(::cuda::std::size_t __bytes, ::CUmemoryPool __pool, ::CUstream __stream)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemAllocFromPoolAsync);
+  ::CUdeviceptr __result  = 0;
+  ::cuda::__driver::__call_driver_fn(
+    __driver_fn, "Failed to allocate memory from a memory pool", &__result, __bytes, __pool, __stream);
+  return __result;
+}
+_CCCL_HOST_API inline void __mempoolTrimTo(::CUmemoryPool __pool, ::cuda::std::size_t __min_bytes_to_keep)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolTrimTo);
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to trim a memory pool", __pool, __min_bytes_to_keep);
+}
+_CCCL_HOST_API inline ::cudaError_t __freeAsyncNoThrow(::CUdeviceptr __dptr, ::CUstream __stream)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemFreeAsync);
+  return static_cast<::cudaError_t>(__driver_fn(__dptr, __stream));
+}
+_CCCL_HOST_API inline void __mempoolSetAccess(::CUmemoryPool __pool, ::CUmemAccessDesc* __descs, ::size_t __count)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolSetAccess);
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to set access of a memory pool", __pool, __descs, __count);
+}
+_CCCL_HOST_API inline ::CUmemAccess_flags __mempoolGetAccess(::CUmemoryPool __pool, ::CUmemLocation* __location)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolGetAccess);
+  ::CUmemAccess_flags __flags;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get access of a memory pool", &__flags, __pool, __location);
+  return __flags;
+}
+#  if _CCCL_CTK_AT_LEAST(13, 0)
+_CCCL_HOST_API inline ::CUmemoryPool
+__getDefaultMemPool(CUmemLocation __location, CUmemAllocationType_enum __allocation_type)
+{
+  static auto __driver_fn =
+    _CCCLRT_GET_DRIVER_FUNCTION_VERSIONED(cuMemGetDefaultMemPool, cuMemGetDefaultMemPool, 13, 0);
+  ::CUmemoryPool __result = nullptr;
+  ::cuda::__driver::__call_driver_fn(
+    __driver_fn, "Failed to get default memory pool", &__result, &__location, __allocation_type);
+  return __result;
+}
+#  endif // _CCCL_CTK_AT_LEAST(13, 0)
+_CCCL_HOST_API inline ::CUdeviceptr __mallocManaged(::cuda::std::size_t __bytes, unsigned int __flags)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemAllocManaged);
+  ::CUdeviceptr __result  = 0;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to allocate managed memory", &__result, __bytes, __flags);
+  return __result;
+}
+_CCCL_HOST_API inline void* __mallocHost(::cuda::std::size_t __bytes)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemAllocHost);
+  void* __result          = nullptr;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to allocate host memory", &__result, __bytes);
+  return __result;
+}
+_CCCL_HOST_API inline ::cudaError_t __freeNoThrow(::CUdeviceptr __dptr)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemFree);
+  return static_cast<::cudaError_t>(__driver_fn(__dptr));
+}
+_CCCL_HOST_API inline ::cudaError_t __freeHostNoThrow(void* __dptr)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemFreeHost);
+  return static_cast<::cudaError_t>(__driver_fn(__dptr));
+}
 // Unified Addressing
 // TODO: we don't want to have these functions here, refactoring expected

cuda/cccl/headers/include/cuda/__event/event.h CHANGED Viewed

@@ -23,6 +23,7 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+#  include <cuda/__device/device_ref.h>
 #  include <cuda/__driver/driver_api.h>
 #  include <cuda/__event/event_ref.h>
 #  include <cuda/__runtime/ensure_current_context.h>

cuda/cccl/headers/include/cuda/__event/timed_event.h CHANGED Viewed

@@ -26,6 +26,7 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+#  include <cuda/__device/device_ref.h>
 #  include <cuda/__driver/driver_api.h>
 #  include <cuda/__event/event.h>
 #  include <cuda/__utility/no_init.h>

cuda/cccl/headers/include/cuda/__fwd/devices.h ADDED Viewed

@@ -0,0 +1,44 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+#ifndef _CUDA___FWD_DEVICES_H
+#define _CUDA___FWD_DEVICES_H
+#include <cuda/std/detail/__config>
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+#include <cuda/std/__fwd/span.h>
+#include <cuda/std/__cccl/prologue.h>
+_CCCL_BEGIN_NAMESPACE_CUDA
+class __physical_device;
+class device_ref;
+template <::cudaDeviceAttr _Attr>
+struct __dev_attr;
+namespace arch
+{
+struct traits_t;
+} // namespace arch
+_CCCL_END_NAMESPACE_CUDA
+#include <cuda/std/__cccl/epilogue.h>
+#endif // _CUDA___FWD_DEVICES_H

cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h CHANGED Viewed

@@ -42,6 +42,15 @@ inline constexpr bool __is_zip_function = false;
 template <class _Fn>
 inline constexpr bool __is_zip_function<zip_function<_Fn>> = true;
+template <class _Fn, class... _Iterators>
+class zip_transform_iterator;
+template <class>
+inline constexpr bool __is_zip_transform_iterator = false;
+template <class _Fn, class... _Iterators>
+inline constexpr bool __is_zip_transform_iterator<zip_transform_iterator<_Fn, _Iterators...>> = true;
 _CCCL_END_NAMESPACE_CUDA
 #include <cuda/std/__cccl/epilogue.h>