PyPI - cuda-cccl - Versions diffs - 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (185) hide show

cuda/cccl/headers/include/cuda/__device/attributes.h CHANGED Viewed

@@ -23,6 +23,7 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+#  include <cuda/__device/compute_capability.h>
 #  include <cuda/__device/device_ref.h>
 #  include <cuda/__driver/driver_api.h>
 #  include <cuda/__fwd/devices.h>
@@ -739,12 +740,12 @@ static constexpr numa_id_t numa_id{};
 // capability in a single query
 struct compute_capability_t
 {
-  using type = int; // 0.3.1: the query result was a plain int
+  using type = ::cuda::compute_capability; // 0.3.2: the query result is a ::cuda::compute_capability object
   [[nodiscard]] _CCCL_HOST_API type operator()(device_ref __dev_id) const // queries the major and minor attributes of __dev_id
   {
-    return 10 * ::cuda::device_attributes::compute_capability_major(__dev_id)
-         + ::cuda::device_attributes::compute_capability_minor(__dev_id); // 0.3.1: combined by hand as 10 * major + minor
+    return type{::cuda::device_attributes::compute_capability_major(__dev_id),
+                ::cuda::device_attributes::compute_capability_minor(__dev_id)}; // 0.3.2: major and minor are passed to the two-argument constructor of type
   }
 };
 static constexpr compute_capability_t compute_capability{}; // callable attribute object, invoked as compute_capability(device)

cuda/cccl/headers/include/cuda/__device/compute_capability.h ADDED Viewed

@@ -0,0 +1,171 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+#ifndef _CUDA___DEVICE_COMPUTE_CAPABILITY_H
+#define _CUDA___DEVICE_COMPUTE_CAPABILITY_H
+#include <cuda/std/detail/__config>
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+#include <cuda/__fwd/devices.h>
+#include <cuda/std/__utility/to_underlying.h>
+#include <cuda/std/__cccl/prologue.h>
+_CCCL_BEGIN_NAMESPACE_CUDA
+//! @brief Type representing the CUDA compute capability, stored as a single \c int encoded as 10 * major + minor.
+class compute_capability
+{
+  int __cc_{}; //!< The stored compute capability in format 10 * major + minor. Value-initialized to 0.
+public:
+  _CCCL_HIDE_FROM_ABI constexpr compute_capability() noexcept = default; //!< Default constructor; the stored value is 0.
+  //! @brief Constructs the object from compute capability \c __cc. The expected format is 10 * major + minor.
+  //! The value is stored as given; this constructor performs no validation.
+  //! @param __cc Compute capability.
+  _CCCL_API explicit constexpr compute_capability(int __cc) noexcept
+      : __cc_{__cc}
+  {}
+  //! @brief Constructs the object by combining the \c __major and \c __minor compute capability.
+  //! The stored value is 10 * \c __major + \c __minor.
+  //! @param __major The major compute capability.
+  //! @param __minor The minor compute capability. Must be less than 10 (checked with _CCCL_ASSERT; no lower bound is checked).
+  _CCCL_API constexpr compute_capability(int __major, int __minor) noexcept
+      : __cc_{10 * __major + __minor}
+  {
+    _CCCL_ASSERT(__minor < 10, "invalid minor compute capability");
+  }
+  //! @brief Constructs the object from the architecture id.
+  //! Underlying values greater than \c __arch_specific_id_multiplier are divided by it; any other value is stored as is.
+  //! @param __arch_id The architecture id.
+  _CCCL_API explicit constexpr compute_capability(arch_id __arch_id) noexcept
+  {
+    const auto __val = ::cuda::std::to_underlying(__arch_id);
+    if (__val > __arch_specific_id_multiplier)
+    {
+      __cc_ = __val / __arch_specific_id_multiplier;
+    }
+    else
+    {
+      __cc_ = __val;
+    }
+  }
+  _CCCL_HIDE_FROM_ABI constexpr compute_capability(const compute_capability&) noexcept = default; //!< Defaulted copy constructor.
+  _CCCL_HIDE_FROM_ABI constexpr compute_capability& operator=(const compute_capability& __other) noexcept = default; //!< Defaulted copy assignment.
+  //! @brief Gets the stored compute capability.
+  //!
+  //! @return The stored compute capability in format 10 * major + minor.
+  [[nodiscard]] _CCCL_API constexpr int get() const noexcept
+  {
+    return __cc_;
+  }
+  //! @brief Gets the major compute capability.
+  //! Computed as the stored value divided by 10 (integer division).
+  //! @return Major compute capability.
+  [[nodiscard]] _CCCL_API constexpr int major() const noexcept
+  {
+    return __cc_ / 10;
+  }
+  //! @brief Gets the minor compute capability.
+  //! Computed as the stored value modulo 10.
+  //! @return Minor compute capability. The value is always less than 10.
+  [[nodiscard]] _CCCL_API constexpr int minor() const noexcept
+  {
+    return __cc_ % 10;
+  }
+  //! @brief Conversion operator to \c int.
+  //! The conversion is explicit; it returns the same value as get().
+  //! @return The stored compute capability in format 10 * major + minor.
+  _CCCL_API explicit constexpr operator int() const noexcept
+  {
+    return __cc_;
+  }
+  //! @brief Equality operator. Compares the stored 10 * major + minor values.
+  [[nodiscard]] friend _CCCL_API constexpr bool operator==(compute_capability __lhs, compute_capability __rhs) noexcept
+  {
+    return __lhs.__cc_ == __rhs.__cc_;
+  }
+  //! @brief Inequality operator. Compares the stored 10 * major + minor values.
+  [[nodiscard]] friend _CCCL_API constexpr bool operator!=(compute_capability __lhs, compute_capability __rhs) noexcept
+  {
+    return __lhs.__cc_ != __rhs.__cc_;
+  }
+  //! @brief Less than operator. Orders by the stored 10 * major + minor value.
+  [[nodiscard]] friend _CCCL_API constexpr bool operator<(compute_capability __lhs, compute_capability __rhs) noexcept
+  {
+    return __lhs.__cc_ < __rhs.__cc_;
+  }
+  //! @brief Less than or equal to operator. Orders by the stored 10 * major + minor value.
+  [[nodiscard]] friend _CCCL_API constexpr bool operator<=(compute_capability __lhs, compute_capability __rhs) noexcept
+  {
+    return __lhs.__cc_ <= __rhs.__cc_;
+  }
+  //! @brief Greater than operator. Orders by the stored 10 * major + minor value.
+  [[nodiscard]] friend _CCCL_API constexpr bool operator>(compute_capability __lhs, compute_capability __rhs) noexcept
+  {
+    return __lhs.__cc_ > __rhs.__cc_;
+  }
+  //! @brief Greater than or equal to operator. Orders by the stored 10 * major + minor value.
+  [[nodiscard]] friend _CCCL_API constexpr bool operator>=(compute_capability __lhs, compute_capability __rhs) noexcept
+  {
+    return __lhs.__cc_ >= __rhs.__cc_;
+  }
+};
+_CCCL_END_NAMESPACE_CUDA
+#if _CCCL_CUDA_COMPILATION()
+_CCCL_BEGIN_NAMESPACE_CUDA_DEVICE
+//! @brief Returns the \c cuda::compute_capability that is currently being compiled.
+//! When neither the NVHPC branch nor the device-compilation branch is taken, a default-constructed value is returned.
+//! @note This API cannot be used in constexpr context when compiling with nvc++ in CUDA mode.
+[[nodiscard]] _CCCL_DEVICE_API _CCCL_TARGET_CONSTEXPR ::cuda::compute_capability current_compute_capability() noexcept
+{
+#  if _CCCL_CUDA_COMPILER(NVHPC)
+  return ::cuda::compute_capability{__builtin_current_device_sm()}; // NVHPC: the builtin's value is passed to the int constructor unchanged
+#  elif _CCCL_DEVICE_COMPILATION()
+  return ::cuda::compute_capability{__CUDA_ARCH__ / 10}; // __CUDA_ARCH__ is divided by 10 to give the 10 * major + minor format the int constructor expects
+#  else // ^^^ _CCCL_DEVICE_COMPILATION() ^^^ / vvv !_CCCL_DEVICE_COMPILATION() vvv
+  return {}; // default-constructed compute_capability
+#  endif // ^^^ !_CCCL_DEVICE_COMPILATION() ^^^
+}
+_CCCL_END_NAMESPACE_CUDA_DEVICE
+#endif // _CCCL_CUDA_COMPILATION()
+#include <cuda/std/__cccl/epilogue.h>
+#endif // _CUDA___DEVICE_COMPUTE_CAPABILITY_H

cuda/cccl/headers/include/cuda/__device/device_ref.h CHANGED Viewed

@@ -133,16 +133,6 @@ public:
       ::cuda::__driver::__deviceGet(get()), ::cuda::__driver::__deviceGet(__other_dev.get()));
   }
-  //! @brief Retrieve architecture traits of this device.
-  //!
-  //! Architecture traits object contains information about certain traits
-  //! that are shared by all devices belonging to given architecture.
-  //!
-  //! @return A reference to `arch_traits_t` object containing architecture traits of this device
-  [[nodiscard]] _CCCL_HOST_API const arch::traits_t& arch_traits() const; // implemented in
-                                                                          // <cuda/__device/physical_device.h> to avoid
-                                                                          // circular dependency
   // TODO this might return some more complex type in the future
   // TODO we might want to include the calling device, depends on what we decide
   // peer access APIs

cuda/cccl/headers/include/cuda/__device/physical_device.h CHANGED Viewed

@@ -23,16 +23,15 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
-#  include <cuda/__device/arch_traits.h>
 #  include <cuda/__device/device_ref.h>
 #  include <cuda/__driver/driver_api.h>
 #  include <cuda/__fwd/devices.h>
+#  include <cuda/std/__cccl/memory_wrapper.h>
 #  include <cuda/std/__cstddef/types.h>
 #  include <cuda/std/span>
 #  include <cuda/std/string_view>
 #  include <cassert>
-#  include <memory>
 #  include <mutex>
 #  include <vector>
@@ -53,10 +52,6 @@ class __physical_device
   ::CUdevice __device_{};
-  // TODO We should have some of the attributes just return from the arch traits
-  ::std::once_flag __traits_once_flag_{};
-  arch::traits_t __traits_{};
   ::std::once_flag __primary_ctx_once_flag_{};
   ::CUcontext __primary_ctx_{};
@@ -90,21 +85,6 @@ public:
     return __primary_ctx_;
   }
-  //! @brief Retrieve architecture traits of this device.
-  //!
-  //! Architecture traits object contains information about certain traits
-  //! that are shared by all devices belonging to given architecture.
-  //!
-  //! @return A reference to `arch_traits_t` object containing architecture traits of this device
-  [[nodiscard]] _CCCL_HOST_API const arch::traits_t& __arch_traits()
-  {
-    ::std::call_once(__traits_once_flag_, [this]() {
-      const auto __id = ::cuda::__driver::__cudevice_to_ordinal(__device_);
-      __traits_       = ::cuda::arch::__arch_traits_might_be_unknown(__id, device_attributes::compute_capability(__id));
-    });
-    return __traits_;
-  }
   [[nodiscard]] _CCCL_HOST_API ::cuda::std::string_view __name()
   {
     ::std::call_once(__name_once_flag_, [this]() {
@@ -178,11 +158,6 @@ _CCCL_HOST_API inline void device_ref::init() const
   return ::cuda::__physical_devices()[__id_].__name();
 }
-[[nodiscard]] _CCCL_HOST_API inline const arch::traits_t& device_ref::arch_traits() const
-{
-  return ::cuda::__physical_devices()[__id_].__arch_traits();
-}
 [[nodiscard]] _CCCL_HOST_API inline ::cuda::std::span<const device_ref> device_ref::peers() const
 {
   return ::cuda::__physical_devices()[__id_].__peers();

cuda/cccl/headers/include/cuda/__event/event.h CHANGED Viewed

@@ -28,8 +28,8 @@
 #  include <cuda/__event/event_ref.h>
 #  include <cuda/__runtime/ensure_current_context.h>
 #  include <cuda/__utility/no_init.h>
+#  include <cuda/std/__utility/to_underlying.h>
 #  include <cuda/std/cstddef>
-#  include <cuda/std/utility>
 #  include <cuda/std/__cccl/prologue.h>
@@ -37,38 +37,43 @@ _CCCL_BEGIN_NAMESPACE_CUDA
 class timed_event;
+//! @brief Flags to use when creating the event. Each enumerator carries the value of the matching CUDA runtime constant.
+enum class event_flags : unsigned
+{
+  none          = cudaEventDefault, //!< Value of cudaEventDefault.
+  blocking_sync = cudaEventBlockingSync, //!< Value of cudaEventBlockingSync.
+  interprocess  = cudaEventInterprocess, //!< Value of cudaEventInterprocess.
+};
+[[nodiscard]] _CCCL_HOST_API constexpr event_flags operator|(event_flags __lhs, event_flags __rhs) noexcept // combines two event_flags values into one
+{
+  return static_cast<event_flags>(::cuda::std::to_underlying(__lhs) | ::cuda::std::to_underlying(__rhs)); // bitwise OR of the underlying values, cast back to event_flags
+}
 //! @brief An owning wrapper for an untimed `cudaEvent_t`.
 class event : public event_ref
 {
   friend class timed_event;
 public:
-  //! @brief Flags to use when creating the event.
-  enum class flags : unsigned
-  {
-    none          = cudaEventDefault,
-    blocking_sync = cudaEventBlockingSync,
-    interprocess  = cudaEventInterprocess,
-  };
   //! @brief Construct a new `event` object with timing disabled, and record
   //!        the event in the specified stream.
   //!
   //! @throws cuda_error if the event creation fails.
-  explicit event(stream_ref __stream, flags __flags = flags::none);
+  _CCCL_HOST_API explicit event(stream_ref __stream, event_flags __flags = event_flags::none);
   //! @brief Construct a new `event` object with timing disabled. The event can only be recorded on streams from the
   //! specified device.
   //!
   //! @throws cuda_error if the event creation fails.
-  explicit event(device_ref __device, flags __flags = flags::none)
-      : event(__device, static_cast<unsigned int>(__flags) | cudaEventDisableTiming)
+  _CCCL_HOST_API explicit event(device_ref __device, event_flags __flags = event_flags::none)
+      : event(__device, ::cuda::std::to_underlying(__flags) | cudaEventDisableTiming)
   {}
   //! @brief Construct a new `event` object into the moved-from state.
   //!
   //! @post `get()` returns `cudaEvent_t()`.
-  explicit constexpr event(no_init_t) noexcept
+  _CCCL_HOST_API explicit constexpr event(no_init_t) noexcept
       : event_ref(::cudaEvent_t{})
   {}
@@ -77,7 +82,7 @@ public:
   //! @param __other
   //!
   //! @post `__other` is in a moved-from state.
-  constexpr event(event&& __other) noexcept
+  _CCCL_HOST_API constexpr event(event&& __other) noexcept
       : event_ref(::cuda::std::exchange(__other.__event_, {}))
   {}
@@ -87,7 +92,7 @@ public:
   //! @brief Destroy the `event` object
   //!
   //! @note If the event fails to be destroyed, the error is silently ignored.
-  ~event()
+  _CCCL_HOST_API ~event()
   {
     if (__event_ != nullptr)
     {
@@ -102,7 +107,7 @@ public:
   //! @param __other
   //!
   //! @post `__other` is in a moved-from state.
-  event& operator=(event&& __other) noexcept
+  _CCCL_HOST_API event& operator=(event&& __other) noexcept
   {
     event __tmp(::cuda::std::move(__other));
     ::cuda::std::swap(__event_, __tmp.__event_);
@@ -119,7 +124,7 @@ public:
   //! @return event The constructed `event` object
   //!
   //! @note The constructed `event` object takes ownership of the native handle.
-  [[nodiscard]] static event from_native_handle(::cudaEvent_t __evnt) noexcept
+  [[nodiscard]] static _CCCL_HOST_API event from_native_handle(::cudaEvent_t __evnt) noexcept
   {
     return event(__evnt);
   }
@@ -135,26 +140,21 @@ public:
   //! @return cudaEvent_t The native handle being held by the `event` object.
   //!
   //! @post The event object is in a moved-from state.
-  [[nodiscard]] constexpr ::cudaEvent_t release() noexcept
+  [[nodiscard]] _CCCL_HOST_API constexpr ::cudaEvent_t release() noexcept
   {
     return ::cuda::std::exchange(__event_, {});
   }
-  [[nodiscard]] friend constexpr flags operator|(flags __lhs, flags __rhs) noexcept
-  {
-    return static_cast<flags>(static_cast<unsigned>(__lhs) | static_cast<unsigned>(__rhs));
-  }
 private:
   // Use `event::from_native_handle(e)` to construct an owning `event`
   // object from a `cudaEvent_t` handle.
-  explicit constexpr event(::cudaEvent_t __evnt) noexcept
+  _CCCL_HOST_API explicit constexpr event(::cudaEvent_t __evnt) noexcept
       : event_ref(__evnt)
   {}
-  explicit event(stream_ref __stream, unsigned __flags);
+  _CCCL_HOST_API explicit event(stream_ref __stream, unsigned __flags);
-  explicit event(device_ref __device, unsigned __flags)
+  _CCCL_HOST_API explicit event(device_ref __device, unsigned __flags)
       : event_ref(::cudaEvent_t{})
   {
     [[maybe_unused]] __ensure_current_context __ctx_setter(__device);

cuda/cccl/headers/include/cuda/__event/event_ref.h CHANGED Viewed

@@ -56,7 +56,7 @@ public:
   //!
   //! @note: It is the callers responsibility to ensure the `event_ref` does not
   //! outlive the event denoted by the `cudaEvent_t` handle.
-  constexpr event_ref(::cudaEvent_t __evnt) noexcept
+  _CCCL_HOST_API constexpr event_ref(::cudaEvent_t __evnt) noexcept
       : __event_(__evnt)
   {}
@@ -108,7 +108,7 @@ public:
   //! @brief Retrieve the native `cudaEvent_t` handle.
   //!
   //! @return cudaEvent_t The native handle being held by the event_ref object.
-  [[nodiscard]] constexpr ::cudaEvent_t get() const noexcept
+  [[nodiscard]] _CCCL_HOST_API constexpr ::cudaEvent_t get() const noexcept
   {
     return __event_;
   }
@@ -116,7 +116,7 @@ public:
   //! @brief Checks if the `event_ref` is valid
   //!
   //! @return true if the `event_ref` is valid, false otherwise.
-  [[nodiscard]] explicit constexpr operator bool() const noexcept
+  [[nodiscard]] _CCCL_HOST_API explicit constexpr operator bool() const noexcept
   {
     return __event_ != nullptr;
   }
@@ -129,7 +129,7 @@ public:
   //! @param __lhs The first `event_ref` to compare
   //! @param __rhs The second `event_ref` to compare
   //! @return true if `lhs` and `rhs` refer to the same `cudaEvent_t` object.
-  [[nodiscard]] friend constexpr bool operator==(event_ref __lhs, event_ref __rhs) noexcept
+  [[nodiscard]] friend _CCCL_HOST_API constexpr bool operator==(event_ref __lhs, event_ref __rhs) noexcept
   {
     return __lhs.__event_ == __rhs.__event_;
   }
@@ -142,7 +142,7 @@ public:
   //! @param __lhs The first `event_ref` to compare
   //! @param __rhs The second `event_ref` to compare
   //! @return true if `lhs` and `rhs` refer to different `cudaEvent_t` objects.
-  [[nodiscard]] friend constexpr bool operator!=(event_ref __lhs, event_ref __rhs) noexcept
+  [[nodiscard]] friend _CCCL_HOST_API constexpr bool operator!=(event_ref __lhs, event_ref __rhs) noexcept
   {
     return __lhs.__event_ != __rhs.__event_;
   }

cuda/cccl/headers/include/cuda/__event/timed_event.h CHANGED Viewed

@@ -31,6 +31,7 @@
 #  include <cuda/__event/event.h>
 #  include <cuda/__utility/no_init.h>
 #  include <cuda/std/__chrono/duration.h>
+#  include <cuda/std/__utility/to_underlying.h>
 #  include <cuda/std/cstddef>
 #  include <cuda/std/__cccl/prologue.h>
@@ -45,20 +46,20 @@ public:
   //!        and record the event on the specified stream.
   //!
   //! @throws cuda_error if the event creation fails.
-  explicit timed_event(stream_ref __stream, flags __flags = flags::none);
+  _CCCL_HOST_API explicit timed_event(stream_ref __stream, event_flags __flags = event_flags::none);
   //! @brief Construct a new `timed_event` object with the specified flags. The event can only be recorded on streams
   //! from the specified device.
   //!
   //! @throws cuda_error if the event creation fails.
-  explicit timed_event(device_ref __device, flags __flags = flags::none)
-      : event(__device, static_cast<unsigned>(__flags))
+  _CCCL_HOST_API explicit timed_event(device_ref __device, event_flags __flags = event_flags::none)
+      : event(__device, ::cuda::std::to_underlying(__flags))
   {}
   //! @brief Construct a new `timed_event` object into the moved-from state.
   //!
   //! @post `get()` returns `cudaEvent_t()`.
-  explicit constexpr timed_event(no_init_t) noexcept
+  _CCCL_HOST_API explicit constexpr timed_event(no_init_t) noexcept
       : event(no_init)
   {}
@@ -74,7 +75,7 @@ public:
   //! @return timed_event The constructed `timed_event` object
   //!
   //! @note The constructed `timed_event` object takes ownership of the native handle.
-  [[nodiscard]] static timed_event from_native_handle(::cudaEvent_t __evnt) noexcept
+  [[nodiscard]] static _CCCL_HOST_API timed_event from_native_handle(::cudaEvent_t __evnt) noexcept
   {
     return timed_event(__evnt);
   }
@@ -95,7 +96,8 @@ public:
   //! @return cuda::std::chrono::nanoseconds The elapsed time in nanoseconds.
   //!
   //! @note The elapsed time has a resolution of approximately 0.5 microseconds.
-  [[nodiscard]] friend ::cuda::std::chrono::nanoseconds operator-(const timed_event& __end, const timed_event& __start)
+  [[nodiscard]] friend _CCCL_HOST_API ::cuda::std::chrono::nanoseconds
+  operator-(const timed_event& __end, const timed_event& __start)
   {
     const auto __ms = ::cuda::__driver::__eventElapsedTime(__start.get(), __end.get());
     return ::cuda::std::chrono::nanoseconds(static_cast<::cuda::std::chrono::nanoseconds::rep>(__ms * 1'000'000.0));
@@ -104,7 +106,7 @@ public:
 private:
   // Use `timed_event::from_native_handle(e)` to construct an owning `timed_event`
   // object from a `cudaEvent_t` handle.
-  explicit constexpr timed_event(::cudaEvent_t __evnt) noexcept
+  _CCCL_HOST_API explicit constexpr timed_event(::cudaEvent_t __evnt) noexcept
       : event(__evnt)
   {}
 };

cuda/cccl/headers/include/cuda/__fwd/devices.h CHANGED Viewed

@@ -31,11 +31,11 @@ class __physical_device;
 class device_ref;
 template <::cudaDeviceAttr _Attr>
 struct __dev_attr;
+struct arch_traits_t;
+class compute_capability;
+enum class arch_id : int;
-namespace arch
-{
-struct traits_t;
-} // namespace arch
+inline constexpr int __arch_specific_id_multiplier = 100000;
 _CCCL_END_NAMESPACE_CUDA