PyPI - cuda-cccl - Versions diffs - 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.2__cp311-cp311-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.2__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (185) hide show

cuda/cccl/headers/include/cub/warp/warp_scan.cuh CHANGED Viewed

@@ -114,6 +114,7 @@ CUB_NAMESPACE_BEGIN
 //!        // Compute warp-wide prefix sums
 //!        int warp_id = threadIdx.x / 32;
 //!        WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+//!    }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps of
@@ -143,6 +144,8 @@ CUB_NAMESPACE_BEGIN
 //!
 //!            // Compute warp-wide prefix sums
 //!            WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+//!        }
+//!    }
 //!
 //! Suppose the set of input ``thread_data`` across the warp of threads is
 //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` will be
@@ -248,6 +251,7 @@ public:
   //!        // Compute inclusive warp-wide prefix sums
   //!        int warp_id = threadIdx.x / 32;
   //!        WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
+  //!    }
   //!
   //! Suppose the set of input ``thread_data`` across the block of threads is
   //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
@@ -294,9 +298,8 @@ public:
   //!        // Compute inclusive warp-wide prefix sums
   //!        int warp_aggregate;
   //!        int warp_id = threadIdx.x / 32;
-  //!        WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data,
-  //!                                                     thread_data,
-  //!                                                     warp_aggregate);
+  //!        WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
+  //!    }
   //!
   //! Suppose the set of input ``thread_data`` across the block of threads is
   //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
@@ -352,6 +355,7 @@ public:
   //!        // Compute exclusive warp-wide prefix sums
   //!        int warp_id = threadIdx.x / 32;
   //!        WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+  //!    }
   //!
   //! Suppose the set of input ``thread_data`` across the block of threads is
   //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps

cuda/cccl/headers/include/cub/warp/warp_store.cuh CHANGED Viewed

@@ -201,6 +201,7 @@ enum WarpStoreAlgorithm
 //!
 //!        // Store items to linear memory
 //!        WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
+//!    }
 //!
 //! Suppose the set of ``thread_data`` across the warp threads is
 //! ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``.

cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h CHANGED Viewed

@@ -26,6 +26,7 @@
 #if _CCCL_CUDA_COMPILATION()
 #  include <cuda/__ptx/instructions/get_sreg.h>
 #  include <cuda/__ptx/instructions/mbarrier_arrive.h>
+#  include <cuda/__ptx/instructions/mbarrier_wait.h>
 #  include <cuda/__ptx/ptx_dot_variants.h>
 #  include <cuda/__ptx/ptx_helper_functions.h>
 #endif // _CCCL_CUDA_COMPILATION()
@@ -381,12 +382,30 @@ private:
 public:
   _CCCL_API inline void wait(arrival_token&& __phase) const
   {
+    // Fast path: no need to back off on a barrier in SMEM on SM90+, the SYNCS unit is taking care of this. When the barrier object is in the shared address space, spin on mbarrier_try_wait until it returns true, then return.
+    NV_IF_TARGET(NV_PROVIDES_SM_90,
+                 (if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared)) {
+                   while (!::cuda::ptx::mbarrier_try_wait(
+                     reinterpret_cast<uint64_t*>(const_cast<__barrier_base*>(&__barrier)), __phase))
+                     ;
+                   return;
+                 }))
+    // Fallback implementation (target does not provide SM90, or the barrier object is not in the shared address space): poll the phase tester with backoff.
     ::cuda::std::__cccl_thread_poll_with_backoff(
       ::cuda::std::__barrier_poll_tester_phase<barrier>(this, ::cuda::std::move(__phase)));
   }
   _CCCL_API inline void wait_parity(bool __phase_parity) const
   {
+    // Fast path: no need to back off on a barrier in SMEM on SM90+, the SYNCS unit is taking care of this. When the barrier object is in the shared address space, spin on mbarrier_try_wait_parity until it returns true, then return.
+    NV_IF_TARGET(NV_PROVIDES_SM_90,
+                 (if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared)) {
+                   while (!::cuda::ptx::mbarrier_try_wait_parity(
+                     reinterpret_cast<uint64_t*>(const_cast<__barrier_base*>(&__barrier)), __phase_parity))
+                     ;
+                   return;
+                 }))
+    // Fallback implementation (target does not provide SM90, or the barrier object is not in the shared address space): poll the parity tester with backoff.
     ::cuda::std::__cccl_thread_poll_with_backoff(
       ::cuda::std::__barrier_poll_tester_parity<barrier>(this, __phase_parity));
   }

cuda/cccl/headers/include/cuda/__cccl_config CHANGED Viewed

@@ -23,6 +23,7 @@
 #include <cuda/std/__cccl/exceptions.h> // IWYU pragma: export
 #include <cuda/std/__cccl/execution_space.h> // IWYU pragma: export
 #include <cuda/std/__cccl/extended_data_types.h> // IWYU pragma: export
+#include <cuda/std/__cccl/host_std_lib.h> // IWYU pragma: export
 #include <cuda/std/__cccl/os.h> // IWYU pragma: export
 #include <cuda/std/__cccl/preprocessor.h> // IWYU pragma: export
 #include <cuda/std/__cccl/ptx_isa.h> // IWYU pragma: export

cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h CHANGED Viewed

@@ -23,6 +23,7 @@
 #include <cuda/__cmath/ceil_div.h>
 #include <cuda/__cmath/ilog.h>
+#include <cuda/__cmath/mul_hi.h>
 #include <cuda/__cmath/pow2.h>
 #include <cuda/std/__type_traits/common_type.h>
 #include <cuda/std/__type_traits/is_integer.h>
@@ -30,7 +31,6 @@
 #include <cuda/std/__type_traits/make_nbit_int.h>
 #include <cuda/std/__type_traits/make_unsigned.h>
 #include <cuda/std/__type_traits/num_bits.h>
-#include <cuda/std/__type_traits/promote.h>
 #include <cuda/std/__utility/pair.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
@@ -39,78 +39,6 @@
 _CCCL_BEGIN_NAMESPACE_CUDA
-/***********************************************************************************************************************
- * Extract higher bits after multiplication
- **********************************************************************************************************************/
-template <typename _Tp, typename _Lhs>
-[[nodiscard]] _CCCL_API constexpr ::cuda::std::common_type_t<_Tp, _Lhs>
-__multiply_extract_higher_bits_fallback(_Tp __x, _Lhs __y)
-{
-  using __ret_t         = ::cuda::std::common_type_t<_Tp, _Lhs>;
-  constexpr int __shift = ::cuda::std::__num_bits_v<__ret_t> / 2;
-  using __half_bits_t   = ::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<__ret_t>>;
-  auto __x_high         = static_cast<__half_bits_t>(__x >> __shift);
-  auto __x_low          = static_cast<__half_bits_t>(__x);
-  auto __y_high         = static_cast<__half_bits_t>(__y >> __shift);
-  auto __y_low          = static_cast<__half_bits_t>(__y);
-  auto __p0             = __x_low * __y_low;
-  auto __p1             = __x_low * __y_high;
-  auto __p2             = __x_high * __y_low;
-  auto __p3             = __x_high * __y_high;
-  auto __mid            = __p1 + __p2;
-  __half_bits_t __carry = (__mid < __p1);
-  auto __po_half        = __p0 >> __shift;
-  __mid                 = __mid + __po_half;
-  __carry += (__mid < __po_half);
-  return __p3 + (__mid >> __shift) + (__carry << __shift);
-}
-template <typename _Tp, typename _Lhs>
-[[nodiscard]] _CCCL_API constexpr ::cuda::std::common_type_t<_Tp, _Lhs> __multiply_extract_higher_bits(_Tp __x, _Lhs __y)
-{
-  using ::cuda::std::__cccl_is_integer_v;
-  using ::cuda::std::__num_bits_v;
-  using ::cuda::std::is_signed_v;
-  static_assert(__cccl_is_integer_v<_Tp>, "__multiply_extract_higher_bits: T is required to be an integer type");
-  static_assert(__cccl_is_integer_v<_Lhs>, "__multiply_extract_higher_bits: T is required to be an integer type");
-  if constexpr (is_signed_v<_Tp>)
-  {
-    _CCCL_ASSERT(__x >= 0, "__x must be non-negative");
-    _CCCL_ASSUME(__x >= 0);
-  }
-  if constexpr (is_signed_v<_Lhs>)
-  {
-    _CCCL_ASSERT(__y >= 0, "__y must be non-negative");
-    _CCCL_ASSUME(__y >= 0);
-  }
-  using __ret_t = ::cuda::std::common_type_t<_Tp, _Lhs>;
-  if (!::cuda::std::__cccl_default_is_constant_evaluated())
-  {
-    if constexpr (sizeof(_Tp) == sizeof(uint32_t) && sizeof(_Lhs) == sizeof(uint32_t))
-    {
-      NV_IF_TARGET(NV_IS_DEVICE, (return ::__umulhi(static_cast<uint32_t>(__x), static_cast<uint32_t>(__y));));
-    }
-#if !_CCCL_HAS_INT128()
-    else if constexpr (sizeof(_Tp) == sizeof(uint64_t) && sizeof(_Lhs) == sizeof(uint64_t))
-    {
-      NV_DISPATCH_TARGET(NV_IS_DEVICE, (return ::__umul64hi(static_cast<uint64_t>(__x), static_cast<uint64_t>(__y));));
-    }
-#endif // !_CCCL_HAS_INT128()
-  }
-  if constexpr (sizeof(__ret_t) < sizeof(uint64_t) || (sizeof(__ret_t) == sizeof(uint64_t) && _CCCL_HAS_INT128()))
-  {
-    constexpr auto __mul_bits = ::cuda::next_power_of_two(__num_bits_v<_Tp> + __num_bits_v<_Lhs>);
-    using __larger_t          = ::cuda::std::__make_nbit_uint_t<__mul_bits>;
-    auto __ret                = (static_cast<__larger_t>(__x) * __y) >> (__mul_bits / 2);
-    return static_cast<__ret_t>(__ret);
-  }
-  else
-  {
-    return ::cuda::__multiply_extract_higher_bits_fallback(__x, __y);
-  }
-}
 /***********************************************************************************************************************
  * Fast Modulo/Division based on Precomputation
  **********************************************************************************************************************/
@@ -184,6 +112,7 @@ public:
       _CCCL_ASSERT(__dividend >= 0, "dividend must be non-negative");
     }
     using __common_t    = ::cuda::std::common_type_t<_Tp, _Lhs>;
+    using __ucommon_t   = ::cuda::std::make_unsigned_t<__common_t>;
     using _Up           = ::cuda::std::make_unsigned_t<_Lhs>;
     const auto __div    = __divisor1.__divisor; // cannot use structure binding because of clang-14
     const auto __mul    = __divisor1.__multiplier;
@@ -205,7 +134,7 @@ public:
     {
       return static_cast<__common_t>(__dividend);
     }
-    auto __higher_bits = ::cuda::__multiply_extract_higher_bits(__udividend, __mul);
+    auto __higher_bits = ::cuda::mul_hi(static_cast<__ucommon_t>(__udividend), static_cast<__ucommon_t>(__mul));
     auto __quotient    = static_cast<__common_t>(__higher_bits >> __shift_);
     _CCCL_ASSERT(__quotient == static_cast<__common_t>(__dividend / __div), "wrong __quotient");
     return __quotient;

cuda/cccl/headers/include/cuda/__cmath/mul_hi.h ADDED Viewed

@@ -0,0 +1,146 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+#ifndef _CUDA___CMATH_MUL_HI_H
+#define _CUDA___CMATH_MUL_HI_H
+#include <cuda/std/detail/__config>
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+#include <cuda/std/__type_traits/is_constant_evaluated.h>
+#include <cuda/std/__type_traits/is_integer.h>
+#include <cuda/std/__type_traits/is_signed.h>
+#include <cuda/std/__type_traits/make_nbit_int.h>
+#include <cuda/std/__type_traits/make_unsigned.h>
+#include <cuda/std/__type_traits/num_bits.h>
+#include <cuda/std/cstdint>
+#if _CCCL_COMPILER(MSVC)
+#  include <intrin.h>
+#endif // _CCCL_COMPILER(MSVC)
+#include <cuda/std/__cccl/prologue.h>
+_CCCL_BEGIN_NAMESPACE_CUDA
+/***********************************************************************************************************************
+ * Extract higher bits after multiplication
+ **********************************************************************************************************************/
+template <typename _Tp>
+[[nodiscard]] _CCCL_API constexpr _Tp __mul_hi_fallback(_Tp __lhs, _Tp __rhs) noexcept // upper half of the double-width product __lhs * __rhs, computed without a wider integer type
+{
+  static_assert(::cuda::std::is_unsigned_v<_Tp>, "__mul_hi_fallback: T is required to be a unsigned integer type");
+  constexpr int __half_bits = ::cuda::std::__num_bits_v<_Tp> / 2; // number of bits in one half of _Tp
+  using __half_bits_t       = ::cuda::std::__make_nbit_uint_t<__half_bits>; // unsigned type that holds one half
+  const auto __lhs_low      = static_cast<__half_bits_t>(__lhs); // low half of __lhs (32-bit when _Tp is 64-bit)
+  const auto __lhs_high     = static_cast<__half_bits_t>(__lhs >> __half_bits); // high half of __lhs (32-bit when _Tp is 64-bit)
+  const auto __rhs_low      = static_cast<__half_bits_t>(__rhs); // low half of __rhs (32-bit when _Tp is 64-bit)
+  const auto __rhs_high     = static_cast<__half_bits_t>(__rhs >> __half_bits); // high half of __rhs (32-bit when _Tp is 64-bit)
+  const auto __po_half      = (static_cast<_Tp>(__lhs_low) * __rhs_low) >> __half_bits; // upper half of low*low; only this part can carry into the result
+  const auto __p1           = static_cast<_Tp>(__lhs_low) * __rhs_high; // cross product low*high, full _Tp width (64-bit when _Tp is 64-bit)
+  const auto __p2           = static_cast<_Tp>(__lhs_high) * __rhs_low; // cross product high*low, full _Tp width (64-bit when _Tp is 64-bit)
+  const auto __p3           = static_cast<_Tp>(__lhs_high) * __rhs_high; // high*high, contributes to the result unshifted
+  const auto __p1_half      = static_cast<__half_bits_t>(__p1); // low half of __p1
+  const auto __p2_half      = static_cast<__half_bits_t>(__p2); // low half of __p2
+  const auto __carry        = (__po_half + __p1_half + __p2_half) >> __half_bits; // carry out of the middle column, summed at _Tp width
+  return __p3 + (__p1 >> __half_bits) + (__p2 >> __half_bits) + __carry;
+}
+_CCCL_TEMPLATE(typename _Tp)
+_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>)
+[[nodiscard]]
+_CCCL_API constexpr _Tp mul_hi(_Tp __lhs, _Tp __rhs) noexcept
+{ // Returns the upper half of the double-width product __lhs * __rhs, for signed and unsigned integer types.
+  using ::cuda::std::int64_t;
+  using ::cuda::std::is_signed_v;
+  if (!::cuda::std::__cccl_default_is_constant_evaluated()) // intrinsics are attempted only outside constant evaluation
+  {
+    if constexpr (sizeof(_Tp) == sizeof(int)) // int-sized types: device intrinsics __mulhi / __umulhi
+    {
+      if constexpr (is_signed_v<_Tp>)
+      {
+        [[maybe_unused]] const auto __lhs1 = static_cast<int>(__lhs);
+        [[maybe_unused]] const auto __rhs1 = static_cast<int>(__rhs);
+        NV_IF_TARGET(NV_IS_DEVICE, (return ::__mulhi(__lhs1, __rhs1);));
+      }
+      else // is_unsigned_v<_Tp>
+      {
+        [[maybe_unused]] const auto __lhs1 = static_cast<unsigned>(__lhs);
+        [[maybe_unused]] const auto __rhs1 = static_cast<unsigned>(__rhs);
+        NV_IF_TARGET(NV_IS_DEVICE, (return ::__umulhi(__lhs1, __rhs1);));
+      }
+    }
+    else if constexpr (sizeof(_Tp) == sizeof(int64_t)) // 64-bit types: device intrinsics, plus MSVC host intrinsics
+    {
+      if constexpr (is_signed_v<_Tp>)
+      {
+        [[maybe_unused]] const auto __lhs1 = static_cast<long long>(__lhs);
+        [[maybe_unused]] const auto __rhs1 = static_cast<long long>(__rhs);
+        NV_IF_TARGET(NV_IS_DEVICE, (return ::__mul64hi(__lhs1, __rhs1);));
+#if _CCCL_COMPILER(MSVC)
+        NV_IF_TARGET(NV_IS_HOST, (return ::__mulh(__lhs1, __rhs1);));
+#endif // _CCCL_COMPILER(MSVC)
+      }
+      else // is_unsigned_v<_Tp>
+      {
+        [[maybe_unused]] const auto __lhs1 = static_cast<unsigned long long>(__lhs);
+        [[maybe_unused]] const auto __rhs1 = static_cast<unsigned long long>(__rhs);
+        NV_IF_TARGET(NV_IS_DEVICE, (return ::__umul64hi(__lhs1, __rhs1);));
+#if _CCCL_COMPILER(MSVC)
+        NV_IF_TARGET(NV_IS_HOST, (return ::__umulh(__lhs1, __rhs1);));
+#endif // _CCCL_COMPILER(MSVC)
+      }
+    }
+  }
+  if constexpr (sizeof(_Tp) < sizeof(int64_t) || (sizeof(_Tp) == sizeof(int64_t) && _CCCL_HAS_INT128())) // an integer type twice as wide as _Tp is used
+  {
+    constexpr auto __bits = ::cuda::std::__num_bits_v<_Tp>;
+    using __larger_t      = ::cuda::std::__make_nbit_int_t<__bits * 2, is_signed_v<_Tp>>; // same signedness as _Tp, twice the width
+    const auto __ret      = (static_cast<__larger_t>(__lhs) * __rhs) >> __bits; // full product, then keep the upper half
+    return static_cast<_Tp>(__ret);
+  }
+  else // sizeof(_Tp) > sizeof(int64_t) || (sizeof(_Tp) == sizeof(int64_t) && !_CCCL_HAS_INT128())
+  {
+    if constexpr (is_signed_v<_Tp>)
+    {
+      using _Up         = ::cuda::std::make_unsigned_t<_Tp>;
+      const auto __lhs1 = static_cast<_Up>(__lhs);
+      const auto __rhs1 = static_cast<_Up>(__rhs);
+      auto __hi         = ::cuda::__mul_hi_fallback(__lhs1, __rhs1); // unsigned high half of the reinterpreted operands
+      if (__lhs < 0) // signed correction: subtract the other operand once for each negative input
+      {
+        __hi -= __rhs1;
+      }
+      if (__rhs < 0)
+      {
+        __hi -= __lhs1;
+      }
+      return static_cast<_Tp>(__hi);
+    }
+    else
+    {
+      return ::cuda::__mul_hi_fallback(__lhs, __rhs);
+    }
+  }
+}
+_CCCL_END_NAMESPACE_CUDA
+#include <cuda/std/__cccl/epilogue.h>
+#endif // _CUDA___CMATH_MUL_HI_H

cuda/cccl/headers/include/cuda/__complex/get_real_imag.h CHANGED Viewed

@@ -24,10 +24,6 @@
 #include <cuda/__fwd/complex.h>
 #include <cuda/std/__fwd/complex.h>
-#if !_CCCL_COMPILER(NVRTC)
-#  include <complex>
-#endif // !_CCCL_COMPILER(NVRTC)
 #include <cuda/std/__cccl/prologue.h>
 _CCCL_BEGIN_NAMESPACE_CUDA

cuda/cccl/headers/include/cuda/__device/arch_id.h ADDED Viewed

@@ -0,0 +1,176 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+#ifndef _CUDA___DEVICE_ARCH_ID_H
+#define _CUDA___DEVICE_ARCH_ID_H
+#include <cuda/std/detail/__config>
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+#include <cuda/__device/compute_capability.h>
+#include <cuda/__fwd/devices.h>
+#include <cuda/std/__type_traits/always_false.h>
+#include <cuda/std/__utility/to_underlying.h>
+#include <cuda/std/__cccl/prologue.h>
+_CCCL_BEGIN_NAMESPACE_CUDA
+//! @brief Architecture identifier
+//! This type identifies an architecture. It has more possible entries than just numeric values of the compute
+//! capability. For example, sm_90 and sm_90a have the same compute capability, but the identifier is different: plain entries use the compute capability value itself, while the `a` entries use that value multiplied by __arch_specific_id_multiplier.
+enum class arch_id : int
+{
+  sm_60   = 60,
+  sm_61   = 61,
+  sm_70   = 70,
+  sm_75   = 75,
+  sm_80   = 80,
+  sm_86   = 86,
+  sm_87   = 87,
+  sm_88   = 88,
+  sm_89   = 89,
+  sm_90   = 90,
+  sm_100  = 100,
+  sm_103  = 103,
+  sm_110  = 110,
+  sm_120  = 120,
+  sm_121  = 121,
+  sm_90a  = 90 * __arch_specific_id_multiplier, // architecture-specific ids start here; the multiplier is declared outside this file
+  sm_100a = 100 * __arch_specific_id_multiplier,
+  sm_103a = 103 * __arch_specific_id_multiplier,
+  sm_110a = 110 * __arch_specific_id_multiplier,
+  sm_120a = 120 * __arch_specific_id_multiplier,
+  sm_121a = 121 * __arch_specific_id_multiplier,
+};
+[[nodiscard]] _CCCL_API constexpr bool __has_known_arch(compute_capability __cc) noexcept // true iff __cc.get() equals the value of one of the plain (non-`a`) arch_id enumerators
+{
+  switch (__cc.get())
+  {
+    case ::cuda::std::to_underlying(arch_id::sm_60):
+    case ::cuda::std::to_underlying(arch_id::sm_61):
+    case ::cuda::std::to_underlying(arch_id::sm_70):
+    case ::cuda::std::to_underlying(arch_id::sm_75):
+    case ::cuda::std::to_underlying(arch_id::sm_80):
+    case ::cuda::std::to_underlying(arch_id::sm_86):
+    case ::cuda::std::to_underlying(arch_id::sm_87):
+    case ::cuda::std::to_underlying(arch_id::sm_88):
+    case ::cuda::std::to_underlying(arch_id::sm_89):
+    case ::cuda::std::to_underlying(arch_id::sm_90):
+    case ::cuda::std::to_underlying(arch_id::sm_100):
+    case ::cuda::std::to_underlying(arch_id::sm_103):
+    case ::cuda::std::to_underlying(arch_id::sm_110):
+    case ::cuda::std::to_underlying(arch_id::sm_120):
+    case ::cuda::std::to_underlying(arch_id::sm_121):
+      return true; // every listed case falls through to this single return
+    default:
+      return false;
+  }
+}
+[[nodiscard]] _CCCL_API constexpr bool __has_known_specific_arch(compute_capability __cc) noexcept // true iff __cc has a matching architecture-specific (`a`) arch_id enumerator
+{
+  switch (__cc.get() * __arch_specific_id_multiplier) // scale the same way the `a` enumerators are defined
+  {
+    case ::cuda::std::to_underlying(arch_id::sm_90a):
+    case ::cuda::std::to_underlying(arch_id::sm_100a):
+    case ::cuda::std::to_underlying(arch_id::sm_103a):
+    case ::cuda::std::to_underlying(arch_id::sm_110a):
+    case ::cuda::std::to_underlying(arch_id::sm_120a):
+    case ::cuda::std::to_underlying(arch_id::sm_121a):
+      return true;
+    default:
+      return false;
+  }
+}
+//! @brief Converts the compute capability to the architecture id.
+//!
+//! @param __cc The compute capability. Must have a corresponding architecture id (checked with _CCCL_ASSERT only).
+//!
+//! @returns The architecture id whose underlying value equals `__cc.get()`.
+[[nodiscard]] _CCCL_API constexpr arch_id to_arch_id(compute_capability __cc) noexcept
+{
+  _CCCL_ASSERT(::cuda::__has_known_arch(__cc), "this compute capability cannot be converted to arch id");
+  return static_cast<arch_id>(__cc.get()); // plain cast; the cast itself does not validate the value
+}
+//! @brief Converts the compute capability to the architecture specific id.
+//!
+//! @param __cc The compute capability. Must have a corresponding architecture specific id (checked with _CCCL_ASSERT only).
+//!
+//! @returns The architecture specific id whose underlying value equals `__cc.get() * __arch_specific_id_multiplier`.
+[[nodiscard]] _CCCL_API constexpr arch_id to_arch_specific_id(compute_capability __cc) noexcept
+{
+  _CCCL_ASSERT(::cuda::__has_known_specific_arch(__cc),
+               "this compute capability cannot be converted to arch specific id");
+  return static_cast<arch_id>(__cc.get() * __arch_specific_id_multiplier); // plain cast; the cast itself does not validate the value
+}
+_CCCL_END_NAMESPACE_CUDA
+#if _CCCL_CUDA_COMPILATION()
+_CCCL_BEGIN_NAMESPACE_CUDA_DEVICE
+//! @brief This function should cause a link error. If it happens, you are trying to compile the code for an unsupported
+//!        architecture (too new/old).
+_CCCL_DEVICE_API ::cuda::arch_id __unknown_cuda_architecture();
+//! @brief Returns the \c cuda::arch_id that is currently being compiled.
+//!
+//!        If the current architecture is not a known architecture from \c cuda::arch_id enumeration, the build fails:
+//!        through a static_assert during device compilation, or through a call to \c __unknown_cuda_architecture() with NVHPC.
+//!
+//! @note This API cannot be used in constexpr context when compiling with nvc++ in CUDA mode.
+template <class _Dummy = void>
+[[nodiscard]] _CCCL_DEVICE_API _CCCL_TARGET_CONSTEXPR ::cuda::arch_id current_arch_id() noexcept
+{
+#  if _CCCL_CUDA_COMPILER(NVHPC)
+  const auto __cc = ::cuda::device::current_compute_capability(); // not declared constexpr on this path
+  if (::cuda::__is_known_arch_of(__cc)) // NOTE(review): __is_known_arch_of is not defined in this file; the predicate defined here is __has_known_arch - confirm this name resolves
+  {
+    return ::cuda::to_arch_id(__cc);
+  }
+  else
+  {
+    return ::cuda::device::__unknown_cuda_architecture(); // declared but not defined in this file
+  }
+#  elif _CCCL_DEVICE_COMPILATION()
+  constexpr auto __cc = ::cuda::device::current_compute_capability();
+#    if defined(__CUDA_ARCH_SPECIFIC__)
+  constexpr auto __is_known_cc = ::cuda::std::__always_false_v<_Dummy> || ::cuda::__has_known_specific_arch(__cc); // the _Dummy term makes the condition depend on the template parameter
+  static_assert(__is_known_cc, "unknown CUDA specific architecture");
+  return ::cuda::to_arch_specific_id(__cc);
+#    else // ^^^ __CUDA_ARCH_SPECIFIC__ ^^^ / vvv !__CUDA_ARCH_SPECIFIC__ vvv
+  constexpr auto __is_known_cc = ::cuda::std::__always_false_v<_Dummy> || ::cuda::__has_known_arch(__cc); // the _Dummy term makes the condition depend on the template parameter
+  static_assert(__is_known_cc, "unknown CUDA architecture");
+  return ::cuda::to_arch_id(__cc);
+#    endif // ^^^ __CUDA_ARCH_SPECIFIC__ ^^^
+#  else
+  return {}; // neither NVHPC nor a device compilation pass: value-initialized arch_id
+#  endif // ^^^ single-pass cuda compiler ^^^
+}
+_CCCL_END_NAMESPACE_CUDA_DEVICE
+#endif // _CCCL_CUDA_COMPILATION()
+#include <cuda/std/__cccl/epilogue.h>
+#endif // _CUDA___DEVICE_ARCH_ID_H