cuda-cccl 0.1.3.2.0.dev438__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cuda-cccl has been flagged as possibly problematic.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/util_device.cuh +51 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
- cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
- cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
- cuda/cccl/headers/include/cuda/__event/event.h +8 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +21 -70
- cuda/compute/__init__.py +77 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
- cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh:

@@ -50,8 +50,8 @@

 #include <cuda/__ptx/instructions/get_sreg.h>
 #include <cuda/std/__algorithm/clamp.h>
-#include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__bit/has_single_bit.h>
+#include <cuda/std/__bit/integral.h>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/integral_constant.h>
 #include <cuda/std/__type_traits/is_integral.h>

@@ -630,7 +630,7 @@ struct WarpScanShfl
 ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le();

 // Find index of first set bit
-int segment_first_lane = ::cuda::std::
+int segment_first_lane = ::cuda::std::__bit_log2(ballot);

 // Iterate scan steps
 _CCCL_PRAGMA_UNROLL_FULL()
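The warp_scan_shfl.cuh change swaps the tail of the segment-head computation for `__bit_log2`: the ballot of segment flags is ANDed with the `%lanemask_le` mask (lanes at or below the caller), and the index of the highest surviving set bit is the first lane of the caller's segment. Below is a host-side sketch of that bit trick using C++20 `<bit>`; the `lanemask_le` and `bit_log2` helpers are illustrative stand-ins for the PTX special register and the CCCL bit helpers, not the CCCL code itself.

```cpp
// Host-side sketch: nearest flagged lane at or below a given lane via a masked ballot.
#include <bit>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Index of the most significant set bit; mask must be non-zero.
int bit_log2(std::uint32_t mask) { return std::bit_width(mask) - 1; }

// Mask of lanes with index <= lane (what %lanemask_le provides on the GPU).
std::uint32_t lanemask_le(int lane) {
  return lane == 31 ? 0xFFFFFFFFu : ((1u << (lane + 1)) - 1u);
}

int main() {
  // Segment heads flagged at lanes 0, 9 and 20.
  std::uint32_t ballot = 0b0000'0000'0001'0000'0000'0010'0000'0001u;
  for (int lane : {5, 9, 31}) {
    std::uint32_t at_or_below = ballot & lanemask_le(lane);
    std::printf("lane %2d -> segment first lane %d\n", lane, bit_log2(at_or_below));
  }
}
```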
cuda/cccl/headers/include/cuda/__algorithm/copy.h:

@@ -11,7 +11,7 @@
 #ifndef __CUDA___ALGORITHM_COPY_H
 #define __CUDA___ALGORITHM_COPY_H

-#include <cuda/
+#include <cuda/std/detail/__config>

 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 # pragma GCC system_header

@@ -38,11 +38,11 @@ enum class source_access_order
 {
 # if _CCCL_CTK_AT_LEAST(13, 0)
   //! @brief Access source in stream order
-  stream = cudaMemcpySrcAccessOrderStream,
+  stream = ::cudaMemcpySrcAccessOrderStream,
   //! @brief Access source during the copy call, source can be destroyed after the API returns
-  during_api_call = cudaMemcpySrcAccessOrderDuringApiCall,
+  during_api_call = ::cudaMemcpySrcAccessOrderDuringApiCall,
   //! @brief Access source in any order, the order can change across CUDA releases
-  any = cudaMemcpySrcAccessOrderAny,
+  any = ::cudaMemcpySrcAccessOrderAny,
 # else
   any = 0x3,
 # endif // _CCCL_CTK_BELOW(13, 0)
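The only change to `source_access_order` is the leading `::` on the CUDA runtime enumerators. Inside a nested namespace an unqualified name is looked up in the nearest enclosing scopes first, so a same-named symbol introduced anywhere in the library's namespaces could silently win; the global-scope qualifier pins lookup to the runtime constants. A minimal, self-contained illustration of that lookup difference follows; all names in it are invented for the example, not CCCL or CUDA symbols.

```cpp
// Why "::name" matters for enumerator initializers inside nested namespaces.
#include <cstdio>

// Stand-in for a constant defined at global scope by a C header (e.g. the CUDA runtime).
enum GlobalOrder { globalOrderStream = 1 };

namespace lib {
// A colliding name in an enclosing namespace wins unqualified lookup.
constexpr int globalOrderStream = 42;

namespace detail {
enum class source_access_order {
  unqualified = globalOrderStream,   // finds lib::globalOrderStream (42)
  qualified   = ::globalOrderStream, // always the global constant (1)
};
} // namespace detail
} // namespace lib

int main() {
  std::printf("unqualified = %d, qualified = %d\n",
              static_cast<int>(lib::detail::source_access_order::unqualified),
              static_cast<int>(lib::detail::source_access_order::qualified));
}
```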
cuda/cccl/headers/include/cuda/__device/all_devices.h:

@@ -11,7 +11,7 @@
 #ifndef _CUDA___DEVICE_ALL_DEVICES_H
 #define _CUDA___DEVICE_ALL_DEVICES_H

-#include <cuda/
+#include <cuda/std/detail/__config>

 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 # pragma GCC system_header

@@ -22,10 +22,12 @@
 #endif // no system header

 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+
+# include <cuda/__device/device_ref.h>
 # include <cuda/__device/physical_device.h>
-# include <cuda/
-# include <cuda/
-# include <cuda/std/
+# include <cuda/__driver/driver_api.h>
+# include <cuda/__fwd/devices.h>
+# include <cuda/std/__cstddef/types.h>
 # include <cuda/std/span>

 # include <vector>

@@ -33,132 +35,62 @@
 # include <cuda/std/__cccl/prologue.h>

 _CCCL_BEGIN_NAMESPACE_CUDA
-namespace __detail
-{
-//! @brief A random-access range of all available CUDA devices
-class all_devices
-{
-public:
-  using size_type = ::std::vector<physical_device>::size_type;
-  using iterator = ::std::vector<physical_device>::const_iterator;
-  using const_iterator = ::std::vector<physical_device>::const_iterator;
-
-  all_devices() = default;
-
-  [[nodiscard]] const physical_device& operator[](size_type __i) const;
-
-  [[nodiscard]] size_type size() const;

-
-
-  [[nodiscard]] iterator end() const noexcept;
-
-  operator ::cuda::std::span<const device_ref>() const;
-
-private:
-  struct __initializer_iterator;
-
-  static const ::std::vector<physical_device>& __devices();
-};
-
-//! @brief An iterator used to in-place construct `device` objects in a
-//! std::vector.
-//!
-//! Since `device` objects are not movable or copyable, we need to construct them
-//! in-place with a proxy object that can be implicitly converted to a `device`
-//! object.
-struct all_devices::__initializer_iterator
+[[nodiscard]] _CCCL_HOST_API inline ::std::vector<device_ref> __make_devices()
 {
-
-
-
-  using difference_type = int;
-  using pointer = __emplace_device;
-
-  int __id_;
-
-  __emplace_device operator*() const noexcept
+  ::std::vector<device_ref> __ret{};
+  __ret.reserve(::cuda::__physical_devices().size());
+  for (::cuda::std::size_t __i = 0; __i < ::cuda::__physical_devices().size(); ++__i)
   {
-
+    __ret.emplace_back(static_cast<int>(__i));
   }
+  return __ret;
+}

-
-
-
-}
+[[nodiscard]] inline ::cuda::std::span<const device_ref> __devices()
+{
+  static const auto __devices = ::cuda::__make_devices();
+  return ::cuda::std::span<const device_ref>{__devices.data(), __devices.size()};
+}

-
-
-
-
-
+//! @brief A random-access range of all available CUDA devices
+class __all_devices
+{
+public:
+  using value_type = ::cuda::std::span<const device_ref>::value_type;
+  using size_type = ::cuda::std::span<const device_ref>::size_type;
+  using iterator = ::cuda::std::span<const device_ref>::iterator;
+
+  _CCCL_HIDE_FROM_ABI __all_devices() = default;
+  __all_devices(const __all_devices&) = delete;
+  __all_devices(__all_devices&&) = delete;
+  __all_devices& operator=(const __all_devices&) = delete;
+  __all_devices& operator=(__all_devices&&) = delete;

-
+  [[nodiscard]] _CCCL_HOST_API device_ref operator[](size_type __i) const
   {
-
-
-
+    if (__i >= size())
+    {
+      ::cuda::std::__throw_out_of_range("device index out of range");
+    }
+    return ::cuda::__devices()[__i];
   }

-
+  [[nodiscard]] _CCCL_HOST_API size_type size() const
   {
-    return
+    return ::cuda::__devices().size();
   }

-
+  [[nodiscard]] _CCCL_HOST_API iterator begin() const
   {
-    return
+    return ::cuda::__devices().begin();
   }
-};

-  [[nodiscard]]
-  {
-    if (__id_ >= size())
+  [[nodiscard]] _CCCL_HOST_API iterator end() const
   {
-
-    {
-      ::cuda::std::__throw_out_of_range("device was requested but no CUDA devices found");
-    }
-    else
-    {
-      ::cuda::std::__throw_out_of_range(
-        (::std::string("device index out of range: ") + ::std::to_string(__id_)).c_str());
-    }
+    return ::cuda::__devices().end();
   }
-
-}
-
-[[nodiscard]] inline all_devices::size_type all_devices::size() const
-{
-  return __devices().size();
-}
-
-[[nodiscard]] inline all_devices::iterator all_devices::begin() const noexcept
-{
-  return __devices().begin();
-}
-
-[[nodiscard]] inline all_devices::iterator all_devices::end() const noexcept
-{
-  return __devices().end();
-}
-
-inline all_devices::operator ::cuda::std::span<const device_ref>() const
-{
-  static const ::std::vector<device_ref> __refs(begin(), end());
-  return ::cuda::std::span<const device_ref>(__refs);
-}
-
-inline const ::std::vector<physical_device>& all_devices::__devices()
-{
-  static const ::std::vector<physical_device> __devices = [] {
-    int __count = 0;
-    _CCCL_TRY_CUDA_API(::cudaGetDeviceCount, "failed to get the count of CUDA devices", &__count);
-    return ::std::vector<physical_device>{__initializer_iterator{0}, __initializer_iterator{__count}};
-  }();
-  return __devices;
-}
-} // namespace __detail
+};

 //! @brief A range of all available CUDA devices
 //!

@@ -174,7 +106,7 @@ inline const ::std::vector<physical_device>& all_devices::__devices()
 //! struct iterator;
 //! using const_iterator = iterator;
 //!
-//! [[nodiscard]]
+//! [[nodiscard]] device_ref operator[](size_type i) const noexcept;
 //!
 //! [[nodiscard]] size_type size() const;
 //!

@@ -186,7 +118,7 @@ inline const ::std::vector<physical_device>& all_devices::__devices()
 //!
 //! @par
 //! `__all_devices::iterator` is a random access iterator with a `reference`
-//! type of `const
+//! type of `const device_ref&`.
 //!
 //! @par Example
 //! @code

@@ -197,39 +129,7 @@ inline const ::std::vector<physical_device>& all_devices::__devices()
 //! @sa
 //! * device
 //! * device_ref
-inline constexpr
-
-inline const arch::traits_t& device_ref::arch_traits() const
-{
-  return devices[get()].arch_traits();
-}
-
-[[nodiscard]] inline ::std::vector<device_ref> device_ref::peer_devices() const
-{
-  ::std::vector<device_ref> __result;
-  __result.reserve(devices.size());
-
-  for (const physical_device& __other_dev : devices)
-  {
-    // Exclude the device this API is called on. The main use case for this API
-    // is enable/disable peer access. While enable peer access can be called on
-    // device on which memory resides, disable peer access will error-out.
-    // Usage of the peer access control is smoother when *this is excluded,
-    // while it can be easily added with .push_back() on the vector if a full
-    // group of peers is needed (for cases other than peer access control)
-    if (__other_dev != *this)
-    {
-      // While in almost all practical applications peer access should be symmetrical,
-      // it is possible to build a system with one directional peer access, check
-      // both ways here just to be safe
-      if (has_peer_access_to(__other_dev) && __other_dev.has_peer_access_to(*this))
-      {
-        __result.push_back(__other_dev);
-      }
-    }
-  }
-  return __result;
-}
+inline constexpr __all_devices devices{};

 _CCCL_END_NAMESPACE_CUDA
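The all_devices.h rewrite drops the vector-of-`physical_device` based `__detail::all_devices` class and its in-place-construction iterator, and instead builds a `device_ref` list once behind a function-local static, exposes it as a span, and wraps that span in the trivially constructible `__all_devices` range behind the `cuda::devices` global. A simplified, self-contained sketch of that pattern in plain C++20 follows; `device_ref`, the hard-coded device count, and the helper names are stand-ins rather than the CCCL implementation, which queries the CUDA driver.

```cpp
// Sketch: lazily built device list exposed through a span-backed, stateless range.
#include <cstddef>
#include <cstdio>
#include <span>
#include <stdexcept>
#include <vector>

struct device_ref {  // stand-in for cuda::device_ref
  int id;
};

std::span<const device_ref> all_device_refs() {
  // Function-local static: built once, thread-safe, on first use.
  static const std::vector<device_ref> refs = [] {
    int count = 4;  // hypothetical; the real header asks the CUDA driver for the count
    std::vector<device_ref> v;
    v.reserve(count);
    for (int i = 0; i < count; ++i) v.push_back(device_ref{i});
    return v;
  }();
  return {refs.data(), refs.size()};
}

struct all_devices_range {
  device_ref operator[](std::size_t i) const {
    if (i >= size()) throw std::out_of_range("device index out of range");
    return all_device_refs()[i];
  }
  std::size_t size() const { return all_device_refs().size(); }
  auto begin() const { return all_device_refs().begin(); }
  auto end() const { return all_device_refs().end(); }
};

// The range holds no state, so a constexpr global instance costs nothing.
inline constexpr all_devices_range devices{};

int main() {
  std::printf("%zu devices\n", devices.size());
  for (device_ref d : devices) std::printf("device %d\n", d.id);
}
```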
cuda/cccl/headers/include/cuda/__device/arch_traits.h:

@@ -11,7 +11,7 @@
 #ifndef _CUDA___DEVICE_ARCH_TRAITS_H
 #define _CUDA___DEVICE_ARCH_TRAITS_H

-#include <cuda/
+#include <cuda/std/detail/__config>

 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 # pragma GCC system_header

@@ -22,7 +22,9 @@
 #endif // no system header

 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+
 # include <cuda/__device/attributes.h>
+# include <cuda/__fwd/devices.h>
 # include <cuda/std/__exception/cuda_error.h>
 # include <cuda/std/limits>

@@ -58,76 +60,76 @@ enum class id : int
   sm_120a = 120 * __arch_specific_id_multiplier,
 };

-
-
+//! @brief Architecture traits
+//! This type contains information about an architecture that is constant across devices of that architecture.
 struct traits_t
 {
   // Maximum number of threads per block
-
+  int max_threads_per_block = 1024;

   // Maximum x-dimension of a block
-
+  int max_block_dim_x = 1024;

   // Maximum y-dimension of a block
-
+  int max_block_dim_y = 1024;

   // Maximum z-dimension of a block
-
+  int max_block_dim_z = 64;

   // Maximum x-dimension of a grid
-
+  int max_grid_dim_x = ::cuda::std::numeric_limits<int32_t>::max();

   // Maximum y-dimension of a grid
-
+  int max_grid_dim_y = 64 * 1024 - 1;

   // Maximum z-dimension of a grid
-
+  int max_grid_dim_z = 64 * 1024 - 1;

   // Maximum amount of shared memory available to a thread block in bytes
-
+  ::cuda::std::size_t max_shared_memory_per_block = 48 * 1024;

   // Memory available on device for __constant__ variables in a CUDA C kernel in bytes
-
+  ::cuda::std::size_t total_constant_memory = 64 * 1024;

   // Warp size in threads
-
+  int warp_size = 32;

   // Maximum number of concurrent grids on the device
-
+  int max_resident_grids = 128;

   // true if the device can concurrently copy memory between host and device
   // while executing a kernel, or false if not
-
+  bool gpu_overlap = true;

   // true if the device can map host memory into CUDA address space
-
+  bool can_map_host_memory = true;

   // true if the device supports executing multiple kernels within the same
   // context simultaneously, or false if not. It is not guaranteed that multiple
   // kernels will be resident on the device concurrently so this feature should
   // not be relied upon for correctness.
-
+  bool concurrent_kernels = true;

   // true if the device supports stream priorities, or false if not
-
+  bool stream_priorities_supported = true;

   // true if device supports caching globals in L1 cache, false if not
-
+  bool global_l1_cache_supported = true;

   // true if device supports caching locals in L1 cache, false if not
-
+  bool local_l1_cache_supported = true;

   // TODO: We might want to have these per-arch
   // Maximum number of 32-bit registers available to a thread block
-
+  int max_registers_per_block = 64 * 1024;

   // Maximum number of 32-bit registers available to a multiprocessor; this
   // number is shared by all thread blocks simultaneously resident on a
   // multiprocessor
-
+  int max_registers_per_multiprocessor = 64 * 1024;

   // Maximum number of 32-bit registers available to a thread
-
+  int max_registers_per_thread = 255;

   // Identifier for the architecture
   id arch_id;

@@ -144,7 +146,7 @@ struct traits_t
   // Maximum amount of shared memory available to a multiprocessor in bytes;
   // this amount is shared by all thread blocks simultaneously resident on a
   // multiprocessor
-
+  ::cuda::std::size_t max_shared_memory_per_multiprocessor;

   // Maximum number of thread blocks that can reside on a multiprocessor
   int max_blocks_per_multiprocessor;

@@ -156,11 +158,11 @@ struct traits_t
   int max_warps_per_multiprocessor;

   // Shared memory reserved by CUDA driver per block in bytes
-
+  ::cuda::std::size_t reserved_shared_memory_per_block;

   // Maximum per block shared memory size on the device. This value can be opted
   // into when using dynamic_shared_memory with NonPortableSize set to true
-
+  ::cuda::std::size_t max_shared_memory_per_block_optin;

   // TODO: Do we want these?:
   // true if architecture supports clusters
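The `traits_t` hunks replace bare member declarations with default member initializers for the values that are the same on every supported architecture, while the per-architecture members stay uninitialized and are filled in by the `traits<id>()` specializations shown further down, either from scratch or by copying a related architecture's traits. A condensed sketch of that pattern follows; the names and the shared-memory number are illustrative placeholders, not real hardware limits or CCCL code.

```cpp
// Sketch: shared defaults in the struct, per-architecture overrides in constexpr factories.
#include <cstdio>

enum class arch_id { sm_100, sm_103 };

struct traits_t {
  int max_threads_per_block = 1024;  // identical across architectures
  int warp_size = 32;                // identical across architectures
  arch_id id{};
  int max_shared_memory_per_multiprocessor = 0;  // architecture-specific, filled per arch
};

template <arch_id Id>
constexpr traits_t traits();

template <>
constexpr traits_t traits<arch_id::sm_100>() {
  traits_t t{};                      // start from the shared defaults
  t.id = arch_id::sm_100;
  t.max_shared_memory_per_multiprocessor = 228 * 1024;  // illustrative value only
  return t;
}

template <>
constexpr traits_t traits<arch_id::sm_103>() {
  traits_t t = traits<arch_id::sm_100>();  // inherit from a related architecture
  t.id = arch_id::sm_103;
  return t;
}

int main() {
  constexpr traits_t t = traits<arch_id::sm_103>();
  std::printf("warp=%d smem/sm=%d\n", t.warp_size, t.max_shared_memory_per_multiprocessor);
}
```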
@@ -182,10 +184,10 @@ struct traits_t
 // @brief Architecture traits
 // Template function that returns the traits for an architecture with a given id.
 template <id _Id>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API constexpr traits_t traits();

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_60>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_60;

@@ -208,7 +210,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_61>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_61;

@@ -231,7 +233,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_70>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_70;

@@ -255,7 +257,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_75>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_75;

@@ -279,7 +281,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_80>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_80;

@@ -303,7 +305,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_86>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_86;

@@ -327,7 +329,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_89>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_89;

@@ -351,7 +353,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_90>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_90;

@@ -376,13 +378,13 @@ template <>

 // No sm_90a specific fields for now.
 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_90a>()
 {
   return ::cuda::arch::traits<id::sm_90>();
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_100>()
 {
   traits_t __traits{};
   __traits.arch_id = id::sm_100;

@@ -406,13 +408,13 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_100a>()
 {
   return ::cuda::arch::traits<id::sm_100>();
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_103>()
 {
   traits_t __traits = ::cuda::arch::traits<id::sm_100>();
   __traits.arch_id = id::sm_103;

@@ -423,13 +425,13 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_103a>()
 {
   return ::cuda::arch::traits<id::sm_103>();
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_110>()
 {
   traits_t __traits = ::cuda::arch::traits<id::sm_100>();
   __traits.arch_id = id::sm_110;

@@ -440,7 +442,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_110a>()
 {
   return ::cuda::arch::traits<id::sm_110>();
 };

@@ -470,7 +472,7 @@ template <>
 };

 template <>
-[[nodiscard]]
+[[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_120a>()
 {
   return ::cuda::arch::traits<id::sm_120>();
 };

@@ -516,7 +518,7 @@ inline constexpr int __highest_known_arch = 120;
     case id::sm_120a:
       return ::cuda::arch::traits<id::sm_120a>();
     default:
-      ::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Traits requested for an unknown architecture");
+      ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Traits requested for an unknown architecture");
       break;
   }
 }

@@ -525,7 +527,7 @@ inline constexpr int __highest_known_arch = 120;
 {
   if (compute_capability < 60 || compute_capability > __highest_known_arch)
   {
-    ::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Compute capability out of range");
+    ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
   }
   return static_cast<id>(compute_capability);
 }

@@ -535,7 +537,7 @@ inline constexpr int __highest_known_arch = 120;
   return ::cuda::arch::traits_for_id(::cuda::arch::id_for_compute_capability(compute_capability));
 }

-_CCCL_API inline constexpr id __special_id_for_compute_capability(int value)
+[[nodiscard]] _CCCL_API inline constexpr id __special_id_for_compute_capability(int value)
 {
   switch (value)
   {
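Taken together, these hunks keep the same dispatch shape while tightening attributes and qualification: a runtime compute capability is range-checked, cast to the `id` enumeration, and routed through a switch to the matching `traits<id>()` specialization, throwing `cudaErrorInvalidValue` (now spelled with the global `::` qualifier) for unknown values. Below is a small stand-alone sketch of that flow with invented names and a deliberately abbreviated architecture list; it is not the CCCL implementation.

```cpp
// Sketch: compute capability -> id enum -> per-architecture traits, with validation.
#include <cstdio>
#include <stdexcept>

enum class id : int { sm_80 = 80, sm_86 = 86, sm_90 = 90 };

struct traits_t { id arch; int warp_size = 32; };

template <id I>
constexpr traits_t traits() { return traits_t{I}; }

constexpr id id_for_compute_capability(int cc) {
  if (cc < 80 || cc > 90) throw std::invalid_argument("compute capability out of range");
  return static_cast<id>(cc);
}

constexpr traits_t traits_for_id(id i) {
  switch (i) {
    case id::sm_80: return traits<id::sm_80>();
    case id::sm_86: return traits<id::sm_86>();
    case id::sm_90: return traits<id::sm_90>();
  }
  throw std::invalid_argument("traits requested for an unknown architecture");
}

int main() {
  constexpr traits_t t = traits_for_id(id_for_compute_capability(86));
  std::printf("sm_%d warp=%d\n", static_cast<int>(t.arch), t.warp_size);
}
```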
@@ -550,13 +552,13 @@ _CCCL_API inline constexpr id __special_id_for_compute_capability(int value)
     case 120:
       return id::sm_120a;
     default:
-      ::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Compute capability out of range");
+      ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
       break;
   }
 }

 //! @brief Provides architecture traits of the architecture matching __CUDA_ARCH__ macro
-[[nodiscard]]
+[[nodiscard]] _CCCL_DEVICE_API inline constexpr arch::traits_t current_traits()
 {
   // fixme: this doesn't work with nvc++ -cuda
 # ifdef __CUDA_ARCH__

@@ -571,7 +573,7 @@
 # endif // __CUDA_ARCH__
 }

-[[nodiscard]] inline constexpr arch::traits_t
+[[nodiscard]] _CCCL_HOST_API inline constexpr arch::traits_t
 __arch_traits_might_be_unknown(int __device, unsigned int __compute_capability)
 {
   if (__compute_capability <= arch::__highest_known_arch
|