PyPI - cuda-cccl - Versions diffs - 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294) hide show

cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h CHANGED Viewed

@@ -23,6 +23,7 @@
 #include <cuda/__cmath/ceil_div.h>
 #include <cuda/__cmath/ilog.h>
+#include <cuda/__cmath/mul_hi.h>
 #include <cuda/__cmath/pow2.h>
 #include <cuda/std/__type_traits/common_type.h>
 #include <cuda/std/__type_traits/is_integer.h>
@@ -30,7 +31,6 @@
 #include <cuda/std/__type_traits/make_nbit_int.h>
 #include <cuda/std/__type_traits/make_unsigned.h>
 #include <cuda/std/__type_traits/num_bits.h>
-#include <cuda/std/__type_traits/promote.h>
 #include <cuda/std/__utility/pair.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
@@ -39,78 +39,6 @@
 _CCCL_BEGIN_NAMESPACE_CUDA
-/***********************************************************************************************************************
- * Extract higher bits after multiplication
- **********************************************************************************************************************/
-template <typename _Tp, typename _Lhs>
-[[nodiscard]] _CCCL_API constexpr ::cuda::std::common_type_t<_Tp, _Lhs>
-__multiply_extract_higher_bits_fallback(_Tp __x, _Lhs __y)
-{
-  using __ret_t         = ::cuda::std::common_type_t<_Tp, _Lhs>;
-  constexpr int __shift = ::cuda::std::__num_bits_v<__ret_t> / 2;
-  using __half_bits_t   = ::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<__ret_t>>;
-  auto __x_high         = static_cast<__half_bits_t>(__x >> __shift);
-  auto __x_low          = static_cast<__half_bits_t>(__x);
-  auto __y_high         = static_cast<__half_bits_t>(__y >> __shift);
-  auto __y_low          = static_cast<__half_bits_t>(__y);
-  auto __p0             = __x_low * __y_low;
-  auto __p1             = __x_low * __y_high;
-  auto __p2             = __x_high * __y_low;
-  auto __p3             = __x_high * __y_high;
-  auto __mid            = __p1 + __p2;
-  __half_bits_t __carry = (__mid < __p1);
-  auto __po_half        = __p0 >> __shift;
-  __mid                 = __mid + __po_half;
-  __carry += (__mid < __po_half);
-  return __p3 + (__mid >> __shift) + (__carry << __shift);
-}
-template <typename _Tp, typename _Lhs>
-[[nodiscard]] _CCCL_API constexpr ::cuda::std::common_type_t<_Tp, _Lhs> __multiply_extract_higher_bits(_Tp __x, _Lhs __y)
-{
-  using ::cuda::std::__cccl_is_integer_v;
-  using ::cuda::std::__num_bits_v;
-  using ::cuda::std::is_signed_v;
-  static_assert(__cccl_is_integer_v<_Tp>, "__multiply_extract_higher_bits: T is required to be an integer type");
-  static_assert(__cccl_is_integer_v<_Lhs>, "__multiply_extract_higher_bits: T is required to be an integer type");
-  if constexpr (is_signed_v<_Tp>)
-  {
-    _CCCL_ASSERT(__x >= 0, "__x must be non-negative");
-    _CCCL_ASSUME(__x >= 0);
-  }
-  if constexpr (is_signed_v<_Lhs>)
-  {
-    _CCCL_ASSERT(__y >= 0, "__y must be non-negative");
-    _CCCL_ASSUME(__y >= 0);
-  }
-  using __ret_t = ::cuda::std::common_type_t<_Tp, _Lhs>;
-  if (!::cuda::std::__cccl_default_is_constant_evaluated())
-  {
-    if constexpr (sizeof(_Tp) == sizeof(uint32_t) && sizeof(_Lhs) == sizeof(uint32_t))
-    {
-      NV_IF_TARGET(NV_IS_DEVICE, (return ::__umulhi(static_cast<uint32_t>(__x), static_cast<uint32_t>(__y));));
-    }
-#if !_CCCL_HAS_INT128()
-    else if constexpr (sizeof(_Tp) == sizeof(uint64_t) && sizeof(_Lhs) == sizeof(uint64_t))
-    {
-      NV_DISPATCH_TARGET(NV_IS_DEVICE, (return ::__umul64hi(static_cast<uint64_t>(__x), static_cast<uint64_t>(__y));));
-    }
-#endif // !_CCCL_HAS_INT128()
-  }
-  if constexpr (sizeof(__ret_t) < sizeof(uint64_t) || (sizeof(__ret_t) == sizeof(uint64_t) && _CCCL_HAS_INT128()))
-  {
-    constexpr auto __mul_bits = ::cuda::next_power_of_two(__num_bits_v<_Tp> + __num_bits_v<_Lhs>);
-    using __larger_t          = ::cuda::std::__make_nbit_uint_t<__mul_bits>;
-    auto __ret                = (static_cast<__larger_t>(__x) * __y) >> (__mul_bits / 2);
-    return static_cast<__ret_t>(__ret);
-  }
-  else
-  {
-    return ::cuda::__multiply_extract_higher_bits_fallback(__x, __y);
-  }
-}
 /***********************************************************************************************************************
  * Fast Modulo/Division based on Precomputation
  **********************************************************************************************************************/
@@ -184,6 +112,7 @@ public:
       _CCCL_ASSERT(__dividend >= 0, "dividend must be non-negative");
     }
     using __common_t    = ::cuda::std::common_type_t<_Tp, _Lhs>;
+    using __ucommon_t   = ::cuda::std::make_unsigned_t<__common_t>;
     using _Up           = ::cuda::std::make_unsigned_t<_Lhs>;
     const auto __div    = __divisor1.__divisor; // cannot use structure binding because of clang-14
     const auto __mul    = __divisor1.__multiplier;
@@ -205,7 +134,7 @@ public:
     {
       return static_cast<__common_t>(__dividend);
     }
-    auto __higher_bits = ::cuda::__multiply_extract_higher_bits(__udividend, __mul);
+    auto __higher_bits = ::cuda::mul_hi(static_cast<__ucommon_t>(__udividend), static_cast<__ucommon_t>(__mul));
     auto __quotient    = static_cast<__common_t>(__higher_bits >> __shift_);
     _CCCL_ASSERT(__quotient == static_cast<__common_t>(__dividend / __div), "wrong __quotient");
     return __quotient;

cuda/cccl/headers/include/cuda/__cmath/mul_hi.h ADDED Viewed

@@ -0,0 +1,146 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+#ifndef _CUDA___CMATH_MUL_HI_H
+#define _CUDA___CMATH_MUL_HI_H
+#include <cuda/std/detail/__config>
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+#include <cuda/std/__type_traits/is_constant_evaluated.h>
+#include <cuda/std/__type_traits/is_integer.h>
+#include <cuda/std/__type_traits/is_signed.h>
+#include <cuda/std/__type_traits/make_nbit_int.h>
+#include <cuda/std/__type_traits/make_unsigned.h>
+#include <cuda/std/__type_traits/num_bits.h>
+#include <cuda/std/cstdint>
+#if _CCCL_COMPILER(MSVC)
+#  include <intrin.h>
+#endif // _CCCL_COMPILER(MSVC)
+#include <cuda/std/__cccl/prologue.h>
+_CCCL_BEGIN_NAMESPACE_CUDA
+/***********************************************************************************************************************
+ * Extract higher bits after multiplication
+ *
+ * __mul_hi_fallback(__lhs, __rhs) returns the upper num_bits(_Tp) bits of the double-width product __lhs * __rhs
+ * without using an integer type wider than _Tp. Each operand is split into a low and a high half of
+ * num_bits(_Tp) / 2 bits, the four half-by-half partial products are formed, and the carry produced by the lower
+ * columns is propagated into the result. _Tp must be an unsigned type (enforced by the static_assert below).
+ *
+ * The width remarks in the inline comments describe the case where _Tp is a 64-bit type.
+ *
+ * NOTE(review): is_unsigned_v is used below, but no is_unsigned header appears in the include list visible in this
+ * file -- presumably it is provided transitively; confirm.
+ **********************************************************************************************************************/
+template <typename _Tp>
+[[nodiscard]] _CCCL_API constexpr _Tp __mul_hi_fallback(_Tp __lhs, _Tp __rhs) noexcept
+{
+  static_assert(::cuda::std::is_unsigned_v<_Tp>, "__mul_hi_fallback: T is required to be a unsigned integer type");
+  constexpr int __half_bits = ::cuda::std::__num_bits_v<_Tp> / 2;
+  using __half_bits_t       = ::cuda::std::__make_nbit_uint_t<__half_bits>;
+  const auto __lhs_low      = static_cast<__half_bits_t>(__lhs); // low half of __lhs (32-bit for a 64-bit _Tp)
+  const auto __lhs_high     = static_cast<__half_bits_t>(__lhs >> __half_bits); // high half of __lhs
+  const auto __rhs_low      = static_cast<__half_bits_t>(__rhs); // low half of __rhs
+  const auto __rhs_high     = static_cast<__half_bits_t>(__rhs >> __half_bits); // high half of __rhs
+  const auto __po_half      = (static_cast<_Tp>(__lhs_low) * __rhs_low) >> __half_bits; // upper half of low * low
+  const auto __p1           = static_cast<_Tp>(__lhs_low) * __rhs_high; // cross product low * high, full _Tp width
+  const auto __p2           = static_cast<_Tp>(__lhs_high) * __rhs_low; // cross product high * low, full _Tp width
+  const auto __p3           = static_cast<_Tp>(__lhs_high) * __rhs_high; // high * high, lands entirely in the result
+  const auto __p1_half      = static_cast<__half_bits_t>(__p1); // lower half of __p1
+  const auto __p2_half      = static_cast<__half_bits_t>(__p2); // lower half of __p2
+  const auto __carry        = (__po_half + __p1_half + __p2_half) >> __half_bits; // carry out of the middle column
+  return __p3 + (__p1 >> __half_bits) + (__p2 >> __half_bits) + __carry; // upper halves of the cross products + carry
+}
+_CCCL_TEMPLATE(typename _Tp) // mul_hi: returns the upper num_bits(_Tp) bits of the full-width product __lhs * __rhs
+_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>) // presumably restricts _Tp to integer types -- confirm macro
+[[nodiscard]]
+_CCCL_API constexpr _Tp mul_hi(_Tp __lhs, _Tp __rhs) noexcept // both signed and unsigned _Tp are handled below
+{
+  using ::cuda::std::int64_t;
+  using ::cuda::std::is_signed_v;
+  if (!::cuda::std::__cccl_default_is_constant_evaluated()) // intrinsic paths are skipped during constant evaluation
+  {
+    if constexpr (sizeof(_Tp) == sizeof(int)) // int-sized operands: __mulhi / __umulhi on device
+    {
+      if constexpr (is_signed_v<_Tp>)
+      {
+        [[maybe_unused]] const auto __lhs1 = static_cast<int>(__lhs);
+        [[maybe_unused]] const auto __rhs1 = static_cast<int>(__rhs);
+        NV_IF_TARGET(NV_IS_DEVICE, (return ::__mulhi(__lhs1, __rhs1);)); // no host return here: falls through below
+      }
+      else // is_unsigned_v<_Tp>
+      {
+        [[maybe_unused]] const auto __lhs1 = static_cast<unsigned>(__lhs);
+        [[maybe_unused]] const auto __rhs1 = static_cast<unsigned>(__rhs);
+        NV_IF_TARGET(NV_IS_DEVICE, (return ::__umulhi(__lhs1, __rhs1);)); // no host return here: falls through below
+      }
+    }
+    else if constexpr (sizeof(_Tp) == sizeof(int64_t)) // 64-bit operands: __mul64hi / __umul64hi on device
+    {
+      if constexpr (is_signed_v<_Tp>)
+      {
+        [[maybe_unused]] const auto __lhs1 = static_cast<long long>(__lhs);
+        [[maybe_unused]] const auto __rhs1 = static_cast<long long>(__rhs);
+        NV_IF_TARGET(NV_IS_DEVICE, (return ::__mul64hi(__lhs1, __rhs1);));
+#if _CCCL_COMPILER(MSVC)
+        NV_IF_TARGET(NV_IS_HOST, (return ::__mulh(__lhs1, __rhs1);)); // MSVC host intrinsic (see <intrin.h> include)
+#endif // _CCCL_COMPILER(MSVC)
+      }
+      else // is_unsigned_v<_Tp>
+      {
+        [[maybe_unused]] const auto __lhs1 = static_cast<unsigned long long>(__lhs);
+        [[maybe_unused]] const auto __rhs1 = static_cast<unsigned long long>(__rhs);
+        NV_IF_TARGET(NV_IS_DEVICE, (return ::__umul64hi(__lhs1, __rhs1);));
+#if _CCCL_COMPILER(MSVC)
+        NV_IF_TARGET(NV_IS_HOST, (return ::__umulh(__lhs1, __rhs1);)); // MSVC host intrinsic (see <intrin.h> include)
+#endif // _CCCL_COMPILER(MSVC)
+      }
+    }
+  }
+  if constexpr (sizeof(_Tp) < sizeof(int64_t) || (sizeof(_Tp) == sizeof(int64_t) && _CCCL_HAS_INT128()))
+  { // widening path: multiply in a type with twice the bits (same signedness), then shift the upper half down
+    constexpr auto __bits = ::cuda::std::__num_bits_v<_Tp>;
+    using __larger_t      = ::cuda::std::__make_nbit_int_t<__bits * 2, is_signed_v<_Tp>>;
+    const auto __ret      = (static_cast<__larger_t>(__lhs) * __rhs) >> __bits;
+    return static_cast<_Tp>(__ret);
+  }
+  else // sizeof(_Tp) >= sizeof(int64_t) && !_CCCL_HAS_INT128()
+  {
+    if constexpr (is_signed_v<_Tp>)
+    { // signed case: take the unsigned high half of the reinterpreted operands, then correct for negative inputs
+      using _Up         = ::cuda::std::make_unsigned_t<_Tp>;
+      const auto __lhs1 = static_cast<_Up>(__lhs);
+      const auto __rhs1 = static_cast<_Up>(__rhs);
+      auto __hi         = ::cuda::__mul_hi_fallback(__lhs1, __rhs1);
+      if (__lhs < 0) // a negative __lhs reads as __lhs + 2^bits when unsigned, which adds __rhs1 to the high half
+      {
+        __hi -= __rhs1;
+      }
+      if (__rhs < 0) // symmetric correction for a negative __rhs
+      {
+        __hi -= __lhs1;
+      }
+      return static_cast<_Tp>(__hi);
+    }
+    else
+    {
+      return ::cuda::__mul_hi_fallback(__lhs, __rhs); // unsigned case needs no correction
+    }
+  }
+}
+_CCCL_END_NAMESPACE_CUDA
+#include <cuda/std/__cccl/epilogue.h>
+#endif // _CUDA___CMATH_MUL_HI_H

cuda/cccl/headers/include/cuda/__complex/get_real_imag.h CHANGED Viewed

@@ -24,10 +24,6 @@
 #include <cuda/__fwd/complex.h>
 #include <cuda/std/__fwd/complex.h>
-#if !_CCCL_COMPILER(NVRTC)
-#  include <complex>
-#endif // !_CCCL_COMPILER(NVRTC)
 #include <cuda/std/__cccl/prologue.h>
 _CCCL_BEGIN_NAMESPACE_CUDA

cuda/cccl/headers/include/cuda/__device/all_devices.h CHANGED Viewed

@@ -11,7 +11,7 @@
 #ifndef _CUDA___DEVICE_ALL_DEVICES_H
 #define _CUDA___DEVICE_ALL_DEVICES_H
-#include <cuda/__cccl_config>
+#include <cuda/std/detail/__config>
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
@@ -22,10 +22,12 @@
 #endif // no system header
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+#  include <cuda/__device/device_ref.h>
 #  include <cuda/__device/physical_device.h>
 #  include <cuda/__driver/driver_api.h>
-#  include <cuda/std/cassert>
-#  include <cuda/std/detail/libcxx/include/stdexcept>
+#  include <cuda/__fwd/devices.h>
+#  include <cuda/std/__cstddef/types.h>
 #  include <cuda/std/span>
 #  include <vector>
@@ -33,129 +35,62 @@
 #  include <cuda/std/__cccl/prologue.h>
 _CCCL_BEGIN_NAMESPACE_CUDA
-namespace __detail
-{
-//! @brief A random-access range of all available CUDA devices
-class all_devices
-{
-public:
-  using size_type      = ::std::vector<physical_device>::size_type;
-  using iterator       = ::std::vector<physical_device>::const_iterator;
-  using const_iterator = ::std::vector<physical_device>::const_iterator;
-  all_devices() = default;
-  [[nodiscard]] const physical_device& operator[](size_type __i) const;
-  [[nodiscard]] size_type size() const;
-  [[nodiscard]] iterator begin() const noexcept;
-  [[nodiscard]] iterator end() const noexcept;
-  operator ::cuda::std::span<const device_ref>() const;
-private:
-  struct __initializer_iterator;
-  static const ::std::vector<physical_device>& __devices();
-};
-//! @brief An iterator used to in-place construct `device` objects in a
-//! std::vector.
-//!
-//! Since `device` objects are not movable or copyable, we need to construct them
-//! in-place with a proxy object that can be implicitly converted to a `device`
-//! object.
-struct all_devices::__initializer_iterator
+[[nodiscard]] _CCCL_HOST_API inline ::std::vector<device_ref> __make_devices() // builds the list of device_ref values
 {
-  using value_type        = __emplace_device;
-  using reference         = __emplace_device;
-  using iterator_category = ::std::forward_iterator_tag;
-  using difference_type   = int;
-  using pointer           = __emplace_device;
-  int __id_;
-  __emplace_device operator*() const noexcept
+  ::std::vector<device_ref> __ret{};
+  __ret.reserve(::cuda::__physical_devices().size()); // one slot per entry of ::cuda::__physical_devices()
+  for (::cuda::std::size_t __i = 0; __i < ::cuda::__physical_devices().size(); ++__i)
   {
-    return __emplace_device{__id_};
+    __ret.emplace_back(static_cast<int>(__i)); // each device_ref is constructed from its index converted to int
   }
+  return __ret;
+}
-  __emplace_device operator->() const noexcept
-  {
-    return __emplace_device{__id_};
-  }
+[[nodiscard]] inline ::cuda::std::span<const device_ref> __devices() // span over a lazily built, function-local list
+{ // NOTE(review): unlike __make_devices, this function carries no _CCCL_HOST_API annotation -- confirm intended
+  static const auto __devices = ::cuda::__make_devices(); // function-local static: initialized on the first call
+  return ::cuda::std::span<const device_ref>{__devices.data(), __devices.size()}; // views the static's storage
+}
-  __initializer_iterator& operator++() noexcept
-  {
-    ++__id_;
-    return *this;
-  }
+//! @brief A random-access range of all available CUDA devices
+class __all_devices // holds no data members; every accessor forwards to the span returned by ::cuda::__devices()
+{
+public:
+  using value_type = ::cuda::std::span<const device_ref>::value_type;
+  using size_type  = ::cuda::std::span<const device_ref>::size_type;
+  using iterator   = ::cuda::std::span<const device_ref>::iterator;
+  _CCCL_HIDE_FROM_ABI __all_devices()            = default;
+  __all_devices(const __all_devices&)            = delete; // copying and moving are disabled (this and next 3 lines)
+  __all_devices(__all_devices&&)                 = delete;
+  __all_devices& operator=(const __all_devices&) = delete;
+  __all_devices& operator=(__all_devices&&)      = delete;
-  __initializer_iterator operator++(int) noexcept
+  [[nodiscard]] _CCCL_HOST_API device_ref operator[](size_type __i) const // bounds-checked; returns by value
   {
-    auto __tmp = *this;
-    ++__id_;
-    return __tmp;
+    if (__i >= size())
+    {
+      ::cuda::std::__throw_out_of_range("device index out of range"); // reported for any __i >= size()
+    }
+    return ::cuda::__devices()[__i];
   }
-  bool operator==(const __initializer_iterator& __other) const noexcept
+  [[nodiscard]] _CCCL_HOST_API size_type size() const // number of elements in the ::cuda::__devices() span
   {
-    return __id_ == __other.__id_;
+    return ::cuda::__devices().size();
   }
-  bool operator!=(const __initializer_iterator& __other) const noexcept
+  [[nodiscard]] _CCCL_HOST_API iterator begin() const // iterator to the first element of the span
   {
-    return __id_ != __other.__id_;
+    return ::cuda::__devices().begin();
   }
-};
-[[nodiscard]] inline const physical_device& all_devices::operator[](size_type __id_) const
-{
-  if (__id_ >= size())
+  [[nodiscard]] _CCCL_HOST_API iterator end() const // past-the-end iterator of the span
   {
-    if (size() == 0)
-    {
-      ::cuda::std::__throw_out_of_range("device was requested but no CUDA devices found");
-    }
-    else
-    {
-      ::cuda::std::__throw_out_of_range(
-        (::std::string("device index out of range: ") + ::std::to_string(__id_)).c_str());
-    }
+    return ::cuda::__devices().end();
   }
-  return __devices()[__id_];
-}
-[[nodiscard]] inline all_devices::size_type all_devices::size() const
-{
-  return __devices().size();
-}
-[[nodiscard]] inline all_devices::iterator all_devices::begin() const noexcept
-{
-  return __devices().begin();
-}
-[[nodiscard]] inline all_devices::iterator all_devices::end() const noexcept
-{
-  return __devices().end();
-}
-inline all_devices::operator ::cuda::std::span<const device_ref>() const
-{
-  static const ::std::vector<device_ref> __refs(begin(), end());
-  return ::cuda::std::span<const device_ref>(__refs);
-}
-inline const ::std::vector<physical_device>& all_devices::__devices()
-{
-  static const ::std::vector<physical_device> __devices{
-    __initializer_iterator{0}, __initializer_iterator{::cuda::__driver::__deviceGetCount()}};
-  return __devices;
-}
-} // namespace __detail
+};
 //! @brief A range of all available CUDA devices
 //!
@@ -171,7 +106,7 @@ inline const ::std::vector<physical_device>& all_devices::__devices()
 //!   struct iterator;
 //!   using const_iterator = iterator;
 //!
-//!   [[nodiscard]] constexpr const physical_device& operator[](size_type i) const noexcept;
+//!   [[nodiscard]] device_ref operator[](size_type i) const;
 //!
 //!   [[nodiscard]] size_type size() const;
 //!
@@ -183,7 +118,7 @@ inline const ::std::vector<physical_device>& all_devices::__devices()
 //!
 //! @par
 //! `__all_devices::iterator` is a random access iterator with a `reference`
-//! type of `const physical_device&`.
+//! type of `const device_ref&`.
 //!
 //! @par Example
 //! @code
@@ -194,39 +129,7 @@ inline const ::std::vector<physical_device>& all_devices::__devices()
 //! @sa
 //! * device
 //! * device_ref
-inline constexpr __detail::all_devices devices{};
-inline const arch::traits_t& device_ref::arch_traits() const
-{
-  return devices[get()].arch_traits();
-}
-[[nodiscard]] inline ::std::vector<device_ref> device_ref::peer_devices() const
-{
-  ::std::vector<device_ref> __result;
-  __result.reserve(devices.size());
-  for (const physical_device& __other_dev : devices)
-  {
-    // Exclude the device this API is called on. The main use case for this API
-    // is enable/disable peer access. While enable peer access can be called on
-    // device on which memory resides, disable peer access will error-out.
-    // Usage of the peer access control is smoother when *this is excluded,
-    // while it can be easily added with .push_back() on the vector if a full
-    // group of peers is needed (for cases other than peer access control)
-    if (__other_dev != *this)
-    {
-      // While in almost all practical applications peer access should be symmetrical,
-      // it is possible to build a system with one directional peer access, check
-      // both ways here just to be safe
-      if (has_peer_access_to(__other_dev) && __other_dev.has_peer_access_to(*this))
-      {
-        __result.push_back(__other_dev);
-      }
-    }
-  }
-  return __result;
-}
+inline constexpr __all_devices devices{};
 _CCCL_END_NAMESPACE_CUDA