cuda-cccl 0.1.3.2.0.dev438__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.1__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry flags this version of cuda-cccl as possibly problematic; see the registry page for details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/util_device.cuh +51 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
- cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
- cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
- cuda/cccl/headers/include/cuda/__event/event.h +8 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +21 -70
- cuda/compute/__init__.py +77 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
- cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -24,9 +24,9 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)

 # include <cuda/__device/device_ref.h>
+# include <cuda/__driver/driver_api.h>
 # include <cuda/__runtime/ensure_current_context.h>
 # include <cuda/__stream/stream_ref.h> // IWYU pragma: export
-# include <cuda/std/__cuda/api_wrapper.h>

 # include <cuda/std/__cccl/prologue.h>

@@ -47,8 +47,7 @@ struct stream : stream_ref
 : stream_ref(__detail::__invalid_stream)
 {
 [[maybe_unused]] __ensure_current_context __ctx_setter(__dev);
-
-::cudaStreamCreateWithPriority, "Failed to create a stream", &__stream, cudaStreamNonBlocking, __priority);
+__stream = ::cuda::__driver::__streamCreateWithPriority(cudaStreamNonBlocking, __priority);
 }

 //! @brief Construct a new `stream` object into the moved-from state.

@@ -23,11 +23,12 @@

 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)

+# include <cuda/__device/device_ref.h>
+# include <cuda/__driver/driver_api.h>
 # include <cuda/__event/timed_event.h>
 # include <cuda/__fwd/get_stream.h>
 # include <cuda/__runtime/ensure_current_context.h>
 # include <cuda/__utility/no_init.h>
-# include <cuda/std/__cuda/api_wrapper.h>
 # include <cuda/std/__exception/cuda_error.h>
 # include <cuda/std/cstddef>

@@ -39,7 +40,7 @@ namespace __detail
 {
 // 0 is a valid stream in CUDA, so we need some other invalid stream representation
 // Can't make it constexpr, because cudaStream_t is a pointer type
-static const ::cudaStream_t __invalid_stream = reinterpret_cast
+static const ::cudaStream_t __invalid_stream = reinterpret_cast<::cudaStream_t>(~0ULL);
 } // namespace __detail

 //! @brief A type representing a stream ID.

@@ -238,11 +239,17 @@ public:
 //! @throws cuda_error if device check fails
 _CCCL_HOST_API device_ref device() const
 {
-
-
-
-
-
+::CUdevice __device{};
+# if _CCCL_CTK_AT_LEAST(13, 0)
+__device = ::cuda::__driver::__streamGetDevice(__stream);
+# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv
+{
+::CUcontext __stream_ctx = ::cuda::__driver::__streamGetCtx(__stream);
+__ensure_current_context __setter(__stream_ctx);
+__device = ::cuda::__driver::__ctxGetDevice();
+}
+# endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^
+return device_ref{::cuda::__driver::__cudevice_to_ordinal(__device)};
 }

 //! @brief Queries the \c stream_ref for itself. This makes \c stream_ref usable in places where we expect an
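The device() hunk above queries the stream's device directly on CTK 13.0+ and falls back to a context-based lookup on older toolkits. A minimal sketch of the same fallback written directly against the raw CUDA driver API (illustrative only: the helper name is hypothetical, error checking is omitted, and driver initialization is assumed; this is not cuda-cccl code):

    #include <cuda.h>

    // Hypothetical helper: returns the CUdevice a stream belongs to.
    CUdevice device_of(CUstream stream)
    {
      CUdevice dev{};
    #if CUDA_VERSION >= 13000
      cuStreamGetDevice(stream, &dev);   // direct query on newer toolkits
    #else
      CUcontext ctx{};
      cuStreamGetCtx(stream, &ctx);      // context the stream was created under
      cuCtxPushCurrent(ctx);             // make it current so the device can be read from it
      cuCtxGetDevice(&dev);
      cuCtxPopCurrent(&ctx);
    #endif
      return dev;                        // driver return codes are ignored in this sketch
    }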
@@ -262,21 +269,20 @@ inline void event_ref::record(stream_ref __stream) const
 }

 inline event::event(stream_ref __stream, event::flags __flags)
-: event(__stream, static_cast<unsigned
+: event(__stream, static_cast<unsigned>(__flags) | cudaEventDisableTiming)
 {
 record(__stream);
 }

-inline event::event(stream_ref __stream, unsigned
+inline event::event(stream_ref __stream, unsigned __flags)
 : event_ref(::cudaEvent_t{})
 {
 [[maybe_unused]] __ensure_current_context __ctx_setter(__stream);
-
-::cudaEventCreateWithFlags, "Failed to create CUDA event", &__event_, static_cast<unsigned int>(__flags));
+__event_ = ::cuda::__driver::__eventCreate(static_cast<unsigned>(__flags));
 }

 inline timed_event::timed_event(stream_ref __stream, event::flags __flags)
-: event(__stream, static_cast<unsigned
+: event(__stream, static_cast<unsigned>(__flags))
 {
 record(__stream);
 }

@@ -42,7 +42,7 @@ using __vtable_for _CCCL_NODEBUG_ALIAS = typename __overrides_for_t<_Interface>:
 //! __basic_vtable
 //!
 template <class _Interface, auto... _Mbrs>
-struct _CCCL_DECLSPEC_EMPTY_BASES __basic_vtable
+struct _CCCL_DECLSPEC_EMPTY_BASES _CCCL_TYPE_VISIBILITY_DEFAULT __basic_vtable
 : __rtti_base
 , __virtual_fn<_Mbrs>...
 {

@@ -105,7 +105,7 @@ struct _CCCL_DECLSPEC_EMPTY_BASES __basic_vtable
 //!

 template <class... _Interfaces>
-struct _CCCL_DECLSPEC_EMPTY_BASES __vtable_tuple
+struct _CCCL_DECLSPEC_EMPTY_BASES _CCCL_TYPE_VISIBILITY_DEFAULT __vtable_tuple
 : __rtti_ex<sizeof...(_Interfaces)>
 , __vtable_for<_Interfaces>...
 {

@@ -11,6 +11,16 @@
 #ifndef _CUDA_DEVICES
 #define _CUDA_DEVICES

+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+# pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+# pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+# pragma system_header
+#endif // no system header
+
 #include <cuda/__device/all_devices.h>
 #include <cuda/__device/arch_traits.h>
 #include <cuda/__device/attributes.h>

@@ -33,6 +33,7 @@
 #include <cuda/__iterator/transform_output_iterator.h>
 #include <cuda/__iterator/zip_function.h>
 #include <cuda/__iterator/zip_iterator.h>
+#include <cuda/__iterator/zip_transform_iterator.h>
 #include <cuda/std/iterator>

 #endif // _CUDA_ITERATOR

@@ -100,7 +100,14 @@ template <typename _Tp>
 template <typename _Tp>
 [[nodiscard]] _CCCL_HIDE_FROM_ABI _CCCL_DEVICE int __cccl_countl_zero_impl_device(_Tp __v) noexcept
 {
-
+if constexpr (sizeof(_Tp) == sizeof(uint32_t))
+{
+return static_cast<int>(::__clz(static_cast<int>(__v)));
+}
+else
+{
+return static_cast<int>(::__clzll(static_cast<long long>(__v)));
+}
 }
 #endif // _CCCL_CUDA_COMPILATION()

@@ -114,11 +114,11 @@ template <typename _Tp>
 {
 if constexpr (sizeof(_Tp) == sizeof(uint32_t))
 {
-return ::__clz(static_cast<int>(::__brev(__v)));
+return static_cast<int>(::__clz(static_cast<int>(::__brev(__v))));
 }
 else
 {
-return ::__clzll(static_cast<long long>(::__brevll(__v)));
+return static_cast<int>(::__clzll(static_cast<long long>(::__brevll(__v))));
 }
 }
 #endif // _CCCL_CUDA_COMPILATION()
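The count-trailing-zero hunk above relies on the identity that bit-reversing a value turns its trailing zeros into leading zeros, so a single __brev/__clz pair does the job. A minimal device-side sketch of that pairing (illustrative only; the function names are hypothetical and not cuda-cccl API):

    // Count trailing zeros of a 32-bit value with CUDA intrinsics.
    __device__ int trailing_zeros_u32(unsigned int v)
    {
      // __brev reverses the bit order, so counting leading zeros of the
      // reversed value counts the trailing zeros of the original one.
      return __clz(static_cast<int>(__brev(v)));
    }

    // Same idea for 64-bit values.
    __device__ int trailing_zeros_u64(unsigned long long v)
    {
      return __clzll(static_cast<long long>(__brevll(v)));
    }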
@@ -275,10 +275,10 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_aligned(
 // do first word
 if (__first.__ctz_ != 0)
 {
-unsigned
-difference_type __dn = ::cuda::std::min(static_cast<difference_type>(
+unsigned __clz_f = __bits_per_word - __first.__ctz_;
+difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
 __n -= __dn;
-__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (
+__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
 __storage_type __b = *__first.__seg_ & __m;
 *__result.__seg_ &= ~__m;
 *__result.__seg_ |= __b;

@@ -420,8 +420,8 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_backward_aligned(
 {
 difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__last.__ctz_), __n);
 __n -= __dn;
-unsigned
-__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >>
+unsigned __clz_f = __bits_per_word - __last.__ctz_;
+__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_f);
 __storage_type __b = *__last.__seg_ & __m;
 *__result.__seg_ &= ~__m;
 *__result.__seg_ |= __b;

@@ -635,10 +635,10 @@ _CCCL_API inline __bit_iterator<_Cr, false> __swap_ranges_aligned(
 // do first word
 if (__first.__ctz_ != 0)
 {
-unsigned
-difference_type __dn = ::cuda::std::min(static_cast<difference_type>(
+unsigned __clz_f = __bits_per_word - __first.__ctz_;
+difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
 __n -= __dn;
-__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (
+__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
 __storage_type __b1 = *__first.__seg_ & __m;
 *__first.__seg_ &= ~__m;
 __storage_type __b2 = *__result.__seg_ & __m;

@@ -988,10 +988,10 @@ _CCCL_API constexpr bool __equal_aligned(
 // do first word
 if (__first1.__ctz_ != 0)
 {
-unsigned
-difference_type __dn = ::cuda::std::min(static_cast<difference_type>(
+unsigned __clz_f = __bits_per_word - __first1.__ctz_;
+difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
 __n -= __dn;
-__storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (
+__storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
 if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m))
 {
 return false;

@@ -39,10 +39,10 @@
 #if _CCCL_HAS_PDL()
 // Waits for the previous kernel to complete (when it reaches its final membar). Should be put before the first global
 // memory access in a kernel.
-# define _CCCL_PDL_GRID_DEPENDENCY_SYNC() NV_IF_TARGET(NV_PROVIDES_SM_90, cudaGridDependencySynchronize();)
+# define _CCCL_PDL_GRID_DEPENDENCY_SYNC() NV_IF_TARGET(NV_PROVIDES_SM_90, ::cudaGridDependencySynchronize();)
 // Allows the subsequent kernel in the same stream to launch. Can be put anywhere in a kernel.
 // Heuristic(ahendriksen): put it after the last load.
-# define _CCCL_PDL_TRIGGER_NEXT_LAUNCH() NV_IF_TARGET(NV_PROVIDES_SM_90, cudaTriggerProgrammaticLaunchCompletion();)
+# define _CCCL_PDL_TRIGGER_NEXT_LAUNCH() NV_IF_TARGET(NV_PROVIDES_SM_90, ::cudaTriggerProgrammaticLaunchCompletion();)
 #else // _CCCL_HAS_PDL()
 # define _CCCL_PDL_GRID_DEPENDENCY_SYNC()
 # define _CCCL_PDL_TRIGGER_NEXT_LAUNCH()

@@ -107,6 +107,8 @@
 #define _CCCL_PP_FOR_EACH_7(_Mp, _1, _2, _3, _4, _5, _6, _7) _Mp(_1) _Mp(_2) _Mp(_3) _Mp(_4) _Mp(_5) _Mp(_6) _Mp(_7)
 #define _CCCL_PP_FOR_EACH_8(_Mp, _1, _2, _3, _4, _5, _6, _7, _8) \
 _Mp(_1) _Mp(_2) _Mp(_3) _Mp(_4) _Mp(_5) _Mp(_6) _Mp(_7) _Mp(_8)
+#define _CCCL_PP_FOR_EACH_9(_Mp, _1, _2, _3, _4, _5, _6, _7, _8, _9) \
+_Mp(_1) _Mp(_2) _Mp(_3) _Mp(_4) _Mp(_5) _Mp(_6) _Mp(_7) _Mp(_8) _Mp(_9)

 #define _CCCL_PP_PROBE_EMPTY_PROBE__CCCL_PP_PROBE_EMPTY _CCCL_PP_PROBE(~)

@@ -43,19 +43,19 @@ template <class _Rep, class _Period = ratio<1>>
 class _CCCL_TYPE_VISIBILITY_DEFAULT duration;

 template <class _Tp>
-inline
+inline constexpr bool __is_duration_v = false;

 template <class _Rep, class _Period>
-inline
+inline constexpr bool __is_duration_v<duration<_Rep, _Period>> = true;

 template <class _Rep, class _Period>
-inline
+inline constexpr bool __is_duration_v<const duration<_Rep, _Period>> = true;

 template <class _Rep, class _Period>
-inline
+inline constexpr bool __is_duration_v<volatile duration<_Rep, _Period>> = true;

 template <class _Rep, class _Period>
-inline
+inline constexpr bool __is_duration_v<const volatile duration<_Rep, _Period>> = true;

 } // namespace chrono

@@ -190,29 +190,29 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT duration
 struct __no_overflow
 {
 private:
-static
-static
-static
-static
-static
-static
-static
+static constexpr intmax_t __gcd_n1_n2 = __static_gcd<_R1::num, _R2::num>::value;
+static constexpr intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value;
+static constexpr intmax_t __n1 = _R1::num / __gcd_n1_n2;
+static constexpr intmax_t __d1 = _R1::den / __gcd_d1_d2;
+static constexpr intmax_t __n2 = _R2::num / __gcd_n1_n2;
+static constexpr intmax_t __d2 = _R2::den / __gcd_d1_d2;
+static constexpr intmax_t max = -((intmax_t(1) << (sizeof(intmax_t) * CHAR_BIT - 1)) + 1);

 template <intmax_t _Xp, intmax_t _Yp, bool __overflow>
 struct __mul // __overflow == false
 {
-static
+static constexpr intmax_t value = _Xp * _Yp;
 };

 template <intmax_t _Xp, intmax_t _Yp>
 struct __mul<_Xp, _Yp, true>
 {
-static
+static constexpr intmax_t value = 1;
 };

 public:
-static
-using type
+static constexpr bool value = (__n1 <= max / __d2) && (__n2 <= max / __d1);
+using type = ratio<__mul<__n1, __d2, !value>::value, __mul<__n2, __d1, !value>::value>;
 };

 public:
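The __no_overflow hunk above guards the ratio multiplication with the usual division trick: a product of two positive intmax_t factors fits exactly when the first factor is at most the maximum divided by the second, which is what `(__n1 <= max / __d2) && (__n2 <= max / __d1)` checks. A standalone sketch of that guard (illustrative only; mul_fits is a hypothetical name and the positivity assumption is mine, not taken from the header):

    #include <cstdint>

    // A positive product a * b stays within intmax_t exactly when a <= INTMAX_MAX / b.
    constexpr bool mul_fits(std::intmax_t a, std::intmax_t b)
    {
      return a <= INTMAX_MAX / b; // assumes a > 0 and b > 0
    }

    static_assert(mul_fits(1'000'000'000, 9));       // 9e9 fits in 64-bit intmax_t
    static_assert(!mul_fits(INTMAX_MAX / 2 + 1, 2)); // this product would overflow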
@@ -40,11 +40,11 @@ namespace chrono
 class _CCCL_TYPE_VISIBILITY_DEFAULT steady_clock
 {
 public:
-using duration
-using rep
-using period
-using time_point
-static constexpr
+using duration = nanoseconds;
+using rep = duration::rep;
+using period = duration::period;
+using time_point = ::cuda::std::chrono::time_point<steady_clock, duration>;
+static constexpr bool is_steady = true;

 [[nodiscard]] _CCCL_API static time_point now() noexcept;
 };

@@ -39,11 +39,11 @@ namespace chrono
 class _CCCL_TYPE_VISIBILITY_DEFAULT system_clock
 {
 public:
-using duration
-using rep
-using period
-using time_point
-static constexpr
+using duration = ::cuda::std::chrono::nanoseconds;
+using rep = duration::rep;
+using period = duration::period;
+using time_point = ::cuda::std::chrono::time_point<system_clock>;
+static constexpr bool is_steady = false;

 [[nodiscard]] _CCCL_API inline static time_point now() noexcept
 {

@@ -21,16 +21,15 @@
 # pragma system_header
 #endif // no system header

-#include <cuda/std/__bit/popcount.h>
 #include <cuda/std/__concepts/concept_macros.h>
 #include <cuda/std/__floating_point/fp.h>
 #include <cuda/std/__type_traits/is_constant_evaluated.h>
+#include <cuda/std/__type_traits/is_floating_point.h>
 #include <cuda/std/__type_traits/is_integral.h>

-
-#if _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG)
+#if _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG) || !_CCCL_CUDA_COMPILATION()
 # include <math.h>
-#endif // _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG)
+#endif // _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG) || !_CCCL_CUDA_COMPILATION()

 #include <cuda/std/__cccl/prologue.h>

@@ -158,10 +157,16 @@ template <class _Tp>
 #if _CCCL_HAS_FLOAT128()
 [[nodiscard]] _CCCL_API constexpr bool isnan(__float128 __x) noexcept
 {
+// __builtin_isnan is not efficient for __float128, prefer __nv_fp128_isnan at run-time
+if (!::cuda::std::__cccl_default_is_constant_evaluated())
+{
+NV_IF_TARGET(NV_PROVIDES_SM_100, (return ::__nv_fp128_isnan(__x);)) // preserve NaN behavior even with optimization
+// flags
+}
 # if defined(_CCCL_BUILTIN_ISNAN)
 return _CCCL_BUILTIN_ISNAN(__x);
 # else // ^^^ _CCCL_BUILTIN_ISNAN ^^^ / vvv !_CCCL_BUILTIN_ISNAN vvv
-return
+return __x != __x;
 # endif // ^^^ !_CCCL_BUILTIN_ISNAN ^^^
 }
 #endif // _CCCL_HAS_FLOAT128()

@@ -24,11 +24,11 @@
 #include <cuda/__type_traits/is_floating_point.h>
 #include <cuda/std/__cmath/isnan.h>
 #include <cuda/std/__concepts/concept_macros.h>
-#include <cuda/std/
+#include <cuda/std/__type_traits/conditional.h>
 #include <cuda/std/__type_traits/is_extended_arithmetic.h>
 #include <cuda/std/__type_traits/is_integral.h>
+#include <cuda/std/__type_traits/is_same.h>
 #include <cuda/std/__type_traits/promote.h>
-#include <cuda/std/limits>

 #include <nv/target>

@@ -36,6 +36,10 @@

 _CCCL_BEGIN_NAMESPACE_CUDA_STD

+/**********************************************************************************************************************
+* fmax
+**********************************************************************************************************************/
+
 // We do explicitly also enable GCC here, because that makes the condition below simpler
 #if _CCCL_CHECK_BUILTIN(builtin_fmax) || _CCCL_COMPILER(GCC)
 _CCCL_TEMPLATE(class _Tp)

@@ -63,13 +67,12 @@ _CCCL_REQUIRES(is_floating_point_v<_Tp>)
 # define _CCCL_USE_BUILTIN_FMAX() 0
 #endif // _CCCL_BUILTIN_FABSF

-// fmax
 _CCCL_TEMPLATE(class _Tp)
 _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
 [[nodiscard]] _CCCL_API constexpr conditional_t<is_integral_v<_Tp>, double, _Tp> fmax(_Tp __x, _Tp __y) noexcept
 {
 #if _CCCL_HAS_NVFP16()
-if constexpr (is_same_v<_Tp, __half>)
+if constexpr (is_same_v<_Tp, ::__half>)
 {
 # if _CCCL_CTK_AT_LEAST(12, 2)
 return ::__hmax(__x, __y);

@@ -82,7 +85,7 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
 else
 #endif // _CCCL_HAS_NVFP16()
 #if _CCCL_HAS_NVBF16()
-if constexpr (is_same_v<_Tp, __nv_bfloat16>)
+if constexpr (is_same_v<_Tp, ::__nv_bfloat16>)
 {
 # if _CCCL_CTK_AT_LEAST(12, 2)
 return ::__hmax(__x, __y);

@@ -100,17 +103,27 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
 }
 else
 {
-#if _CCCL_USE_BUILTIN_FMAX()
 if (!::cuda::std::__cccl_default_is_constant_evaluated())
 {
+#if _CCCL_HAS_FLOAT128()
+if constexpr (is_same_v<_Tp, __float128>)
+{
+NV_IF_TARGET(NV_PROVIDES_SM_100, (return ::__nv_fp128_fmax(__x, __y);))
+}
+else
+#endif // _CCCL_HAS_FLOAT128()
+#if _CCCL_USE_BUILTIN_FMAX()
+if constexpr (is_floating_point_v<_Tp>)
+{
 // GCC builtins do not treat NaN properly
 # if _CCCL_COMPILER(GCC)
-
+NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::std::__with_builtin_fmax(__x, __y);))
 # else // ^^^ _CCCL_COMPILER(GCC) ^^^ / vvv !_CCCL_COMPILER(GCC)
-
+return ::cuda::std::__with_builtin_fmax(__x, __y);
 # endif // !_CCCL_COMPILER(GCC)
-
+}
 #endif // _CCCL_USE_BUILTIN_FMAX
+}
 if (::cuda::std::isnan(__x))
 {
 return __y;

@@ -119,7 +132,10 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
 {
 return __x;
 }
-
+else
+{
+return __x < __y ? __y : __x;
+}
 }
 }

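The fallback branch in the fmax hunks above follows the usual IEEE-style rule: a NaN operand is dropped in favour of the other argument, and only a NaN-free comparison decides the result. A small host-side sketch of exactly that rule (illustrative only; fmax_fallback is a hypothetical name, not cuda-cccl API):

    #include <cassert>
    #include <cmath>

    template <class T>
    T fmax_fallback(T x, T y)
    {
      if (std::isnan(x)) { return y; } // drop a NaN on the left
      if (std::isnan(y)) { return x; } // drop a NaN on the right
      return x < y ? y : x;            // plain comparison once both operands are NaN-free
    }

    int main()
    {
      assert(fmax_fallback(1.0, std::nan("")) == 1.0);
      assert(fmax_fallback(std::nan(""), 2.0) == 2.0);
      assert(std::isnan(fmax_fallback(std::nan(""), std::nan(""))));
      assert(fmax_fallback(1.0, 2.0) == 2.0);
    }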
@@ -144,7 +160,9 @@ _CCCL_REQUIRES(::cuda::is_floating_point_v<_Tp> _CCCL_AND ::cuda::is_floating_po
 return ::cuda::std::fmax(static_cast<__result_type>(__x), static_cast<__result_type>(__y));
 }

-
+/**********************************************************************************************************************
+* fmin
+**********************************************************************************************************************/

 // We do explicitly also enable GCC here, because that makes the condition below simpler
 #if _CCCL_CHECK_BUILTIN(builtin_fmin) || _CCCL_COMPILER(GCC)

@@ -178,7 +196,7 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
 [[nodiscard]] _CCCL_API constexpr conditional_t<is_integral_v<_Tp>, double, _Tp> fmin(_Tp __x, _Tp __y) noexcept
 {
 #if _CCCL_HAS_NVFP16()
-if constexpr (is_same_v<_Tp, __half>)
+if constexpr (is_same_v<_Tp, ::__half>)
 {
 # if _CCCL_CTK_AT_LEAST(12, 2)
 return ::__hmin(__x, __y);

@@ -191,7 +209,7 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
 else
 #endif // _CCCL_HAS_NVFP16()
 #if _CCCL_HAS_NVBF16()
-if constexpr (is_same_v<_Tp, __nv_bfloat16>)
+if constexpr (is_same_v<_Tp, ::__nv_bfloat16>)
 {
 # if _CCCL_CTK_AT_LEAST(12, 2)
 return ::__hmin(__x, __y);

@@ -209,17 +227,26 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
 }
 else
 {
-#if _CCCL_USE_BUILTIN_FMAX()
 if (!::cuda::std::__cccl_default_is_constant_evaluated())
 {
+#if _CCCL_HAS_FLOAT128()
+if constexpr (is_same_v<_Tp, __float128>)
+{
+NV_IF_TARGET(NV_PROVIDES_SM_100, (return ::__nv_fp128_fmin(__x, __y);))
+}
+#endif // _CCCL_HAS_FLOAT128()
+#if _CCCL_USE_BUILTIN_FMAX()
+if constexpr (is_floating_point_v<_Tp>)
+{
 // GCC builtins do not treat NaN properly
 # if _CCCL_COMPILER(GCC)
-
+NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::std::__with_builtin_fmin(__x, __y);))
 # else // ^^^ _CCCL_COMPILER(GCC) ^^^ / vvv !_CCCL_COMPILER(GCC)
-
+return ::cuda::std::__with_builtin_fmin(__x, __y);
 # endif // !_CCCL_COMPILER(GCC)
-
+}
 #endif // _CCCL_USE_BUILTIN_FMAX
+}
 if (::cuda::std::isnan(__x))
 {
 return __y;

@@ -138,7 +138,7 @@ _CCCL_CONCEPT __nothrow_initializable_from =
 ? ::cuda::std::is_nothrow_constructible_v<_Tp, _Args...>
 : __nothrow_list_initializable_from<_Tp, _Args...>);

-#if !_CCCL_COMPILER(MSVC)
+#if !_CCCL_COMPILER(MSVC) && !_CCCL_CUDA_COMPILER(NVCC, <, 12, 9)

 //! Constructible with direct non-list initialization syntax from the result of
 //! a function call expression (often useful for immovable types).

@@ -23,18 +23,18 @@

 #include <cuda/std/__exception/cuda_error.h>

-#define _CCCL_TRY_CUDA_API(_NAME, _MSG, ...)
-do
-{
-const ::cudaError_t __status = _NAME(__VA_ARGS__);
-switch (__status)
-{
-case ::cudaSuccess:
-break;
-default:
-/* CUDA error state
-::cuda::__throw_cuda_error(__status, _MSG, #_NAME);
-}
+#define _CCCL_TRY_CUDA_API(_NAME, _MSG, ...) \
+do \
+{ \
+const ::cudaError_t __status = _NAME(__VA_ARGS__); \
+switch (__status) \
+{ \
+case ::cudaSuccess: \
+break; \
+default: \
+::cudaGetLastError(); /* clear CUDA error state */ \
+::cuda::__throw_cuda_error(__status, _MSG, #_NAME); \
+} \
 } while (0)

 #define _CCCL_ASSERT_CUDA_API(_NAME, _MSG, ...) \
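A hypothetical use of the reworked _CCCL_TRY_CUDA_API macro above (illustration only, not code from this package; it assumes the api_wrapper header, <cuda_runtime_api.h>, and <cstddef> are included): the wrapped runtime call runs once, and a non-success status clears the sticky error state and is rethrown as a cuda_error tagged with the API name and the supplied message.

    void* device_alloc(std::size_t bytes)
    {
      void* ptr = nullptr;
      // Expands to a do/while block that calls ::cudaMalloc(&ptr, bytes) and, on failure,
      // calls ::cudaGetLastError() and throws via ::cuda::__throw_cuda_error.
      _CCCL_TRY_CUDA_API(::cudaMalloc, "Failed to allocate device memory", &ptr, bytes);
      return ptr;
    }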
@@ -109,14 +109,7 @@ private:
 [[maybe_unused]] const char* __api = nullptr,
 [[maybe_unused]] ::cuda::std::source_location __loc = ::cuda::std::source_location::current())
 {
-
-NV_IF_ELSE_TARGET(NV_IS_HOST,
-(::cudaGetLastError(); // clear CUDA error state
-throw ::cuda::cuda_error(__status, __msg, __api, __loc);), //
-(::cuda::std::terminate();))
-#  else // ^^^ _CCCL_CUDA_COMPILATION() ^^^ / vvv !_CCCL_CUDA_COMPILATION() vvv
-throw ::cuda::cuda_error(__status, __msg, __api, __loc);
-#  endif // !_CCCL_CUDA_COMPILATION()
+NV_IF_TARGET(NV_IS_HOST, (throw ::cuda::cuda_error(__status, __msg, __api, __loc);), (::cuda::std::terminate();))
 }
 #else // ^^^ _CCCL_HAS_EXCEPTIONS() ^^^ / vvv !_CCCL_HAS_EXCEPTIONS() vvv
 class cuda_error
|
class cuda_error
|