cuda-cccl 0.1.3.2.0.dev438__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cuda-cccl has been flagged by the registry.
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +23 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +22 -14
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +321 -262
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +203 -51
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/util_device.cuh +51 -35
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +3 -3
- cuda/cccl/headers/include/cuda/__device/all_devices.h +3 -6
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +3 -3
- cuda/cccl/headers/include/cuda/__device/attributes.h +7 -7
- cuda/cccl/headers/include/cuda/__device/device_ref.h +3 -10
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +225 -33
- cuda/cccl/headers/include/cuda/__event/event.h +7 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +3 -4
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +4 -0
- cuda/cccl/parallel/experimental/_bindings.pyi +28 -0
- cuda/cccl/parallel/experimental/_bindings_impl.pyx +140 -0
- cuda/cccl/parallel/experimental/algorithms/__init__.py +4 -0
- cuda/cccl/parallel/experimental/algorithms/_reduce.py +0 -2
- cuda/cccl/parallel/experimental/algorithms/_scan.py +0 -2
- cuda/cccl/parallel/experimental/algorithms/_three_way_partition.py +261 -0
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/METADATA +1 -1
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/RECORD +59 -57
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -24,7 +24,6 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 
 # include <cuda/__driver/driver_api.h>
-# include <cuda/std/__cuda/api_wrapper.h>
 # include <cuda/std/cassert>
 # include <cuda/std/cstddef>
 # include <cuda/std/utility>
@@ -80,7 +79,7 @@ public:
   _CCCL_HOST_API void sync() const
   {
     _CCCL_ASSERT(__event_ != nullptr, "cuda::event_ref::sync no event set");
-
+    ::cuda::__driver::__eventSynchronize(__event_);
   }
 
   //! @brief Checks if all the work in the stream prior to the record of the event has completed.
@@ -91,12 +90,12 @@
   [[nodiscard]] _CCCL_HOST_API bool is_done() const
   {
     _CCCL_ASSERT(__event_ != nullptr, "cuda::event_ref::sync no event set");
-    cudaError_t __status = ::
-    if (__status == cudaSuccess)
+    ::cudaError_t __status = ::cuda::__driver::__eventQueryNoThrow(__event_);
+    if (__status == ::cudaSuccess)
     {
       return true;
     }
-    else if (__status == cudaErrorNotReady)
+    else if (__status == ::cudaErrorNotReady)
     {
       return false;
     }
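For reference, the pattern that is_done() wraps is a non-blocking event poll. A minimal standalone sketch against the public CUDA runtime API (the __driver wrappers above are CCCL-internal; cudaEventQuery is the documented equivalent):

#include <cuda_runtime.h>
#include <cstdio>

// Returns true once all work recorded before the event has completed,
// false while the GPU is still busy; any other status is a real error.
bool event_is_done(cudaEvent_t ev)
{
  const cudaError_t status = ::cudaEventQuery(ev);
  if (status == cudaSuccess)
  {
    return true;
  }
  if (status == cudaErrorNotReady)
  {
    return false;
  }
  std::fprintf(stderr, "cudaEventQuery failed: %s\n", ::cudaGetErrorString(status));
  return true; // treat hard errors as "done" so callers do not spin forever
}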
@@ -26,10 +26,10 @@
 
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 
+# include <cuda/__driver/driver_api.h>
 # include <cuda/__event/event.h>
 # include <cuda/__utility/no_init.h>
 # include <cuda/std/__chrono/duration.h>
-# include <cuda/std/__cuda/api_wrapper.h>
 # include <cuda/std/cstddef>
 
 # include <cuda/std/__cccl/prologue.h>
@@ -51,7 +51,7 @@ public:
   //!
   //! @throws cuda_error if the event creation fails.
   explicit timed_event(device_ref __device, flags __flags = flags::none)
-      : event(__device, static_cast<unsigned
+      : event(__device, static_cast<unsigned>(__flags))
   {}
 
   //! @brief Construct a new `timed_event` object into the moved-from state.
@@ -96,8 +96,7 @@ public:
   //! @note The elapsed time has a resolution of approximately 0.5 microseconds.
   [[nodiscard]] friend ::cuda::std::chrono::nanoseconds operator-(const timed_event& __end, const timed_event& __start)
   {
-
-    ::cuda::__driver::__eventElapsedTime(__start.get(), __end.get(), &__ms);
+    const auto __ms = ::cuda::__driver::__eventElapsedTime(__start.get(), __end.get());
     return ::cuda::std::chrono::nanoseconds(static_cast<::cuda::std::chrono::nanoseconds::rep>(__ms * 1'000'000.0));
   }
 
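The same millisecond-to-nanosecond conversion, sketched with the public runtime entry point (cudaEventElapsedTime reports a float in milliseconds, with the ~0.5 us resolution noted above; both events must have been recorded and completed):

#include <cuda_runtime.h>
#include <chrono>

std::chrono::nanoseconds elapsed(cudaEvent_t start, cudaEvent_t end)
{
  float ms = 0.0f;
  ::cudaEventElapsedTime(&ms, start, end); // error handling elided for brevity
  return std::chrono::nanoseconds(static_cast<std::chrono::nanoseconds::rep>(ms * 1'000'000.0));
}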
@@ -159,11 +159,11 @@ public:
   ::cuda::std::ranges::__movable_box<_OutputFn> __output_func_{};
 
   using iterator_concept = ::cuda::std::conditional_t<
-    ::cuda::std::
+    ::cuda::std::__has_random_access_traversal<_Iter>,
     ::cuda::std::random_access_iterator_tag,
-    ::cuda::std::conditional_t<::cuda::std::
+    ::cuda::std::conditional_t<::cuda::std::__has_bidirectional_traversal<_Iter>,
                                ::cuda::std::bidirectional_iterator_tag,
-                               ::cuda::std::conditional_t<::cuda::std::
+                               ::cuda::std::conditional_t<::cuda::std::__has_forward_traversal<_Iter>,
                                                           ::cuda::std::forward_iterator_tag,
                                                           ::cuda::std::output_iterator_tag>>>;
   using iterator_category = ::cuda::std::output_iterator_tag;
@@ -164,11 +164,11 @@ public:
   ::cuda::std::ranges::__movable_box<_Fn> __func_;
 
   using iterator_concept = ::cuda::std::conditional_t<
-    ::cuda::std::
+    ::cuda::std::__has_random_access_traversal<_Iter>,
     ::cuda::std::random_access_iterator_tag,
-    ::cuda::std::conditional_t<::cuda::std::
+    ::cuda::std::conditional_t<::cuda::std::__has_bidirectional_traversal<_Iter>,
                                ::cuda::std::bidirectional_iterator_tag,
-                               ::cuda::std::conditional_t<::cuda::std::
+                               ::cuda::std::conditional_t<::cuda::std::__has_forward_traversal<_Iter>,
                                                           ::cuda::std::forward_iterator_tag,
                                                           ::cuda::std::input_iterator_tag>>>;
   using value_type =
@@ -149,11 +149,11 @@ public:
   ::cuda::std::ranges::__movable_box<_Fn> __func_{};
 
   using iterator_concept = ::cuda::std::conditional_t<
-    ::cuda::std::
+    ::cuda::std::__has_random_access_traversal<_Iter>,
     ::cuda::std::random_access_iterator_tag,
-    ::cuda::std::conditional_t<::cuda::std::
+    ::cuda::std::conditional_t<::cuda::std::__has_bidirectional_traversal<_Iter>,
                                ::cuda::std::bidirectional_iterator_tag,
-                               ::cuda::std::conditional_t<::cuda::std::
+                               ::cuda::std::conditional_t<::cuda::std::__has_forward_traversal<_Iter>,
                                                           ::cuda::std::forward_iterator_tag,
                                                           ::cuda::std::output_iterator_tag>>>;
   using iterator_category = ::cuda::std::output_iterator_tag;
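The three iterator hunks above all make the same change: the truncated trait names are completed to CCCL's internal traversal traits, which drive a nested conditional_t that picks the strongest applicable iterator tag. The selection technique itself, sketched with standard C++20 concepts in place of the internal __has_*_traversal traits:

#include <iterator>
#include <type_traits>

template <class Iter>
using iterator_concept_for = std::conditional_t<
  std::random_access_iterator<Iter>,
  std::random_access_iterator_tag,
  std::conditional_t<std::bidirectional_iterator<Iter>,
                     std::bidirectional_iterator_tag,
                     std::conditional_t<std::forward_iterator<Iter>,
                                        std::forward_iterator_tag,
                                        std::input_iterator_tag>>>;

// Raw pointers model random access, so the strongest tag is chosen.
static_assert(std::is_same_v<iterator_concept_for<int*>, std::random_access_iterator_tag>);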
@@ -21,6 +21,7 @@
 # pragma system_header
 #endif // no system header
 
+#include <cuda/__driver/driver_api.h>
#include <cuda/__memory/address_space.h>
 #include <cuda/std/__concepts/concept_macros.h>
 #include <cuda/std/__cuda/api_wrapper.h>
@@ -107,10 +108,11 @@ class __host_accessor : public _Accessor
 #if _CCCL_HAS_CTK()
     if constexpr (::cuda::std::contiguous_iterator<__data_handle_type>)
     {
-      ::cudaPointerAttributes __ptr_attrib{};
       auto __p1 = ::cuda::std::to_address(__p);
-
-
+      ::CUmemorytype __type{};
+      const auto __status =
+        ::cuda::__driver::__pointerGetAttributeNoThrow<::CU_POINTER_ATTRIBUTE_MEMORY_TYPE>(__type, __p1);
+      return (__status != ::cudaSuccess) || __type == ::CU_MEMORYTYPE_HOST;
     }
     else
 #endif // _CCCL_HAS_CTK()
@@ -223,10 +225,11 @@ class __device_accessor : public _Accessor
 #if _CCCL_HAS_CTK()
     if constexpr (::cuda::std::contiguous_iterator<__data_handle_type>)
     {
-      ::cudaPointerAttributes __ptr_attrib{};
       auto __p1 = ::cuda::std::to_address(__p);
-
-
+      ::CUmemorytype __type{};
+      const auto __status =
+        ::cuda::__driver::__pointerGetAttributeNoThrow<::CU_POINTER_ATTRIBUTE_MEMORY_TYPE>(__type, __p1);
+      return (__status != ::cudaSuccess) || __type == ::CU_MEMORYTYPE_DEVICE;
     }
     else
 #endif // _CCCL_HAS_CTK()
@@ -352,10 +355,11 @@ class __managed_accessor : public _Accessor
 #if _CCCL_HAS_CTK()
     if constexpr (::cuda::std::contiguous_iterator<__data_handle_type>)
     {
-      ::
-
-
-
+      const auto __p1 = ::cuda::std::to_address(__p);
+      bool __is_managed{};
+      const auto __status =
+        ::cuda::__driver::__pointerGetAttributeNoThrow<::CU_POINTER_ATTRIBUTE_IS_MANAGED>(__is_managed, __p1);
+      return (__status != ::cudaSuccess) || __is_managed;
     }
     else
 #endif // _CCCL_HAS_CTK()
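All three accessor hunks replace a cudaPointerAttributes query with templated driver-API attribute lookups that do not throw. For orientation, an equivalent classification with the public runtime API (an illustrative sketch; the NoThrow driver wrappers above are CCCL-internal):

#include <cuda_runtime.h>

enum class ptr_kind { host, device, managed, unregistered };

ptr_kind classify(const void* p)
{
  cudaPointerAttributes attrib{};
  if (::cudaPointerGetAttributes(&attrib, p) != cudaSuccess)
  {
    ::cudaGetLastError(); // clear the sticky error, mirroring the NoThrow wrappers above
    return ptr_kind::unregistered;
  }
  switch (attrib.type)
  {
    case cudaMemoryTypeDevice:
      return ptr_kind::device;
    case cudaMemoryTypeManaged:
      return ptr_kind::managed;
    case cudaMemoryTypeHost:
      return ptr_kind::host;
    default:
      return ptr_kind::unregistered;
  }
}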
@@ -29,7 +29,7 @@ _CCCL_BEGIN_NAMESPACE_CUDA
 
 using memory_location = ::cudaMemLocation;
 # if _CCCL_CTK_AT_LEAST(12, 2)
-inline constexpr memory_location host_memory_location = {cudaMemLocationTypeHost, 0};
+inline constexpr memory_location host_memory_location = {::cudaMemLocationTypeHost, 0};
 # endif // _CCCL_CTK_AT_LEAST(12, 2)
 
 _CCCL_END_NAMESPACE_CUDA
@@ -24,9 +24,9 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 
 # include <cuda/__device/device_ref.h>
+# include <cuda/__driver/driver_api.h>
 # include <cuda/__runtime/ensure_current_context.h>
 # include <cuda/__stream/stream_ref.h> // IWYU pragma: export
-# include <cuda/std/__cuda/api_wrapper.h>
 
 # include <cuda/std/__cccl/prologue.h>
 
@@ -47,8 +47,7 @@ struct stream : stream_ref
       : stream_ref(__detail::__invalid_stream)
   {
     [[maybe_unused]] __ensure_current_context __ctx_setter(__dev);
-
-      ::cudaStreamCreateWithPriority, "Failed to create a stream", &__stream, cudaStreamNonBlocking, __priority);
+    __stream = ::cuda::__driver::__streamCreateWithPriority(cudaStreamNonBlocking, __priority);
   }
 
   //! @brief Construct a new `stream` object into the moved-from state.
@@ -23,11 +23,11 @@
 
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 
+# include <cuda/__driver/driver_api.h>
 # include <cuda/__event/timed_event.h>
 # include <cuda/__fwd/get_stream.h>
 # include <cuda/__runtime/ensure_current_context.h>
 # include <cuda/__utility/no_init.h>
-# include <cuda/std/__cuda/api_wrapper.h>
 # include <cuda/std/__exception/cuda_error.h>
 # include <cuda/std/cstddef>
 
@@ -39,7 +39,7 @@ namespace __detail
 {
 // 0 is a valid stream in CUDA, so we need some other invalid stream representation
 // Can't make it constexpr, because cudaStream_t is a pointer type
-static const ::cudaStream_t __invalid_stream = reinterpret_cast
+static const ::cudaStream_t __invalid_stream = reinterpret_cast<::cudaStream_t>(~0ULL);
 } // namespace __detail
 
 //! @brief A type representing a stream ID.
@@ -238,11 +238,17 @@ public:
   //! @throws cuda_error if device check fails
   _CCCL_HOST_API device_ref device() const
   {
-
-
-
-
-
+    ::CUdevice __device{};
+# if _CCCL_CTK_AT_LEAST(13, 0)
+    __device = ::cuda::__driver::__streamGetDevice(__stream);
+# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv
+    {
+      ::CUcontext __stream_ctx = ::cuda::__driver::__streamGetCtx(__stream);
+      __ensure_current_context __setter(__stream_ctx);
+      __device = ::cuda::__driver::__ctxGetDevice();
+    }
+# endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^
+    return device_ref{::cuda::__driver::__cudevice_to_ordinal(__device)};
   }
 
   //! @brief Queries the \c stream_ref for itself. This makes \c stream_ref usable in places where we expect an
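The pre-13.0 fallback resolves a stream's device by entering the stream's context and asking the driver which device backs it. A hedged standalone sketch of that fallback with raw driver-API calls (error checking elided):

#include <cuda.h>

CUdevice device_of_stream(CUstream stream)
{
  CUcontext ctx = nullptr;
  cuStreamGetCtx(stream, &ctx); // context the stream was created in
  cuCtxPushCurrent(ctx);        // temporarily make it current
  CUdevice dev = 0;
  cuCtxGetDevice(&dev);         // device backing the current context
  cuCtxPopCurrent(&ctx);        // restore the previous context
  return dev;
}

The newer driver entry point wrapped above as __streamGetDevice collapses this push/query/pop dance into a single call.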
@@ -262,21 +268,20 @@ inline void event_ref::record(stream_ref __stream) const
 }
 
 inline event::event(stream_ref __stream, event::flags __flags)
-    : event(__stream, static_cast<unsigned
+    : event(__stream, static_cast<unsigned>(__flags) | cudaEventDisableTiming)
 {
   record(__stream);
 }
 
-inline event::event(stream_ref __stream, unsigned
+inline event::event(stream_ref __stream, unsigned __flags)
     : event_ref(::cudaEvent_t{})
 {
   [[maybe_unused]] __ensure_current_context __ctx_setter(__stream);
-
-    ::cudaEventCreateWithFlags, "Failed to create CUDA event", &__event_, static_cast<unsigned int>(__flags));
+  __event_ = ::cuda::__driver::__eventCreate(static_cast<unsigned>(__flags));
 }
 
 inline timed_event::timed_event(stream_ref __stream, event::flags __flags)
-    : event(__stream, static_cast<unsigned
+    : event(__stream, static_cast<unsigned>(__flags))
 {
   record(__stream);
 }
@@ -42,7 +42,7 @@ using __vtable_for _CCCL_NODEBUG_ALIAS = typename __overrides_for_t<_Interface>:
 //! __basic_vtable
 //!
 template <class _Interface, auto... _Mbrs>
-struct _CCCL_DECLSPEC_EMPTY_BASES __basic_vtable
+struct _CCCL_DECLSPEC_EMPTY_BASES _CCCL_TYPE_VISIBILITY_DEFAULT __basic_vtable
     : __rtti_base
     , __virtual_fn<_Mbrs>...
 {
@@ -105,7 +105,7 @@ struct _CCCL_DECLSPEC_EMPTY_BASES __basic_vtable
 //!
 
 template <class... _Interfaces>
-struct _CCCL_DECLSPEC_EMPTY_BASES __vtable_tuple
+struct _CCCL_DECLSPEC_EMPTY_BASES _CCCL_TYPE_VISIBILITY_DEFAULT __vtable_tuple
     : __rtti_ex<sizeof...(_Interfaces)>
     , __vtable_for<_Interfaces>...
 {
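Both vtable types gain _CCCL_TYPE_VISIBILITY_DEFAULT, which on GCC/Clang typically expands to a default-visibility attribute so the types keep a single identity across shared-library boundaries even under -fvisibility=hidden. A minimal illustration of the underlying attribute (the macro name is CCCL's; this expansion is an assumption for illustration):

// With -fvisibility=hidden, a type's RTTI and vtable symbols become local to
// each DSO unless the type is explicitly re-exported like this:
struct __attribute__((visibility("default"))) exported_vtable_like
{
  virtual ~exported_vtable_like() = default;
};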
@@ -39,10 +39,10 @@
 #if _CCCL_HAS_PDL()
 // Waits for the previous kernel to complete (when it reaches its final membar). Should be put before the first global
 // memory access in a kernel.
-# define _CCCL_PDL_GRID_DEPENDENCY_SYNC() NV_IF_TARGET(NV_PROVIDES_SM_90, cudaGridDependencySynchronize();)
+# define _CCCL_PDL_GRID_DEPENDENCY_SYNC() NV_IF_TARGET(NV_PROVIDES_SM_90, ::cudaGridDependencySynchronize();)
 // Allows the subsequent kernel in the same stream to launch. Can be put anywhere in a kernel.
 // Heuristic(ahendriksen): put it after the last load.
-# define _CCCL_PDL_TRIGGER_NEXT_LAUNCH() NV_IF_TARGET(NV_PROVIDES_SM_90, cudaTriggerProgrammaticLaunchCompletion();)
+# define _CCCL_PDL_TRIGGER_NEXT_LAUNCH() NV_IF_TARGET(NV_PROVIDES_SM_90, ::cudaTriggerProgrammaticLaunchCompletion();)
 #else // _CCCL_HAS_PDL()
 # define _CCCL_PDL_GRID_DEPENDENCY_SYNC()
 # define _CCCL_PDL_TRIGGER_NEXT_LAUNCH()
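These macros wrap programmatic dependent launch (PDL) on sm_90 and newer. The usage pattern they enable, sketched as a plain kernel that must be compiled for sm_90+ (the names consumer and producer_output are illustrative; the following kernel in the stream must also be launched with the programmatic stream serialization launch attribute to actually overlap):

__global__ void consumer(const float* producer_output, float* out, int n)
{
  // Prologue work that does not touch the producer's output can run
  // while the producer kernel is still finishing.
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  cudaGridDependencySynchronize(); // wait until the producer's writes are visible
  if (i < n)
  {
    out[i] = producer_output[i] * 2.0f;
  }
  cudaTriggerProgrammaticLaunchCompletion(); // let the next kernel in the stream launch early
}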
@@ -107,6 +107,8 @@
 #define _CCCL_PP_FOR_EACH_7(_Mp, _1, _2, _3, _4, _5, _6, _7) _Mp(_1) _Mp(_2) _Mp(_3) _Mp(_4) _Mp(_5) _Mp(_6) _Mp(_7)
 #define _CCCL_PP_FOR_EACH_8(_Mp, _1, _2, _3, _4, _5, _6, _7, _8) \
   _Mp(_1) _Mp(_2) _Mp(_3) _Mp(_4) _Mp(_5) _Mp(_6) _Mp(_7) _Mp(_8)
+#define _CCCL_PP_FOR_EACH_9(_Mp, _1, _2, _3, _4, _5, _6, _7, _8, _9) \
+  _Mp(_1) _Mp(_2) _Mp(_3) _Mp(_4) _Mp(_5) _Mp(_6) _Mp(_7) _Mp(_8) _Mp(_9)
 
 #define _CCCL_PP_PROBE_EMPTY_PROBE__CCCL_PP_PROBE_EMPTY _CCCL_PP_PROBE(~)
 
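The _CCCL_PP_FOR_EACH_N family applies a macro to each argument; this change just extends the family to nine arguments. How such a fixed-arity FOR_EACH is used, in a self-contained sketch (MY_FOR_EACH_3 and DECLARE are illustrative stand-ins, not CCCL names):

#define DECLARE(name) int name;
#define MY_FOR_EACH_3(M, a, b, c) M(a) M(b) M(c)

MY_FOR_EACH_3(DECLARE, x, y, z) // expands to: int x; int y; int z;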
@@ -21,16 +21,15 @@
 # pragma system_header
 #endif // no system header
 
-#include <cuda/std/__bit/popcount.h>
 #include <cuda/std/__concepts/concept_macros.h>
 #include <cuda/std/__floating_point/fp.h>
 #include <cuda/std/__type_traits/is_constant_evaluated.h>
+#include <cuda/std/__type_traits/is_floating_point.h>
 #include <cuda/std/__type_traits/is_integral.h>
 
-
-#if _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG)
+#if _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG) || !_CCCL_CUDA_COMPILATION()
 # include <math.h>
-#endif // _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG)
+#endif // _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG) || !_CCCL_CUDA_COMPILATION()
 
 #include <cuda/std/__cccl/prologue.h>
 
@@ -158,10 +157,16 @@ template <class _Tp>
 #if _CCCL_HAS_FLOAT128()
 [[nodiscard]] _CCCL_API constexpr bool isnan(__float128 __x) noexcept
 {
+  // __builtin_isnan is not efficient for __float128, prefer __nv_fp128_isnan at run-time
+  if (!::cuda::std::__cccl_default_is_constant_evaluated())
+  {
+    NV_IF_TARGET(NV_PROVIDES_SM_100, (return ::__nv_fp128_isnan(__x);)) // preserve NaN behavior even with optimization
+    // flags
+  }
 # if defined(_CCCL_BUILTIN_ISNAN)
   return _CCCL_BUILTIN_ISNAN(__x);
 # else // ^^^ _CCCL_BUILTIN_ISNAN ^^^ / vvv !_CCCL_BUILTIN_ISNAN vvv
-  return
+  return __x != __x;
 # endif // ^^^ !_CCCL_BUILTIN_ISNAN ^^^
 }
 #endif // _CCCL_HAS_FLOAT128()
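The `__x != __x` fallback relies on the IEEE-754 rule that NaN is the only value that compares unequal to itself, which is exactly what aggressive floating-point optimization flags can break; hence the run-time preference for __nv_fp128_isnan above. A tiny portable illustration:

#include <cassert>
#include <cmath>

int main()
{
  const double nan = std::nan("");
  assert(nan != nan);    // only NaN fails self-equality
  assert(!(1.0 != 1.0)); // ordinary values compare equal to themselves
  return 0;
}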
@@ -24,11 +24,11 @@
 #include <cuda/__type_traits/is_floating_point.h>
 #include <cuda/std/__cmath/isnan.h>
 #include <cuda/std/__concepts/concept_macros.h>
-#include <cuda/std/
+#include <cuda/std/__type_traits/conditional.h>
 #include <cuda/std/__type_traits/is_extended_arithmetic.h>
 #include <cuda/std/__type_traits/is_integral.h>
+#include <cuda/std/__type_traits/is_same.h>
 #include <cuda/std/__type_traits/promote.h>
-#include <cuda/std/limits>
 
 #include <nv/target>
 
@@ -36,6 +36,10 @@
 
 _CCCL_BEGIN_NAMESPACE_CUDA_STD
 
+/***********************************************************************************************************************
+ * fmax
+ **********************************************************************************************************************/
+
 // We do explicitly also enable GCC here, because that makes the condition below simpler
 #if _CCCL_CHECK_BUILTIN(builtin_fmax) || _CCCL_COMPILER(GCC)
 _CCCL_TEMPLATE(class _Tp)
@@ -63,13 +67,12 @@ _CCCL_REQUIRES(is_floating_point_v<_Tp>)
 # define _CCCL_USE_BUILTIN_FMAX() 0
 #endif // _CCCL_BUILTIN_FABSF
 
-// fmax
 _CCCL_TEMPLATE(class _Tp)
 _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
 [[nodiscard]] _CCCL_API constexpr conditional_t<is_integral_v<_Tp>, double, _Tp> fmax(_Tp __x, _Tp __y) noexcept
 {
 #if _CCCL_HAS_NVFP16()
-  if constexpr (is_same_v<_Tp, __half>)
+  if constexpr (is_same_v<_Tp, ::__half>)
   {
 # if _CCCL_CTK_AT_LEAST(12, 2)
     return ::__hmax(__x, __y);
@@ -82,7 +85,7 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
   else
 #endif // _CCCL_HAS_NVFP16()
 #if _CCCL_HAS_NVBF16()
-  if constexpr (is_same_v<_Tp, __nv_bfloat16>)
+  if constexpr (is_same_v<_Tp, ::__nv_bfloat16>)
   {
 # if _CCCL_CTK_AT_LEAST(12, 2)
     return ::__hmax(__x, __y);
@@ -100,17 +103,27 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
   }
   else
   {
-#if _CCCL_USE_BUILTIN_FMAX()
     if (!::cuda::std::__cccl_default_is_constant_evaluated())
     {
+#if _CCCL_HAS_FLOAT128()
+      if constexpr (is_same_v<_Tp, __float128>)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_100, (return ::__nv_fp128_fmax(__x, __y);))
+      }
+      else
+#endif // _CCCL_HAS_FLOAT128()
+#if _CCCL_USE_BUILTIN_FMAX()
+      if constexpr (is_floating_point_v<_Tp>)
+      {
       // GCC builtins do not treat NaN properly
 # if _CCCL_COMPILER(GCC)
-
+        NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::std::__with_builtin_fmax(__x, __y);))
 # else // ^^^ _CCCL_COMPILER(GCC) ^^^ / vvv !_CCCL_COMPILER(GCC)
-
+        return ::cuda::std::__with_builtin_fmax(__x, __y);
 # endif // !_CCCL_COMPILER(GCC)
-
+      }
 #endif // _CCCL_USE_BUILTIN_FMAX
+    }
     if (::cuda::std::isnan(__x))
     {
       return __y;
@@ -119,7 +132,10 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
     {
       return __x;
     }
-
+    else
+    {
+      return __x < __y ? __y : __x;
+    }
   }
 }
 
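The tail of fmax now spells out the IEEE semantics explicitly: if either operand is NaN the other one is returned, otherwise the larger operand. A freestanding sketch of the same logic:

#include <cmath>

// IEEE-754 fmax semantics: NaN loses to any ordinary number.
double my_fmax(double x, double y)
{
  if (std::isnan(x))
  {
    return y;
  }
  else if (std::isnan(y))
  {
    return x;
  }
  else
  {
    return x < y ? y : x;
  }
}
// my_fmax(NAN, 2.0) == 2.0, my_fmax(3.0, NAN) == 3.0, my_fmax(1.0, 2.0) == 2.0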
@@ -144,7 +160,9 @@ _CCCL_REQUIRES(::cuda::is_floating_point_v<_Tp> _CCCL_AND ::cuda::is_floating_po
   return ::cuda::std::fmax(static_cast<__result_type>(__x), static_cast<__result_type>(__y));
 }
 
-
+/***********************************************************************************************************************
+ * fmin
+ **********************************************************************************************************************/
 
 // We do explicitly also enable GCC here, because that makes the condition below simpler
 #if _CCCL_CHECK_BUILTIN(builtin_fmin) || _CCCL_COMPILER(GCC)
@@ -178,7 +196,7 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
 [[nodiscard]] _CCCL_API constexpr conditional_t<is_integral_v<_Tp>, double, _Tp> fmin(_Tp __x, _Tp __y) noexcept
 {
 #if _CCCL_HAS_NVFP16()
-  if constexpr (is_same_v<_Tp, __half>)
+  if constexpr (is_same_v<_Tp, ::__half>)
   {
 # if _CCCL_CTK_AT_LEAST(12, 2)
     return ::__hmin(__x, __y);
@@ -191,7 +209,7 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
   else
 #endif // _CCCL_HAS_NVFP16()
 #if _CCCL_HAS_NVBF16()
-  if constexpr (is_same_v<_Tp, __nv_bfloat16>)
+  if constexpr (is_same_v<_Tp, ::__nv_bfloat16>)
   {
 # if _CCCL_CTK_AT_LEAST(12, 2)
     return ::__hmin(__x, __y);
@@ -209,17 +227,26 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
   }
   else
   {
-#if _CCCL_USE_BUILTIN_FMAX()
     if (!::cuda::std::__cccl_default_is_constant_evaluated())
     {
+#if _CCCL_HAS_FLOAT128()
+      if constexpr (is_same_v<_Tp, __float128>)
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_100, (return ::__nv_fp128_fmin(__x, __y);))
+      }
+#endif // _CCCL_HAS_FLOAT128()
+#if _CCCL_USE_BUILTIN_FMAX()
+      if constexpr (is_floating_point_v<_Tp>)
+      {
       // GCC builtins do not treat NaN properly
 # if _CCCL_COMPILER(GCC)
-
+        NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::std::__with_builtin_fmin(__x, __y);))
 # else // ^^^ _CCCL_COMPILER(GCC) ^^^ / vvv !_CCCL_COMPILER(GCC)
-
+        return ::cuda::std::__with_builtin_fmin(__x, __y);
 # endif // !_CCCL_COMPILER(GCC)
-
+      }
 #endif // _CCCL_USE_BUILTIN_FMAX
+    }
     if (::cuda::std::isnan(__x))
     {
       return __y;
@@ -138,7 +138,7 @@ _CCCL_CONCEPT __nothrow_initializable_from =
     ? ::cuda::std::is_nothrow_constructible_v<_Tp, _Args...>
     : __nothrow_list_initializable_from<_Tp, _Args...>);
 
-#if !_CCCL_COMPILER(MSVC)
+#if !_CCCL_COMPILER(MSVC) && !_CCCL_CUDA_COMPILER(NVCC, <, 12, 9)
 
 //! Constructible with direct non-list initialization syntax from the result of
 //! a function call expression (often useful for immovable types).
@@ -23,18 +23,18 @@
 
 #include <cuda/std/__exception/cuda_error.h>
 
-#define _CCCL_TRY_CUDA_API(_NAME, _MSG, ...)
-  do
-  {
-    const ::cudaError_t __status = _NAME(__VA_ARGS__);
-    switch (__status)
-    {
-      case ::cudaSuccess:
-        break;
-      default:
-        /* CUDA error state
-        ::cuda::__throw_cuda_error(__status, _MSG, #_NAME);
-    }
+#define _CCCL_TRY_CUDA_API(_NAME, _MSG, ...)                \
+  do                                                        \
+  {                                                         \
+    const ::cudaError_t __status = _NAME(__VA_ARGS__);      \
+    switch (__status)                                       \
+    {                                                       \
+      case ::cudaSuccess:                                   \
+        break;                                              \
+      default:                                              \
+        ::cudaGetLastError(); /* clear CUDA error state */  \
+        ::cuda::__throw_cuda_error(__status, _MSG, #_NAME); \
+    }                                                       \
   } while (0)
 
 #define _CCCL_ASSERT_CUDA_API(_NAME, _MSG, ...) \
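This is the standard throw-on-error wrapper pattern for runtime API calls; the new version clears CUDA's sticky error state before throwing. A hedged usage sketch, assuming the header above is included (the call site is illustrative, not CCCL code):

#include <cstddef>
#include <cuda_runtime.h>

void* allocate_device(std::size_t bytes)
{
  void* ptr = nullptr;
  // On any status other than cudaSuccess the macro clears the sticky error
  // state and throws cuda::cuda_error carrying the message and the API name.
  _CCCL_TRY_CUDA_API(::cudaMalloc, "Failed to allocate device memory", &ptr, bytes);
  return ptr;
}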
@@ -109,14 +109,7 @@ private:
     [[maybe_unused]] const char* __api = nullptr,
     [[maybe_unused]] ::cuda::std::source_location __loc = ::cuda::std::source_location::current())
   {
-
-    NV_IF_ELSE_TARGET(NV_IS_HOST,
-                      (::cudaGetLastError(); // clear CUDA error state
-                       throw ::cuda::cuda_error(__status, __msg, __api, __loc);), //
-                      (::cuda::std::terminate();))
-# else // ^^^ _CCCL_CUDA_COMPILATION() ^^^ / vvv !_CCCL_CUDA_COMPILATION() vvv
-    throw ::cuda::cuda_error(__status, __msg, __api, __loc);
-# endif // !_CCCL_CUDA_COMPILATION()
+    NV_IF_TARGET(NV_IS_HOST, (throw ::cuda::cuda_error(__status, __msg, __api, __loc);), (::cuda::std::terminate();))
   }
 #else // ^^^ _CCCL_HAS_EXCEPTIONS() ^^^ / vvv !_CCCL_HAS_EXCEPTIONS() vvv
 class cuda_error
|