cuda-cccl 0.1.3.2.0.dev438__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.0__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic; see the package registry's advisory page for more details.

Files changed (60)
  1. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +23 -0
  2. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +22 -14
  3. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  4. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
  5. cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
  6. cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
  7. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  8. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +321 -262
  9. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +8 -0
  10. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
  11. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
  12. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +203 -51
  13. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
  14. cuda/cccl/headers/include/cub/util_device.cuh +51 -35
  15. cuda/cccl/headers/include/cuda/__algorithm/copy.h +3 -3
  16. cuda/cccl/headers/include/cuda/__device/all_devices.h +3 -6
  17. cuda/cccl/headers/include/cuda/__device/arch_traits.h +3 -3
  18. cuda/cccl/headers/include/cuda/__device/attributes.h +7 -7
  19. cuda/cccl/headers/include/cuda/__device/device_ref.h +3 -10
  20. cuda/cccl/headers/include/cuda/__driver/driver_api.h +225 -33
  21. cuda/cccl/headers/include/cuda/__event/event.h +7 -8
  22. cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
  23. cuda/cccl/headers/include/cuda/__event/timed_event.h +3 -4
  24. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
  25. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
  26. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
  27. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
  28. cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
  29. cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
  30. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -12
  31. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
  32. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
  33. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
  34. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
  35. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
  36. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
  37. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
  38. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
  39. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
  40. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
  41. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
  42. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
  43. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
  44. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
  45. cuda/cccl/parallel/experimental/__init__.py +4 -0
  46. cuda/cccl/parallel/experimental/_bindings.pyi +28 -0
  47. cuda/cccl/parallel/experimental/_bindings_impl.pyx +140 -0
  48. cuda/cccl/parallel/experimental/algorithms/__init__.py +4 -0
  49. cuda/cccl/parallel/experimental/algorithms/_reduce.py +0 -2
  50. cuda/cccl/parallel/experimental/algorithms/_scan.py +0 -2
  51. cuda/cccl/parallel/experimental/algorithms/_three_way_partition.py +261 -0
  52. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  53. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  54. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  55. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  56. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/METADATA +1 -1
  57. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/RECORD +59 -57
  58. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
  59. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/WHEEL +0 -0
  60. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -24,7 +24,6 @@
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
25
 
26
26
  # include <cuda/__driver/driver_api.h>
27
- # include <cuda/std/__cuda/api_wrapper.h>
28
27
  # include <cuda/std/cassert>
29
28
  # include <cuda/std/cstddef>
30
29
  # include <cuda/std/utility>
@@ -80,7 +79,7 @@ public:
80
79
  _CCCL_HOST_API void sync() const
81
80
  {
82
81
  _CCCL_ASSERT(__event_ != nullptr, "cuda::event_ref::sync no event set");
83
- _CCCL_TRY_CUDA_API(::cudaEventSynchronize, "Failed to wait for CUDA event", __event_);
82
+ ::cuda::__driver::__eventSynchronize(__event_);
84
83
  }
85
84
 
86
85
  //! @brief Checks if all the work in the stream prior to the record of the event has completed.
@@ -91,12 +90,12 @@ public:
91
90
  [[nodiscard]] _CCCL_HOST_API bool is_done() const
92
91
  {
93
92
  _CCCL_ASSERT(__event_ != nullptr, "cuda::event_ref::sync no event set");
94
- cudaError_t __status = ::cudaEventQuery(__event_);
95
- if (__status == cudaSuccess)
93
+ ::cudaError_t __status = ::cuda::__driver::__eventQueryNoThrow(__event_);
94
+ if (__status == ::cudaSuccess)
96
95
  {
97
96
  return true;
98
97
  }
99
- else if (__status == cudaErrorNotReady)
98
+ else if (__status == ::cudaErrorNotReady)
100
99
  {
101
100
  return false;
102
101
  }
@@ -26,10 +26,10 @@
26
26
 
27
27
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
28
28
 
29
+ # include <cuda/__driver/driver_api.h>
29
30
  # include <cuda/__event/event.h>
30
31
  # include <cuda/__utility/no_init.h>
31
32
  # include <cuda/std/__chrono/duration.h>
32
- # include <cuda/std/__cuda/api_wrapper.h>
33
33
  # include <cuda/std/cstddef>
34
34
 
35
35
  # include <cuda/std/__cccl/prologue.h>
@@ -51,7 +51,7 @@ public:
51
51
  //!
52
52
  //! @throws cuda_error if the event creation fails.
53
53
  explicit timed_event(device_ref __device, flags __flags = flags::none)
54
- : event(__device, static_cast<unsigned int>(__flags))
54
+ : event(__device, static_cast<unsigned>(__flags))
55
55
  {}
56
56
 
57
57
  //! @brief Construct a new `timed_event` object into the moved-from state.
@@ -96,8 +96,7 @@ public:
96
96
  //! @note The elapsed time has a resolution of approximately 0.5 microseconds.
97
97
  [[nodiscard]] friend ::cuda::std::chrono::nanoseconds operator-(const timed_event& __end, const timed_event& __start)
98
98
  {
99
- float __ms = 0.0f;
100
- ::cuda::__driver::__eventElapsedTime(__start.get(), __end.get(), &__ms);
99
+ const auto __ms = ::cuda::__driver::__eventElapsedTime(__start.get(), __end.get());
101
100
  return ::cuda::std::chrono::nanoseconds(static_cast<::cuda::std::chrono::nanoseconds::rep>(__ms * 1'000'000.0));
102
101
  }
103
102
 
@@ -159,11 +159,11 @@ public:
159
159
  ::cuda::std::ranges::__movable_box<_OutputFn> __output_func_{};
160
160
 
161
161
  using iterator_concept = ::cuda::std::conditional_t<
162
- ::cuda::std::random_access_iterator<_Iter>,
162
+ ::cuda::std::__has_random_access_traversal<_Iter>,
163
163
  ::cuda::std::random_access_iterator_tag,
164
- ::cuda::std::conditional_t<::cuda::std::bidirectional_iterator<_Iter>,
164
+ ::cuda::std::conditional_t<::cuda::std::__has_bidirectional_traversal<_Iter>,
165
165
  ::cuda::std::bidirectional_iterator_tag,
166
- ::cuda::std::conditional_t<::cuda::std::forward_iterator<_Iter>,
166
+ ::cuda::std::conditional_t<::cuda::std::__has_forward_traversal<_Iter>,
167
167
  ::cuda::std::forward_iterator_tag,
168
168
  ::cuda::std::output_iterator_tag>>>;
169
169
  using iterator_category = ::cuda::std::output_iterator_tag;
@@ -164,11 +164,11 @@ public:
164
164
  ::cuda::std::ranges::__movable_box<_Fn> __func_;
165
165
 
166
166
  using iterator_concept = ::cuda::std::conditional_t<
167
- ::cuda::std::random_access_iterator<_Iter>,
167
+ ::cuda::std::__has_random_access_traversal<_Iter>,
168
168
  ::cuda::std::random_access_iterator_tag,
169
- ::cuda::std::conditional_t<::cuda::std::bidirectional_iterator<_Iter>,
169
+ ::cuda::std::conditional_t<::cuda::std::__has_bidirectional_traversal<_Iter>,
170
170
  ::cuda::std::bidirectional_iterator_tag,
171
- ::cuda::std::conditional_t<::cuda::std::forward_iterator<_Iter>,
171
+ ::cuda::std::conditional_t<::cuda::std::__has_forward_traversal<_Iter>,
172
172
  ::cuda::std::forward_iterator_tag,
173
173
  ::cuda::std::input_iterator_tag>>>;
174
174
  using value_type =
@@ -149,11 +149,11 @@ public:
149
149
  ::cuda::std::ranges::__movable_box<_Fn> __func_{};
150
150
 
151
151
  using iterator_concept = ::cuda::std::conditional_t<
152
- ::cuda::std::random_access_iterator<_Iter>,
152
+ ::cuda::std::__has_random_access_traversal<_Iter>,
153
153
  ::cuda::std::random_access_iterator_tag,
154
- ::cuda::std::conditional_t<::cuda::std::bidirectional_iterator<_Iter>,
154
+ ::cuda::std::conditional_t<::cuda::std::__has_bidirectional_traversal<_Iter>,
155
155
  ::cuda::std::bidirectional_iterator_tag,
156
- ::cuda::std::conditional_t<::cuda::std::forward_iterator<_Iter>,
156
+ ::cuda::std::conditional_t<::cuda::std::__has_forward_traversal<_Iter>,
157
157
  ::cuda::std::forward_iterator_tag,
158
158
  ::cuda::std::output_iterator_tag>>>;
159
159
  using iterator_category = ::cuda::std::output_iterator_tag;
@@ -21,6 +21,7 @@
21
21
  # pragma system_header
22
22
  #endif // no system header
23
23
 
24
+ #include <cuda/__driver/driver_api.h>
24
25
  #include <cuda/__memory/address_space.h>
25
26
  #include <cuda/std/__concepts/concept_macros.h>
26
27
  #include <cuda/std/__cuda/api_wrapper.h>
@@ -107,10 +108,11 @@ class __host_accessor : public _Accessor
107
108
  #if _CCCL_HAS_CTK()
108
109
  if constexpr (::cuda::std::contiguous_iterator<__data_handle_type>)
109
110
  {
110
- ::cudaPointerAttributes __ptr_attrib{};
111
111
  auto __p1 = ::cuda::std::to_address(__p);
112
- _CCCL_ASSERT_CUDA_API(::cudaPointerGetAttributes, "cudaPointerGetAttributes failed", &__ptr_attrib, __p1);
113
- return __ptr_attrib.hostPointer != nullptr || __ptr_attrib.type == ::cudaMemoryTypeUnregistered;
112
+ ::CUmemorytype __type{};
113
+ const auto __status =
114
+ ::cuda::__driver::__pointerGetAttributeNoThrow<::CU_POINTER_ATTRIBUTE_MEMORY_TYPE>(__type, __p1);
115
+ return (__status != ::cudaSuccess) || __type == ::CU_MEMORYTYPE_HOST;
114
116
  }
115
117
  else
116
118
  #endif // _CCCL_HAS_CTK()
@@ -223,10 +225,11 @@ class __device_accessor : public _Accessor
223
225
  #if _CCCL_HAS_CTK()
224
226
  if constexpr (::cuda::std::contiguous_iterator<__data_handle_type>)
225
227
  {
226
- ::cudaPointerAttributes __ptr_attrib{};
227
228
  auto __p1 = ::cuda::std::to_address(__p);
228
- _CCCL_ASSERT_CUDA_API(::cudaPointerGetAttributes, "cudaPointerGetAttributes failed", &__ptr_attrib, __p1);
229
- return __ptr_attrib.devicePointer != nullptr || __ptr_attrib.type == ::cudaMemoryTypeUnregistered;
229
+ ::CUmemorytype __type{};
230
+ const auto __status =
231
+ ::cuda::__driver::__pointerGetAttributeNoThrow<::CU_POINTER_ATTRIBUTE_MEMORY_TYPE>(__type, __p1);
232
+ return (__status != ::cudaSuccess) || __type == ::CU_MEMORYTYPE_DEVICE;
230
233
  }
231
234
  else
232
235
  #endif // _CCCL_HAS_CTK()
@@ -352,10 +355,11 @@ class __managed_accessor : public _Accessor
352
355
  #if _CCCL_HAS_CTK()
353
356
  if constexpr (::cuda::std::contiguous_iterator<__data_handle_type>)
354
357
  {
355
- ::cudaPointerAttributes __ptr_attrib{};
356
- auto __p1 = ::cuda::std::to_address(__p);
357
- _CCCL_ASSERT_CUDA_API(::cudaPointerGetAttributes, "cudaPointerGetAttributes failed", &__ptr_attrib, __p1);
358
- return __ptr_attrib.devicePointer != nullptr && __ptr_attrib.hostPointer == __ptr_attrib.devicePointer;
358
+ const auto __p1 = ::cuda::std::to_address(__p);
359
+ bool __is_managed{};
360
+ const auto __status =
361
+ ::cuda::__driver::__pointerGetAttributeNoThrow<::CU_POINTER_ATTRIBUTE_IS_MANAGED>(__is_managed, __p1);
362
+ return (__status != ::cudaSuccess) || __is_managed;
359
363
  }
360
364
  else
361
365
  #endif // _CCCL_HAS_CTK()
@@ -29,7 +29,7 @@ _CCCL_BEGIN_NAMESPACE_CUDA
29
29
 
30
30
  using memory_location = ::cudaMemLocation;
31
31
  # if _CCCL_CTK_AT_LEAST(12, 2)
32
- inline constexpr memory_location host_memory_location = {cudaMemLocationTypeHost, 0};
32
+ inline constexpr memory_location host_memory_location = {::cudaMemLocationTypeHost, 0};
33
33
  # endif // _CCCL_CTK_AT_LEAST(12, 2)
34
34
 
35
35
  _CCCL_END_NAMESPACE_CUDA
@@ -24,9 +24,9 @@
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
25
 
26
26
  # include <cuda/__device/device_ref.h>
27
+ # include <cuda/__driver/driver_api.h>
27
28
  # include <cuda/__runtime/ensure_current_context.h>
28
29
  # include <cuda/__stream/stream_ref.h> // IWYU pragma: export
29
- # include <cuda/std/__cuda/api_wrapper.h>
30
30
 
31
31
  # include <cuda/std/__cccl/prologue.h>
32
32
 
@@ -47,8 +47,7 @@ struct stream : stream_ref
47
47
  : stream_ref(__detail::__invalid_stream)
48
48
  {
49
49
  [[maybe_unused]] __ensure_current_context __ctx_setter(__dev);
50
- _CCCL_TRY_CUDA_API(
51
- ::cudaStreamCreateWithPriority, "Failed to create a stream", &__stream, cudaStreamNonBlocking, __priority);
50
+ __stream = ::cuda::__driver::__streamCreateWithPriority(cudaStreamNonBlocking, __priority);
52
51
  }
53
52
 
54
53
  //! @brief Construct a new `stream` object into the moved-from state.
@@ -23,11 +23,11 @@
23
23
 
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
25
 
26
+ # include <cuda/__driver/driver_api.h>
26
27
  # include <cuda/__event/timed_event.h>
27
28
  # include <cuda/__fwd/get_stream.h>
28
29
  # include <cuda/__runtime/ensure_current_context.h>
29
30
  # include <cuda/__utility/no_init.h>
30
- # include <cuda/std/__cuda/api_wrapper.h>
31
31
  # include <cuda/std/__exception/cuda_error.h>
32
32
  # include <cuda/std/cstddef>
33
33
 
@@ -39,7 +39,7 @@ namespace __detail
39
39
  {
40
40
  // 0 is a valid stream in CUDA, so we need some other invalid stream representation
41
41
  // Can't make it constexpr, because cudaStream_t is a pointer type
42
- static const ::cudaStream_t __invalid_stream = reinterpret_cast<cudaStream_t>(~0ULL);
42
+ static const ::cudaStream_t __invalid_stream = reinterpret_cast<::cudaStream_t>(~0ULL);
43
43
  } // namespace __detail
44
44
 
45
45
  //! @brief A type representing a stream ID.
@@ -238,11 +238,17 @@ public:
238
238
  //! @throws cuda_error if device check fails
239
239
  _CCCL_HOST_API device_ref device() const
240
240
  {
241
- CUcontext __stream_ctx = ::cuda::__driver::__streamGetCtx(__stream);
242
- __ensure_current_context __setter(__stream_ctx);
243
- int __id;
244
- _CCCL_TRY_CUDA_API(cudaGetDevice, "Could not get device from a stream", &__id);
245
- return device_ref{__id};
241
+ ::CUdevice __device{};
242
+ # if _CCCL_CTK_AT_LEAST(13, 0)
243
+ __device = ::cuda::__driver::__streamGetDevice(__stream);
244
+ # else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv
245
+ {
246
+ ::CUcontext __stream_ctx = ::cuda::__driver::__streamGetCtx(__stream);
247
+ __ensure_current_context __setter(__stream_ctx);
248
+ __device = ::cuda::__driver::__ctxGetDevice();
249
+ }
250
+ # endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^
251
+ return device_ref{::cuda::__driver::__cudevice_to_ordinal(__device)};
246
252
  }
247
253
 
248
254
  //! @brief Queries the \c stream_ref for itself. This makes \c stream_ref usable in places where we expect an
@@ -262,21 +268,20 @@ inline void event_ref::record(stream_ref __stream) const
262
268
  }
263
269
 
264
270
  inline event::event(stream_ref __stream, event::flags __flags)
265
- : event(__stream, static_cast<unsigned int>(__flags) | cudaEventDisableTiming)
271
+ : event(__stream, static_cast<unsigned>(__flags) | cudaEventDisableTiming)
266
272
  {
267
273
  record(__stream);
268
274
  }
269
275
 
270
- inline event::event(stream_ref __stream, unsigned int __flags)
276
+ inline event::event(stream_ref __stream, unsigned __flags)
271
277
  : event_ref(::cudaEvent_t{})
272
278
  {
273
279
  [[maybe_unused]] __ensure_current_context __ctx_setter(__stream);
274
- _CCCL_TRY_CUDA_API(
275
- ::cudaEventCreateWithFlags, "Failed to create CUDA event", &__event_, static_cast<unsigned int>(__flags));
280
+ __event_ = ::cuda::__driver::__eventCreate(static_cast<unsigned>(__flags));
276
281
  }
277
282
 
278
283
  inline timed_event::timed_event(stream_ref __stream, event::flags __flags)
279
- : event(__stream, static_cast<unsigned int>(__flags))
284
+ : event(__stream, static_cast<unsigned>(__flags))
280
285
  {
281
286
  record(__stream);
282
287
  }
@@ -42,7 +42,7 @@ using __vtable_for _CCCL_NODEBUG_ALIAS = typename __overrides_for_t<_Interface>:
42
42
  //! __basic_vtable
43
43
  //!
44
44
  template <class _Interface, auto... _Mbrs>
45
- struct _CCCL_DECLSPEC_EMPTY_BASES __basic_vtable
45
+ struct _CCCL_DECLSPEC_EMPTY_BASES _CCCL_TYPE_VISIBILITY_DEFAULT __basic_vtable
46
46
  : __rtti_base
47
47
  , __virtual_fn<_Mbrs>...
48
48
  {
@@ -105,7 +105,7 @@ struct _CCCL_DECLSPEC_EMPTY_BASES __basic_vtable
105
105
  //!
106
106
 
107
107
  template <class... _Interfaces>
108
- struct _CCCL_DECLSPEC_EMPTY_BASES __vtable_tuple
108
+ struct _CCCL_DECLSPEC_EMPTY_BASES _CCCL_TYPE_VISIBILITY_DEFAULT __vtable_tuple
109
109
  : __rtti_ex<sizeof...(_Interfaces)>
110
110
  , __vtable_for<_Interfaces>...
111
111
  {
@@ -39,10 +39,10 @@
39
39
  #if _CCCL_HAS_PDL()
40
40
  // Waits for the previous kernel to complete (when it reaches its final membar). Should be put before the first global
41
41
  // memory access in a kernel.
42
- # define _CCCL_PDL_GRID_DEPENDENCY_SYNC() NV_IF_TARGET(NV_PROVIDES_SM_90, cudaGridDependencySynchronize();)
42
+ # define _CCCL_PDL_GRID_DEPENDENCY_SYNC() NV_IF_TARGET(NV_PROVIDES_SM_90, ::cudaGridDependencySynchronize();)
43
43
  // Allows the subsequent kernel in the same stream to launch. Can be put anywhere in a kernel.
44
44
  // Heuristic(ahendriksen): put it after the last load.
45
- # define _CCCL_PDL_TRIGGER_NEXT_LAUNCH() NV_IF_TARGET(NV_PROVIDES_SM_90, cudaTriggerProgrammaticLaunchCompletion();)
45
+ # define _CCCL_PDL_TRIGGER_NEXT_LAUNCH() NV_IF_TARGET(NV_PROVIDES_SM_90, ::cudaTriggerProgrammaticLaunchCompletion();)
46
46
  #else // _CCCL_HAS_PDL()
47
47
  # define _CCCL_PDL_GRID_DEPENDENCY_SYNC()
48
48
  # define _CCCL_PDL_TRIGGER_NEXT_LAUNCH()
@@ -107,6 +107,8 @@
107
107
  #define _CCCL_PP_FOR_EACH_7(_Mp, _1, _2, _3, _4, _5, _6, _7) _Mp(_1) _Mp(_2) _Mp(_3) _Mp(_4) _Mp(_5) _Mp(_6) _Mp(_7)
108
108
  #define _CCCL_PP_FOR_EACH_8(_Mp, _1, _2, _3, _4, _5, _6, _7, _8) \
109
109
  _Mp(_1) _Mp(_2) _Mp(_3) _Mp(_4) _Mp(_5) _Mp(_6) _Mp(_7) _Mp(_8)
110
+ #define _CCCL_PP_FOR_EACH_9(_Mp, _1, _2, _3, _4, _5, _6, _7, _8, _9) \
111
+ _Mp(_1) _Mp(_2) _Mp(_3) _Mp(_4) _Mp(_5) _Mp(_6) _Mp(_7) _Mp(_8) _Mp(_9)
110
112
 
111
113
  #define _CCCL_PP_PROBE_EMPTY_PROBE__CCCL_PP_PROBE_EMPTY _CCCL_PP_PROBE(~)
112
114
 
@@ -21,16 +21,15 @@
21
21
  # pragma system_header
22
22
  #endif // no system header
23
23
 
24
- #include <cuda/std/__bit/popcount.h>
25
24
  #include <cuda/std/__concepts/concept_macros.h>
26
25
  #include <cuda/std/__floating_point/fp.h>
27
26
  #include <cuda/std/__type_traits/is_constant_evaluated.h>
27
+ #include <cuda/std/__type_traits/is_floating_point.h>
28
28
  #include <cuda/std/__type_traits/is_integral.h>
29
29
 
30
- // MSVC and clang cuda need the host side functions included
31
- #if _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG)
30
+ #if _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG) || !_CCCL_CUDA_COMPILATION()
32
31
  # include <math.h>
33
- #endif // _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG)
32
+ #endif // _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG) || !_CCCL_CUDA_COMPILATION()
34
33
 
35
34
  #include <cuda/std/__cccl/prologue.h>
36
35
 
@@ -158,10 +157,16 @@ template <class _Tp>
158
157
  #if _CCCL_HAS_FLOAT128()
159
158
  [[nodiscard]] _CCCL_API constexpr bool isnan(__float128 __x) noexcept
160
159
  {
160
+ // __builtin_isnan is not efficient for __float128, prefer __nv_fp128_isnan at run-time
161
+ if (!::cuda::std::__cccl_default_is_constant_evaluated())
162
+ {
163
+ NV_IF_TARGET(NV_PROVIDES_SM_100, (return ::__nv_fp128_isnan(__x);)) // preserve NaN behavior even with optimization
164
+ // flags
165
+ }
161
166
  # if defined(_CCCL_BUILTIN_ISNAN)
162
167
  return _CCCL_BUILTIN_ISNAN(__x);
163
168
  # else // ^^^ _CCCL_BUILTIN_ISNAN ^^^ / vvv !_CCCL_BUILTIN_ISNAN vvv
164
- return ::cuda::std::__isnan_impl(__x);
169
+ return __x != __x;
165
170
  # endif // ^^^ !_CCCL_BUILTIN_ISNAN ^^^
166
171
  }
167
172
  #endif // _CCCL_HAS_FLOAT128()
@@ -24,11 +24,11 @@
24
24
  #include <cuda/__type_traits/is_floating_point.h>
25
25
  #include <cuda/std/__cmath/isnan.h>
26
26
  #include <cuda/std/__concepts/concept_macros.h>
27
- #include <cuda/std/__floating_point/fp.h>
27
+ #include <cuda/std/__type_traits/conditional.h>
28
28
  #include <cuda/std/__type_traits/is_extended_arithmetic.h>
29
29
  #include <cuda/std/__type_traits/is_integral.h>
30
+ #include <cuda/std/__type_traits/is_same.h>
30
31
  #include <cuda/std/__type_traits/promote.h>
31
- #include <cuda/std/limits>
32
32
 
33
33
  #include <nv/target>
34
34
 
@@ -36,6 +36,10 @@
36
36
 
37
37
  _CCCL_BEGIN_NAMESPACE_CUDA_STD
38
38
 
39
+ /***********************************************************************************************************************
40
+ * fmax
41
+ **********************************************************************************************************************/
42
+
39
43
  // We do explicitly also enable GCC here, because that makes the condition below simpler
40
44
  #if _CCCL_CHECK_BUILTIN(builtin_fmax) || _CCCL_COMPILER(GCC)
41
45
  _CCCL_TEMPLATE(class _Tp)
@@ -63,13 +67,12 @@ _CCCL_REQUIRES(is_floating_point_v<_Tp>)
63
67
  # define _CCCL_USE_BUILTIN_FMAX() 0
64
68
  #endif // _CCCL_BUILTIN_FABSF
65
69
 
66
- // fmax
67
70
  _CCCL_TEMPLATE(class _Tp)
68
71
  _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
69
72
  [[nodiscard]] _CCCL_API constexpr conditional_t<is_integral_v<_Tp>, double, _Tp> fmax(_Tp __x, _Tp __y) noexcept
70
73
  {
71
74
  #if _CCCL_HAS_NVFP16()
72
- if constexpr (is_same_v<_Tp, __half>)
75
+ if constexpr (is_same_v<_Tp, ::__half>)
73
76
  {
74
77
  # if _CCCL_CTK_AT_LEAST(12, 2)
75
78
  return ::__hmax(__x, __y);
@@ -82,7 +85,7 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
82
85
  else
83
86
  #endif // _CCCL_HAS_NVFP16()
84
87
  #if _CCCL_HAS_NVBF16()
85
- if constexpr (is_same_v<_Tp, __nv_bfloat16>)
88
+ if constexpr (is_same_v<_Tp, ::__nv_bfloat16>)
86
89
  {
87
90
  # if _CCCL_CTK_AT_LEAST(12, 2)
88
91
  return ::__hmax(__x, __y);
@@ -100,17 +103,27 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
100
103
  }
101
104
  else
102
105
  {
103
- #if _CCCL_USE_BUILTIN_FMAX()
104
106
  if (!::cuda::std::__cccl_default_is_constant_evaluated())
105
107
  {
108
+ #if _CCCL_HAS_FLOAT128()
109
+ if constexpr (is_same_v<_Tp, __float128>)
110
+ {
111
+ NV_IF_TARGET(NV_PROVIDES_SM_100, (return ::__nv_fp128_fmax(__x, __y);))
112
+ }
113
+ else
114
+ #endif // _CCCL_HAS_FLOAT128()
115
+ #if _CCCL_USE_BUILTIN_FMAX()
116
+ if constexpr (is_floating_point_v<_Tp>)
117
+ {
106
118
  // GCC builtins do not treat NaN properly
107
119
  # if _CCCL_COMPILER(GCC)
108
- NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::std::__with_builtin_fmax(__x, __y);))
120
+ NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::std::__with_builtin_fmax(__x, __y);))
109
121
  # else // ^^^ _CCCL_COMPILER(GCC) ^^^ / vvv !_CCCL_COMPILER(GCC)
110
- return ::cuda::std::__with_builtin_fmax(__x, __y);
122
+ return ::cuda::std::__with_builtin_fmax(__x, __y);
111
123
  # endif // !_CCCL_COMPILER(GCC)
112
- }
124
+ }
113
125
  #endif // _CCCL_USE_BUILTIN_FMAX
126
+ }
114
127
  if (::cuda::std::isnan(__x))
115
128
  {
116
129
  return __y;
@@ -119,7 +132,10 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
119
132
  {
120
133
  return __x;
121
134
  }
122
- return __x < __y ? __y : __x;
135
+ else
136
+ {
137
+ return __x < __y ? __y : __x;
138
+ }
123
139
  }
124
140
  }
125
141
 
@@ -144,7 +160,9 @@ _CCCL_REQUIRES(::cuda::is_floating_point_v<_Tp> _CCCL_AND ::cuda::is_floating_po
144
160
  return ::cuda::std::fmax(static_cast<__result_type>(__x), static_cast<__result_type>(__y));
145
161
  }
146
162
 
147
- // fmin
163
+ /***********************************************************************************************************************
164
+ * fmin
165
+ **********************************************************************************************************************/
148
166
 
149
167
  // We do explicitly also enable GCC here, because that makes the condition below simpler
150
168
  #if _CCCL_CHECK_BUILTIN(builtin_fmin) || _CCCL_COMPILER(GCC)
@@ -178,7 +196,7 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
178
196
  [[nodiscard]] _CCCL_API constexpr conditional_t<is_integral_v<_Tp>, double, _Tp> fmin(_Tp __x, _Tp __y) noexcept
179
197
  {
180
198
  #if _CCCL_HAS_NVFP16()
181
- if constexpr (is_same_v<_Tp, __half>)
199
+ if constexpr (is_same_v<_Tp, ::__half>)
182
200
  {
183
201
  # if _CCCL_CTK_AT_LEAST(12, 2)
184
202
  return ::__hmin(__x, __y);
@@ -191,7 +209,7 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
191
209
  else
192
210
  #endif // _CCCL_HAS_NVFP16()
193
211
  #if _CCCL_HAS_NVBF16()
194
- if constexpr (is_same_v<_Tp, __nv_bfloat16>)
212
+ if constexpr (is_same_v<_Tp, ::__nv_bfloat16>)
195
213
  {
196
214
  # if _CCCL_CTK_AT_LEAST(12, 2)
197
215
  return ::__hmin(__x, __y);
@@ -209,17 +227,26 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
209
227
  }
210
228
  else
211
229
  {
212
- #if _CCCL_USE_BUILTIN_FMAX()
213
230
  if (!::cuda::std::__cccl_default_is_constant_evaluated())
214
231
  {
232
+ #if _CCCL_HAS_FLOAT128()
233
+ if constexpr (is_same_v<_Tp, __float128>)
234
+ {
235
+ NV_IF_TARGET(NV_PROVIDES_SM_100, (return ::__nv_fp128_fmin(__x, __y);))
236
+ }
237
+ #endif // _CCCL_HAS_FLOAT128()
238
+ #if _CCCL_USE_BUILTIN_FMAX()
239
+ if constexpr (is_floating_point_v<_Tp>)
240
+ {
215
241
  // GCC builtins do not treat NaN properly
216
242
  # if _CCCL_COMPILER(GCC)
217
- NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::std::__with_builtin_fmin(__x, __y);))
243
+ NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::std::__with_builtin_fmin(__x, __y);))
218
244
  # else // ^^^ _CCCL_COMPILER(GCC) ^^^ / vvv !_CCCL_COMPILER(GCC)
219
- return ::cuda::std::__with_builtin_fmin(__x, __y);
245
+ return ::cuda::std::__with_builtin_fmin(__x, __y);
220
246
  # endif // !_CCCL_COMPILER(GCC)
221
- }
247
+ }
222
248
  #endif // _CCCL_USE_BUILTIN_FMAX
249
+ }
223
250
  if (::cuda::std::isnan(__x))
224
251
  {
225
252
  return __y;
@@ -138,7 +138,7 @@ _CCCL_CONCEPT __nothrow_initializable_from =
138
138
  ? ::cuda::std::is_nothrow_constructible_v<_Tp, _Args...>
139
139
  : __nothrow_list_initializable_from<_Tp, _Args...>);
140
140
 
141
- #if !_CCCL_COMPILER(MSVC)
141
+ #if !_CCCL_COMPILER(MSVC) && !_CCCL_CUDA_COMPILER(NVCC, <, 12, 9)
142
142
 
143
143
  //! Constructible with direct non-list initialization syntax from the result of
144
144
  //! a function call expression (often useful for immovable types).
@@ -23,18 +23,18 @@
23
23
 
24
24
  #include <cuda/std/__exception/cuda_error.h>
25
25
 
26
- #define _CCCL_TRY_CUDA_API(_NAME, _MSG, ...) \
27
- do \
28
- { \
29
- const ::cudaError_t __status = _NAME(__VA_ARGS__); \
30
- switch (__status) \
31
- { \
32
- case ::cudaSuccess: \
33
- break; \
34
- default: \
35
- /* CUDA error state is cleared inside __throw_cuda_error */ \
36
- ::cuda::__throw_cuda_error(__status, _MSG, #_NAME); \
37
- } \
26
+ #define _CCCL_TRY_CUDA_API(_NAME, _MSG, ...) \
27
+ do \
28
+ { \
29
+ const ::cudaError_t __status = _NAME(__VA_ARGS__); \
30
+ switch (__status) \
31
+ { \
32
+ case ::cudaSuccess: \
33
+ break; \
34
+ default: \
35
+ ::cudaGetLastError(); /* clear CUDA error state */ \
36
+ ::cuda::__throw_cuda_error(__status, _MSG, #_NAME); \
37
+ } \
38
38
  } while (0)
39
39
 
40
40
  #define _CCCL_ASSERT_CUDA_API(_NAME, _MSG, ...) \
@@ -109,14 +109,7 @@ private:
109
109
  [[maybe_unused]] const char* __api = nullptr,
110
110
  [[maybe_unused]] ::cuda::std::source_location __loc = ::cuda::std::source_location::current())
111
111
  {
112
- # if _CCCL_CUDA_COMPILATION()
113
- NV_IF_ELSE_TARGET(NV_IS_HOST,
114
- (::cudaGetLastError(); // clear CUDA error state
115
- throw ::cuda::cuda_error(__status, __msg, __api, __loc);), //
116
- (::cuda::std::terminate();))
117
- # else // ^^^ _CCCL_CUDA_COMPILATION() ^^^ / vvv !_CCCL_CUDA_COMPILATION() vvv
118
- throw ::cuda::cuda_error(__status, __msg, __api, __loc);
119
- # endif // !_CCCL_CUDA_COMPILATION()
112
+ NV_IF_TARGET(NV_IS_HOST, (throw ::cuda::cuda_error(__status, __msg, __api, __loc);), (::cuda::std::terminate();))
120
113
  }
121
114
  #else // ^^^ _CCCL_HAS_EXCEPTIONS() ^^^ / vvv !_CCCL_HAS_EXCEPTIONS() vvv
122
115
  class cuda_error