cuda-cccl 0.3.1__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (185)
  1. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  2. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  3. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  4. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  5. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  6. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  7. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
  8. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  9. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  10. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  11. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  12. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  13. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  14. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  15. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
  16. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  17. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
  18. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
  19. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
  20. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  21. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  22. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  23. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  24. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  25. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  26. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  27. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  28. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  29. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  30. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  31. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  32. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  33. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  34. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  35. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  36. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  37. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  38. cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
  39. cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
  40. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  41. cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
  42. cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
  43. cuda/cccl/headers/include/cuda/__event/event.h +26 -26
  44. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  45. cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
  46. cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
  47. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  48. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  49. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  50. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  51. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  52. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  53. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
  54. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
  55. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
  56. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  57. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  58. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  59. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  60. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  61. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  62. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  63. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  64. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  65. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
  67. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  68. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
  69. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  70. cuda/cccl/headers/include/cuda/cmath +1 -0
  71. cuda/cccl/headers/include/cuda/devices +3 -0
  72. cuda/cccl/headers/include/cuda/memory +1 -0
  73. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  75. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  76. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  77. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  78. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  80. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  81. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  82. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  83. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  84. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  85. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  86. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  87. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  88. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  89. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  90. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  91. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  92. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  93. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  94. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  95. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  96. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  97. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  98. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  99. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  100. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  101. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  102. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  103. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  104. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  105. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  106. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  107. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  108. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  109. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  110. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  111. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  112. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  113. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  114. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  115. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  116. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  117. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  118. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  120. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  121. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  122. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  123. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  124. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  125. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  126. cuda/cccl/headers/include/cuda/std/string_view +146 -11
  127. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  128. cuda/cccl/headers/include/cuda/utility +1 -0
  129. cuda/cccl/headers/include/nv/target +7 -2
  130. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  131. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  132. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  133. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  134. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  135. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  136. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  137. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  138. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  139. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  140. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  141. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  142. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  143. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  144. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  145. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  146. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  147. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  148. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  149. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  150. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  151. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  152. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  153. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  154. cuda/compute/__init__.py +2 -0
  155. cuda/compute/_bindings.pyi +43 -1
  156. cuda/compute/_bindings_impl.pyx +156 -7
  157. cuda/compute/algorithms/_scan.py +108 -36
  158. cuda/compute/algorithms/_transform.py +32 -11
  159. cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  160. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  161. cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  162. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  163. cuda/compute/iterators/__init__.py +2 -0
  164. cuda/compute/iterators/_factories.py +28 -0
  165. cuda/compute/iterators/_iterators.py +206 -1
  166. cuda/compute/numba_utils.py +2 -2
  167. cuda/compute/typing.py +2 -0
  168. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  169. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
  170. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  171. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  172. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  173. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  174. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  175. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  176. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  177. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  178. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  179. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  180. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  181. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  182. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  183. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  184. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  185. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -33,7 +33,7 @@ template <class _Iter, class _Sent, class _Tp, class _Proj>
33
33
  {
34
34
  for (; __first != __last; ++__first)
35
35
  {
36
- if (::cuda::std::__invoke(__proj, *__first) == __value)
36
+ if (::cuda::std::invoke(__proj, *__first) == __value)
37
37
  {
38
38
  break;
39
39
  }
@@ -40,13 +40,11 @@ _CCCL_API constexpr bool __includes(
40
40
  for (; __first2 != __last2; ++__first1)
41
41
  {
42
42
  if (__first1 == __last1
43
- || ::cuda::std::__invoke(
44
- __comp, ::cuda::std::__invoke(__proj2, *__first2), ::cuda::std::__invoke(__proj1, *__first1)))
43
+ || ::cuda::std::invoke(__comp, ::cuda::std::invoke(__proj2, *__first2), ::cuda::std::invoke(__proj1, *__first1)))
45
44
  {
46
45
  return false;
47
46
  }
48
- if (!::cuda::std::__invoke(
49
- __comp, ::cuda::std::__invoke(__proj1, *__first1), ::cuda::std::__invoke(__proj2, *__first2)))
47
+ if (!::cuda::std::invoke(__comp, ::cuda::std::invoke(__proj1, *__first1), ::cuda::std::invoke(__proj2, *__first2)))
50
48
  {
51
49
  ++__first2;
52
50
  }
@@ -46,7 +46,7 @@ _CCCL_API constexpr _Iter __lower_bound(_Iter __first, _Sent __last, const _Type
46
46
  auto __l2 = ::cuda::std::__half_positive(__len);
47
47
  _Iter __m = __first;
48
48
  _IterOps<_AlgPolicy>::advance(__m, __l2);
49
- if (::cuda::std::__invoke(__comp, ::cuda::std::__invoke(__proj, *__m), __value))
49
+ if (::cuda::std::invoke(__comp, ::cuda::std::invoke(__proj, *__m), __value))
50
50
  {
51
51
  __first = ++__m;
52
52
  __len -= __l2 + 1;
@@ -47,26 +47,18 @@ struct _ProjectedPred
47
47
  {}
48
48
 
49
49
  template <class _Tp>
50
- typename __invoke_of<
51
- _Pred&,
52
- decltype(::cuda::std::__invoke(::cuda::std::declval<_Proj&>(), ::cuda::std::declval<_Tp>()))>::type constexpr
53
- _CCCL_API inline
54
- operator()(_Tp&& __v) const
50
+ invoke_result_t<_Pred&, invoke_result_t<_Proj&, _Tp>> constexpr _CCCL_API inline operator()(_Tp&& __v) const
55
51
  {
56
- return ::cuda::std::__invoke(__pred, ::cuda::std::__invoke(__proj, ::cuda::std::forward<_Tp>(__v)));
52
+ return ::cuda::std::invoke(__pred, ::cuda::std::invoke(__proj, ::cuda::std::forward<_Tp>(__v)));
57
53
  }
58
54
 
59
55
  template <class _T1, class _T2>
60
- typename __invoke_of<
61
- _Pred&,
62
- decltype(::cuda::std::__invoke(::cuda::std::declval<_Proj&>(), ::cuda::std::declval<_T1>())),
63
- decltype(::cuda::std::__invoke(::cuda::std::declval<_Proj&>(), ::cuda::std::declval<_T2>()))>::type constexpr
64
- _CCCL_API inline
65
- operator()(_T1&& __lhs, _T2&& __rhs) const
56
+ invoke_result_t<_Pred&, invoke_result_t<_Proj&, _T1>, invoke_result_t<_Proj&, _T2>> _CCCL_API inline
57
+ operator()(_T1&& __lhs, _T2&& __rhs) const
66
58
  {
67
- return ::cuda::std::__invoke(__pred,
68
- ::cuda::std::__invoke(__proj, ::cuda::std::forward<_T1>(__lhs)),
69
- ::cuda::std::__invoke(__proj, ::cuda::std::forward<_T2>(__rhs)));
59
+ return ::cuda::std::invoke(__pred,
60
+ ::cuda::std::invoke(__proj, ::cuda::std::forward<_T1>(__lhs)),
61
+ ::cuda::std::invoke(__proj, ::cuda::std::forward<_T2>(__rhs)));
70
62
  }
71
63
  };
72
64
 
@@ -44,7 +44,7 @@ _CCCL_API constexpr _Iter __min_element(_Iter __first, _Sent __last, _Comp __com
44
44
  _Iter __i = __first;
45
45
  while (++__i != __last)
46
46
  {
47
- if (::cuda::std::__invoke(__comp, ::cuda::std::__invoke(__proj, *__i), ::cuda::std::__invoke(__proj, *__first)))
47
+ if (::cuda::std::invoke(__comp, ::cuda::std::invoke(__proj, *__i), ::cuda::std::invoke(__proj, *__first)))
48
48
  {
49
49
  __first = __i;
50
50
  }
@@ -46,8 +46,7 @@ public:
46
46
  template <class _Iter>
47
47
  _CCCL_API constexpr bool operator()(_Iter& __it1, _Iter& __it2)
48
48
  {
49
- return ::cuda::std::__invoke(
50
- __comp_, ::cuda::std::__invoke(__proj_, *__it1), ::cuda::std::__invoke(__proj_, *__it2));
49
+ return ::cuda::std::invoke(__comp_, ::cuda::std::invoke(__proj_, *__it1), ::cuda::std::invoke(__proj_, *__it2));
51
50
  }
52
51
  };
53
52
 
@@ -69,8 +69,8 @@ _CCCL_API constexpr pair<_InputIterator, _RandomAccessIterator> __partial_sort_c
69
69
  typename iterator_traits<_RandomAccessIterator>::difference_type __len = __r - __result_first;
70
70
  for (; __first != __last; ++__first)
71
71
  {
72
- if (::cuda::std::__invoke(
73
- __comp, ::cuda::std::__invoke(__proj1, *__first), ::cuda::std::__invoke(__proj2, *__result_first)))
72
+ if (::cuda::std::invoke(
73
+ __comp, ::cuda::std::invoke(__proj1, *__first), ::cuda::std::invoke(__proj2, *__result_first)))
74
74
  {
75
75
  *__result_first = *__first;
76
76
  ::cuda::std::__sift_down<_AlgPolicy>(__result_first, __projected_comp, __len, __result_first);
@@ -45,7 +45,7 @@ __upper_bound(_Iter __first, _Sent __last, const _Tp& __value, _Compare&& __comp
45
45
  {
46
46
  auto __half_len = ::cuda::std::__half_positive(__len);
47
47
  auto __mid = _IterOps<_AlgPolicy>::next(__first, __half_len);
48
- if (::cuda::std::__invoke(__comp, __value, ::cuda::std::__invoke(__proj, *__mid)))
48
+ if (::cuda::std::invoke(__comp, __value, ::cuda::std::invoke(__proj, *__mid)))
49
49
  {
50
50
  __len = __half_len;
51
51
  }
@@ -0,0 +1,36 @@
1
+ //===----------------------------------------------------------------------===//
2
+ //
3
+ // Part of libcu++, the C++ Standard Library for your entire system,
4
+ // under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+
11
+ #ifndef _CUDA_STD__CCCL_ALGORITHM_WRAPPER_H
12
+ #define _CUDA_STD__CCCL_ALGORITHM_WRAPPER_H
13
+
14
+ #include <cuda/std/detail/__config>
15
+
16
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
+ # pragma GCC system_header
18
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19
+ # pragma clang system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21
+ # pragma system_header
22
+ #endif // no system header
23
+
24
+ // When nvc++ uses CCCL components as part of its implementation of
25
+ // Standard C++ algorithms, a cycle of included files may result when CCCL code
26
+ // tries to use a standard algorithm. The THRUST_INCLUDING_ALGORITHMS_HEADER macro
27
+ // is defined only when CCCL is including an algorithms-related header, giving
28
+ // the compiler a chance to detect and break the cycle of includes.
29
+
30
+ #if !_CCCL_COMPILER(NVRTC)
31
+ # define THRUST_INCLUDING_ALGORITHMS_HEADER
32
+ # include <algorithm>
33
+ # undef THRUST_INCLUDING_ALGORITHMS_HEADER
34
+ #endif // !_CCCL_COMPILER(NVRTC)
35
+
36
+ #endif // _CUDA_STD__CCCL_ALGORITHM_WRAPPER_H
@@ -25,6 +25,7 @@
25
25
 
26
26
  #include <cuda/std/__cccl/attributes.h>
27
27
  #include <cuda/std/__cccl/extended_data_types.h>
28
+ #include <cuda/std/__cccl/host_std_lib.h>
28
29
 
29
30
  //! This file consolidates all compiler builtin detection for CCCL.
30
31
  //!
@@ -607,55 +608,51 @@
607
608
  # define _CCCL_BUILTIN_STRLEN(...) __builtin_strlen(__VA_ARGS__)
608
609
  #endif
609
610
 
610
- // Some compilers provide std::move/std::forward/etc as builtins
611
- #if defined(__cplusplus)
612
- // Bring in the feature test macros (needed for std::forward_like)
613
- # if _CCCL_HAS_INCLUDE(<version>) // <version> should be the smallest include possible
614
- # include <version>
615
- # elif !_CCCL_COMPILER(NVRTC)
616
- # include <ciso646> // otherwise go for the smallest possible header
617
- # endif // !_CCCL_COMPILER(NVRTC)
618
-
619
- // Bring in the bits of the STL we need
620
- # if defined(_GLIBCXX_VERSION)
621
- # include <bits/move.h> // for move, forward, forward_like, and addressof
622
- # elif defined(_LIBCPP_VERSION)
623
- # include <__memory/addressof.h>
624
- # include <__utility/as_const.h>
625
- # include <__utility/forward.h>
626
- # include <__utility/forward_like.h>
627
- # include <__utility/move.h>
628
- # endif
629
-
630
- # if defined(_GLIBCXX_VERSION) || defined(_LIBCPP_VERSION)
631
- // std::move builtin
632
- # if _CCCL_COMPILER(CLANG, >=, 15) || _CCCL_COMPILER(GCC, >=, 15)
633
- # define _CCCL_HAS_BUILTIN_STD_MOVE() 1
634
- # endif
635
-
636
- // std::forward builtin
637
- # if _CCCL_COMPILER(CLANG, >=, 15) || _CCCL_COMPILER(GCC, >=, 15)
638
- # define _CCCL_HAS_BUILTIN_STD_FORWARD() 1
639
- # endif
640
-
641
- // std::addressof builtin
642
- # if _CCCL_COMPILER(CLANG, >=, 15) || _CCCL_COMPILER(GCC, >=, 15)
643
- # define _CCCL_HAS_BUILTIN_STD_ADDRESSOF() 1
644
- # endif
645
-
646
- // std::as_const builtin
647
- # if _CCCL_COMPILER(CLANG, >=, 15)
648
- # define _CCCL_HAS_BUILTIN_STD_AS_CONST() 1
649
- # endif
650
-
651
- // std::forward_like builtin
652
- // Leaving out MSVC for now because it is hard for forward-declare std::forward_like.
653
- # if (_CCCL_COMPILER(CLANG, >=, 17) || _CCCL_COMPILER(GCC, >=, 15)) && defined(__cpp_lib_forward_like) \
654
- && (__cpp_lib_forward_like >= 202217L)
655
- # define _CCCL_HAS_BUILTIN_STD_FORWARD_LIKE() 1
656
- # endif
657
- # endif // defined(_GLIBCXX_VERSION) || defined(_LIBCPP_VERSION) || defined(_MSVC_STL_VERSION)
658
- #endif // defined(__cplusplus)
611
+ // todo: re-enable std builtins
612
+
613
+ // // Some compilers provide std::move/std::forward/etc as builtins
614
+ // #if defined(__cplusplus)
615
+ // // Bring in the bits of the STL we need
616
+ // # if _CCCL_HOST_STD_LIB(LIBSTDCXX)
617
+ // # include <bits/move.h> // for move, forward, forward_like, and addressof
618
+ // # elif _CCCL_HOST_STD_LIB(LIBCXX)
619
+ // # include <__memory/addressof.h>
620
+ // # include <__utility/as_const.h>
621
+ // # include <__utility/forward.h>
622
+ // # if __cpp_lib_forward_like >= 202217L
623
+ // # include <__utility/forward_like.h>
624
+ // # endif // __cpp_lib_forward_like >= 202217L
625
+ // # include <__utility/move.h>
626
+ // # endif
627
+
628
+ // # if _CCCL_HOST_STD_LIB(LIBSTDCXX) || _CCCL_HOST_STD_LIB(LIBCXX)
629
+ // // std::move builtin
630
+ // # if _CCCL_COMPILER(CLANG, >=, 15) || _CCCL_COMPILER(GCC, >=, 15)
631
+ // # define _CCCL_HAS_BUILTIN_STD_MOVE() 1
632
+ // # endif
633
+
634
+ // // std::forward builtin
635
+ // # if _CCCL_COMPILER(CLANG, >=, 15) || _CCCL_COMPILER(GCC, >=, 15)
636
+ // # define _CCCL_HAS_BUILTIN_STD_FORWARD() 1
637
+ // # endif
638
+
639
+ // // std::addressof builtin
640
+ // # if _CCCL_COMPILER(CLANG, >=, 15) || _CCCL_COMPILER(GCC, >=, 15)
641
+ // # define _CCCL_HAS_BUILTIN_STD_ADDRESSOF() 1
642
+ // # endif
643
+
644
+ // // std::as_const builtin
645
+ // # if _CCCL_COMPILER(CLANG, >=, 15)
646
+ // # define _CCCL_HAS_BUILTIN_STD_AS_CONST() 1
647
+ // # endif
648
+
649
+ // // std::forward_like builtin
650
+ // // Leaving out MSVC for now because it is hard to forward-declare std::forward_like.
651
+ // # if (_CCCL_COMPILER(CLANG, >=, 17) || _CCCL_COMPILER(GCC, >=, 15)) && __cpp_lib_forward_like >= 202217L
652
+ // # define _CCCL_HAS_BUILTIN_STD_FORWARD_LIKE() 1
653
+ // # endif
654
+ // # endif // _CCCL_HOST_STD_LIB(LIBSTDCXX) || _CCCL_HOST_STD_LIB(LIBCXX)
655
+ // #endif // defined(__cplusplus)
659
656
 
660
657
  #ifndef _CCCL_HAS_BUILTIN_STD_MOVE
661
658
  # define _CCCL_HAS_BUILTIN_STD_MOVE() 0
@@ -65,4 +65,10 @@
65
65
  # endif // _CCCL_CUDA_COMPILER(NVCC)
66
66
  #endif // !_CCCL_EXEC_CHECK_DISABLE
67
67
 
68
+ #if _CCCL_CUDA_COMPILER(NVHPC)
69
+ # define _CCCL_TARGET_CONSTEXPR
70
+ #else // ^^^ _CCCL_CUDA_COMPILER(NVHPC) ^^^ / vvv !_CCCL_CUDA_COMPILER(NVHPC) vvv
71
+ # define _CCCL_TARGET_CONSTEXPR constexpr
72
+ #endif // ^^^ !_CCCL_CUDA_COMPILER(NVHPC) ^^^
73
+
68
74
  #endif // __CCCL_EXECUTION_SPACE_H
@@ -0,0 +1,52 @@
1
+ //===----------------------------------------------------------------------===//
2
+ //
3
+ // Part of libcu++, the C++ Standard Library for your entire system,
4
+ // under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+
11
+ #ifndef __CCCL_HOST_STD_LIB_H
12
+ #define __CCCL_HOST_STD_LIB_H
13
+
14
+ #include <cuda/std/__cccl/compiler.h>
15
+ #include <cuda/std/__cccl/preprocessor.h>
16
+ #include <cuda/std/__cccl/system_header.h>
17
+
18
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
19
+ # pragma GCC system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
21
+ # pragma clang system_header
22
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
23
+ # pragma system_header
24
+ #endif // no system header
25
+
26
+ #define _CCCL_HOST_STD_LIB_LIBSTDCXX() 0
27
+ #define _CCCL_HOST_STD_LIB_LIBCXX() 0
28
+ #define _CCCL_HOST_STD_LIB_STL() 0
29
+
30
+ // include a minimal header
31
+ #if _CCCL_HAS_INCLUDE(<version>)
32
+ # include <version>
33
+ #elif _CCCL_HAS_INCLUDE(<ciso646>)
34
+ # include <ciso646>
35
+ #endif // ^^^ _CCCL_HAS_INCLUDE(<ciso646>) ^^^
36
+
37
+ #if defined(_MSVC_STL_VERSION)
38
+ # undef _CCCL_HOST_STD_LIB_STL
39
+ # define _CCCL_HOST_STD_LIB_STL() 1
40
+ #elif defined(__GLIBCXX__)
41
+ # undef _CCCL_HOST_STD_LIB_LIBSTDCXX
42
+ # define _CCCL_HOST_STD_LIB_LIBSTDCXX() 1
43
+ #elif defined(_LIBCPP_VERSION)
44
+ # undef _CCCL_HOST_STD_LIB_LIBCXX
45
+ # define _CCCL_HOST_STD_LIB_LIBCXX() 1
46
+ #endif // ^^^ _LIBCPP_VERSION ^^^
47
+
48
+ #define _CCCL_HOST_STD_LIB(_X) _CCCL_HOST_STD_LIB_##_X()
49
+ #define _CCCL_HAS_HOST_STD_LIB() \
50
+ (_CCCL_HOST_STD_LIB_LIBSTDCXX() || _CCCL_HOST_STD_LIB_LIBCXX() || _CCCL_HOST_STD_LIB_STL())
51
+
52
+ #endif // __CCCL_HOST_STD_LIB_H
@@ -0,0 +1,36 @@
1
+ //===----------------------------------------------------------------------===//
2
+ //
3
+ // Part of libcu++, the C++ Standard Library for your entire system,
4
+ // under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+
11
+ #ifndef _CUDA_STD__CCCL_MEMORY_WRAPPER_H
12
+ #define _CUDA_STD__CCCL_MEMORY_WRAPPER_H
13
+
14
+ #include <cuda/std/detail/__config>
15
+
16
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
+ # pragma GCC system_header
18
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19
+ # pragma clang system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21
+ # pragma system_header
22
+ #endif // no system header
23
+
24
+ // When nvc++ uses CCCL components as part of its implementation of
25
+ // Standard C++ algorithms, a cycle of included files may result when CCCL code
26
+ // tries to use a standard algorithm. The THRUST_INCLUDING_ALGORITHMS_HEADER macro
27
+ // is defined only when CCCL is including an algorithms-related header, giving
28
+ // the compiler a chance to detect and break the cycle of includes.
29
+
30
+ #if !_CCCL_COMPILER(NVRTC)
31
+ # define THRUST_INCLUDING_ALGORITHMS_HEADER
32
+ # include <memory>
33
+ # undef THRUST_INCLUDING_ALGORITHMS_HEADER
34
+ #endif // !_CCCL_COMPILER(NVRTC)
35
+
36
+ #endif // _CUDA_STD__CCCL_MEMORY_WRAPPER_H
@@ -0,0 +1,36 @@
1
+ //===----------------------------------------------------------------------===//
2
+ //
3
+ // Part of libcu++, the C++ Standard Library for your entire system,
4
+ // under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+
11
+ #ifndef _CUDA_STD__CCCL_NUMERIC_WRAPPER_H
12
+ #define _CUDA_STD__CCCL_NUMERIC_WRAPPER_H
13
+
14
+ #include <cuda/std/detail/__config>
15
+
16
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
+ # pragma GCC system_header
18
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19
+ # pragma clang system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21
+ # pragma system_header
22
+ #endif // no system header
23
+
24
+ // When a compiler uses CCCL components as part of its implementation of
25
+ // Standard C++ algorithms, a cycle of included files may result when CCCL code
26
+ // tries to use a standard algorithm. The THRUST_INCLUDING_ALGORITHMS_HEADER macro
27
+ // is defined only when CCCL is including an algorithms-related header, giving
28
+ // the compiler a chance to detect and break the cycle of includes.
29
+
30
+ #if !_CCCL_COMPILER(NVRTC)
31
+ # define THRUST_INCLUDING_ALGORITHMS_HEADER
32
+ # include <numeric>
33
+ # undef THRUST_INCLUDING_ALGORITHMS_HEADER
34
+ #endif // !_CCCL_COMPILER(NVRTC)
35
+
36
+ #endif // _CUDA_STD__CCCL_NUMERIC_WRAPPER_H
@@ -27,9 +27,10 @@
27
27
  #include <cuda/std/__type_traits/is_floating_point.h>
28
28
  #include <cuda/std/__type_traits/is_integral.h>
29
29
 
30
- #if _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG) || !_CCCL_CUDA_COMPILATION()
30
+ // MSVC and clang cuda need the host side functions included
31
+ #if _CCCL_HOST_COMPILATION() || _CCCL_CUDA_COMPILER(CLANG)
31
32
  # include <math.h>
32
- #endif // _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG) || !_CCCL_CUDA_COMPILATION()
33
+ #endif // _CCCL_HOST_COMPILATION() || _CCCL_CUDA_COMPILER(CLANG)
33
34
 
34
35
  #include <cuda/std/__cccl/prologue.h>
35
36
 
@@ -23,6 +23,7 @@
23
23
 
24
24
  #include <cuda/std/__complex/vector_support.h>
25
25
  #include <cuda/std/__concepts/concept_macros.h>
26
+ #include <cuda/std/__fwd/complex.h>
26
27
  #include <cuda/std/__fwd/get.h>
27
28
  #include <cuda/std/__tuple_dir/tuple_element.h>
28
29
  #include <cuda/std/__tuple_dir/tuple_size.h>
@@ -36,9 +37,9 @@
36
37
  #include <cuda/std/limits>
37
38
 
38
39
  // Compatibility helpers for thrust to convert between `std::complex` and `cuda::std::complex`
40
+ // todo: find a way to get rid of this include
39
41
  #if !_CCCL_COMPILER(NVRTC)
40
- # include <complex>
41
- # include <sstream> // for std::basic_ostringstream
42
+ # include <complex> // for std::complex stream operators
42
43
 
43
44
  # define _LIBCUDACXX_ACCESS_STD_COMPLEX_REAL(__c) reinterpret_cast<const _Up(&)[2]>(__c)[0]
44
45
  # define _LIBCUDACXX_ACCESS_STD_COMPLEX_IMAG(__c) reinterpret_cast<const _Up(&)[2]>(__c)[1]
@@ -21,28 +21,28 @@
21
21
  # pragma system_header
22
22
  #endif // no system header
23
23
 
24
- #include <cuda/std/__complex/complex.h>
24
+ // gcc < 8 warns about its extended literals being shadowed by the implementation, so let's just disable the complex
25
+ // literals
26
+ #if !_CCCL_COMPILER(GCC, <, 8)
25
27
 
26
- #include <cuda/std/__cccl/prologue.h>
28
+ # include <cuda/std/__complex/complex.h>
27
29
 
28
- _CCCL_BEGIN_NAMESPACE_CUDA_STD
30
+ # include <cuda/std/__cccl/prologue.h>
29
31
 
30
- #ifdef _LIBCUDACXX_HAS_STL_LITERALS
31
- // Literal suffix for complex number literals [complex.literals]
32
+ _CCCL_BEGIN_NAMESPACE_CUDA_STD
32
33
 
33
34
  _CCCL_DIAG_PUSH
34
35
  _CCCL_DIAG_SUPPRESS_GCC("-Wliteral-suffix")
35
36
  _CCCL_DIAG_SUPPRESS_CLANG("-Wuser-defined-literals")
36
- _CCCL_DIAG_SUPPRESS_MSVC(4455)
37
+ _CCCL_DIAG_SUPPRESS_NVHPC(lit_suffix_no_underscore)
38
+ _CCCL_DIAG_SUPPRESS_MSVC(4455) // literal suffix identifiers that do not start with an underscore are reserved
39
+ _CCCL_BEGIN_NV_DIAG_SUPPRESS(2506, 20208) // a user-provided literal suffix must begin with "_",
40
+ // long double treated as double
37
41
 
38
42
  inline namespace literals
39
43
  {
40
44
  inline namespace complex_literals
41
45
  {
42
- # if !_CCCL_CUDA_COMPILER(NVCC) && !_CCCL_COMPILER(NVRTC)
43
- // NOTE: if you get a warning from GCC <7 here that "literal operator suffixes not preceded by ‘_’ are reserved for
44
- // future standardization" then we are sorry. The warning was implemented before GCC 7, but can only be disabled since
45
- // GCC 7. See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69523
46
46
  _CCCL_API constexpr complex<long double> operator""il(long double __im)
47
47
  {
48
48
  return {0.0l, __im};
@@ -71,36 +71,16 @@ _CCCL_API constexpr complex<float> operator""if(unsigned long long __im)
71
71
  {
72
72
  return {0.0f, static_cast<float>(__im)};
73
73
  }
74
- # else // ^^^ !_CCCL_CUDA_COMPILER(NVCC) && !_CCCL_COMPILER(NVRTC) ^^^ / vvv other compilers vvv
75
- _CCCL_API constexpr complex<double> operator""i(double __im)
76
- {
77
- return {0.0, static_cast<double>(__im)};
78
- }
79
-
80
- _CCCL_API constexpr complex<double> operator""i(unsigned long long __im)
81
- {
82
- return {0.0, static_cast<double>(__im)};
83
- }
84
-
85
- _CCCL_API constexpr complex<float> operator""if(double __im)
86
- {
87
- return {0.0f, static_cast<float>(__im)};
88
- }
89
-
90
- _CCCL_API constexpr complex<float> operator""if(unsigned long long __im)
91
- {
92
- return {0.0f, static_cast<float>(__im)};
93
- }
94
- # endif // other compilers
95
74
  } // namespace complex_literals
96
75
  } // namespace literals
97
76
 
77
+ _CCCL_END_NV_DIAG_SUPPRESS()
98
78
  _CCCL_DIAG_POP
99
79
 
100
- #endif // _LIBCUDACXX_HAS_STL_LITERALS
101
-
102
80
  _CCCL_END_NAMESPACE_CUDA_STD
103
81
 
104
- #include <cuda/std/__cccl/epilogue.h>
82
+ # include <cuda/std/__cccl/epilogue.h>
83
+
84
+ #endif // !_CCCL_COMPILER(GCC, <, 8)
105
85
 
106
86
  #endif // _CUDA_STD___COMPLEX_LITERALS_H
@@ -31,8 +31,9 @@
31
31
  # include <cuda/std/__type_traits/enable_if.h>
32
32
  # include <cuda/std/__type_traits/is_constructible.h>
33
33
 
34
+ // todo: find a way to get rid of this include
34
35
  # if !_CCCL_COMPILER(NVRTC)
35
- # include <sstream> // for std::basic_ostringstream
36
+ # include <complex> // for std::complex stream operators
36
37
  # endif // !_CCCL_COMPILER(NVRTC)
37
38
 
38
39
  # include <cuda/std/__cccl/prologue.h>
@@ -31,8 +31,9 @@
31
31
  # include <cuda/std/__type_traits/enable_if.h>
32
32
  # include <cuda/std/__type_traits/is_constructible.h>
33
33
 
34
+ // todo: find a way to get rid of this include
34
35
  # if !_CCCL_COMPILER(NVRTC)
35
- # include <sstream> // for std::basic_ostringstream
36
+ # include <complex> // for std::complex stream operators
36
37
  # endif // !_CCCL_COMPILER(NVRTC)
37
38
 
38
39
  # include <cuda/std/__cccl/prologue.h>
@@ -294,7 +295,7 @@ struct __get_complex_impl<__half>
294
295
  }
295
296
  };
296
297
 
297
- # if !defined(_LIBCUDACXX_HAS_NO_LOCALIZATION) && !_CCCL_COMPILER(NVRTC)
298
+ # if !_CCCL_COMPILER(NVRTC)
298
299
  template <class _CharT, class _Traits>
299
300
  ::std::basic_istream<_CharT, _Traits>& operator>>(::std::basic_istream<_CharT, _Traits>& __is, complex<__half>& __x)
300
301
  {
@@ -310,7 +311,7 @@ operator<<(::std::basic_ostream<_CharT, _Traits>& __os, const complex<__half>& _
310
311
  {
311
312
  return __os << complex<float>{__x};
312
313
  }
313
- # endif // !_LIBCUDACXX_HAS_NO_LOCALIZATION && !_CCCL_COMPILER(NVRTC)
314
+ # endif // !_CCCL_COMPILER(NVRTC)
314
315
 
315
316
  _CCCL_END_NAMESPACE_CUDA_STD
316
317
 
@@ -35,8 +35,8 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD
35
35
 
36
36
  template <class _Fn, class... _Args>
37
37
  concept invocable = requires(_Fn&& __fn, _Args&&... __args) {
38
- ::cuda::std::__invoke(::cuda::std::forward<_Fn>(__fn), ::cuda::std::forward<_Args>(__args)...); // not required to be
39
- // equality preserving
38
+ ::cuda::std::invoke(::cuda::std::forward<_Fn>(__fn), ::cuda::std::forward<_Args>(__args)...); // not required to be
39
+ // equality preserving
40
40
  };
41
41
 
42
42
  // [concept.regular.invocable]
@@ -21,8 +21,9 @@
21
21
  # pragma system_header
22
22
  #endif // no system header
23
23
 
24
+ #include <cuda/__cmath/mul_hi.h>
24
25
  #include <cuda/std/__cstddef/types.h>
25
- #include <cuda/std/cstring>
26
+ #include <cuda/std/__cstring/memset.h>
26
27
 
27
28
  #if !_CCCL_COMPILER(NVRTC)
28
29
  # include <cstdlib>
@@ -44,7 +45,7 @@ using ::malloc;
44
45
 
45
46
  const size_t __nbytes = __n * __size;
46
47
 
47
- if (::__umul64hi(__n, __size) == 0)
48
+ if (::cuda::mul_hi(__n, __size) == 0)
48
49
  {
49
50
  __ptr = ::cuda::std::malloc(__nbytes);
50
51
  if (__ptr != nullptr)