cuda-cccl 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.2__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (185)
  1. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  2. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  3. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  4. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  5. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  6. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  7. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
  8. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  9. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  10. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  11. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  12. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  13. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  14. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  15. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
  16. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  17. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
  18. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
  19. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
  20. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  21. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  22. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  23. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  24. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  25. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  26. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  27. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  28. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  29. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  30. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  31. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  32. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  33. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  34. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  35. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  36. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  37. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  38. cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
  39. cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
  40. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  41. cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
  42. cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
  43. cuda/cccl/headers/include/cuda/__event/event.h +26 -26
  44. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  45. cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
  46. cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
  47. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  48. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  49. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  50. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  51. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  52. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  53. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
  54. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
  55. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
  56. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  57. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  58. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  59. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  60. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  61. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  62. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  63. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  64. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  65. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
  67. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  68. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
  69. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  70. cuda/cccl/headers/include/cuda/cmath +1 -0
  71. cuda/cccl/headers/include/cuda/devices +3 -0
  72. cuda/cccl/headers/include/cuda/memory +1 -0
  73. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  75. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  76. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  77. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  78. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  80. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  81. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  82. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  83. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  84. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  85. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  86. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  87. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  88. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  89. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  90. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  91. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  92. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  93. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  94. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  95. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  96. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  97. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  98. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  99. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  100. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  101. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  102. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  103. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  104. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  105. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  106. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  107. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  108. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  109. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  110. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  111. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  112. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  113. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  114. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  115. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  116. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  117. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  118. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  120. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  121. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  122. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  123. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  124. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  125. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  126. cuda/cccl/headers/include/cuda/std/string_view +146 -11
  127. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  128. cuda/cccl/headers/include/cuda/utility +1 -0
  129. cuda/cccl/headers/include/nv/target +7 -2
  130. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  131. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  132. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  133. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  134. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  135. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  136. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  137. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  138. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  139. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  140. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  141. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  142. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  143. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  144. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  145. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  146. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  147. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  148. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  149. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  150. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  151. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  152. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  153. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  154. cuda/compute/__init__.py +2 -0
  155. cuda/compute/_bindings.pyi +43 -1
  156. cuda/compute/_bindings_impl.pyx +156 -7
  157. cuda/compute/algorithms/_scan.py +108 -36
  158. cuda/compute/algorithms/_transform.py +32 -11
  159. cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  160. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  161. cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  162. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  163. cuda/compute/iterators/__init__.py +2 -0
  164. cuda/compute/iterators/_factories.py +28 -0
  165. cuda/compute/iterators/_iterators.py +206 -1
  166. cuda/compute/numba_utils.py +2 -2
  167. cuda/compute/typing.py +2 -0
  168. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  169. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
  170. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  171. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  172. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  173. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  174. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  175. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  176. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  177. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  178. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  179. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  180. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  181. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  182. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  183. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  184. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  185. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -114,6 +114,7 @@ CUB_NAMESPACE_BEGIN
114
114
  //! // Compute warp-wide prefix sums
115
115
  //! int warp_id = threadIdx.x / 32;
116
116
  //! WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
117
+ //! }
117
118
  //!
118
119
  //! Suppose the set of input ``thread_data`` across the block of threads is
119
120
  //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps of
@@ -143,6 +144,8 @@ CUB_NAMESPACE_BEGIN
143
144
  //!
144
145
  //! // Compute warp-wide prefix sums
145
146
  //! WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
147
+ //! }
148
+ //! }
146
149
  //!
147
150
  //! Suppose the set of input ``thread_data`` across the warp of threads is
148
151
  //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` will be
@@ -248,6 +251,7 @@ public:
248
251
  //! // Compute inclusive warp-wide prefix sums
249
252
  //! int warp_id = threadIdx.x / 32;
250
253
  //! WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
254
+ //! }
251
255
  //!
252
256
  //! Suppose the set of input ``thread_data`` across the block of threads is
253
257
  //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
@@ -294,9 +298,8 @@ public:
294
298
  //! // Compute inclusive warp-wide prefix sums
295
299
  //! int warp_aggregate;
296
300
  //! int warp_id = threadIdx.x / 32;
297
- //! WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data,
298
- //! thread_data,
299
- //! warp_aggregate);
301
+ //! WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
302
+ //! }
300
303
  //!
301
304
  //! Suppose the set of input ``thread_data`` across the block of threads is
302
305
  //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
@@ -352,6 +355,7 @@ public:
352
355
  //! // Compute exclusive warp-wide prefix sums
353
356
  //! int warp_id = threadIdx.x / 32;
354
357
  //! WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
358
+ //! }
355
359
  //!
356
360
  //! Suppose the set of input ``thread_data`` across the block of threads is
357
361
  //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
@@ -201,6 +201,7 @@ enum WarpStoreAlgorithm
201
201
  //!
202
202
  //! // Store items to linear memory
203
203
  //! WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
204
+ //! }
204
205
  //!
205
206
  //! Suppose the set of ``thread_data`` across the warp threads is
206
207
  //! ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``.
@@ -26,6 +26,7 @@
26
26
  #if _CCCL_CUDA_COMPILATION()
27
27
  # include <cuda/__ptx/instructions/get_sreg.h>
28
28
  # include <cuda/__ptx/instructions/mbarrier_arrive.h>
29
+ # include <cuda/__ptx/instructions/mbarrier_wait.h>
29
30
  # include <cuda/__ptx/ptx_dot_variants.h>
30
31
  # include <cuda/__ptx/ptx_helper_functions.h>
31
32
  #endif // _CCCL_CUDA_COMPILATION()
@@ -381,12 +382,30 @@ private:
381
382
  public:
382
383
  _CCCL_API inline void wait(arrival_token&& __phase) const
383
384
  {
385
+ // no need to back off on a barrier in SMEM on SM90+, SYNCS unit is taking care of this
386
+ NV_IF_TARGET(NV_PROVIDES_SM_90,
387
+ (if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared)) {
388
+ while (!::cuda::ptx::mbarrier_try_wait(
389
+ reinterpret_cast<uint64_t*>(const_cast<__barrier_base*>(&__barrier)), __phase))
390
+ ;
391
+ return;
392
+ }))
393
+ // fallback implementation
384
394
  ::cuda::std::__cccl_thread_poll_with_backoff(
385
395
  ::cuda::std::__barrier_poll_tester_phase<barrier>(this, ::cuda::std::move(__phase)));
386
396
  }
387
397
 
388
398
  _CCCL_API inline void wait_parity(bool __phase_parity) const
389
399
  {
400
+ // no need to back off on a barrier in SMEM on SM90+, SYNCS unit is taking care of this
401
+ NV_IF_TARGET(NV_PROVIDES_SM_90,
402
+ (if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared)) {
403
+ while (!::cuda::ptx::mbarrier_try_wait_parity(
404
+ reinterpret_cast<uint64_t*>(const_cast<__barrier_base*>(&__barrier)), __phase_parity))
405
+ ;
406
+ return;
407
+ }))
408
+ // fallback implementation
390
409
  ::cuda::std::__cccl_thread_poll_with_backoff(
391
410
  ::cuda::std::__barrier_poll_tester_parity<barrier>(this, __phase_parity));
392
411
  }
@@ -23,6 +23,7 @@
23
23
  #include <cuda/std/__cccl/exceptions.h> // IWYU pragma: export
24
24
  #include <cuda/std/__cccl/execution_space.h> // IWYU pragma: export
25
25
  #include <cuda/std/__cccl/extended_data_types.h> // IWYU pragma: export
26
+ #include <cuda/std/__cccl/host_std_lib.h> // IWYU pragma: export
26
27
  #include <cuda/std/__cccl/os.h> // IWYU pragma: export
27
28
  #include <cuda/std/__cccl/preprocessor.h> // IWYU pragma: export
28
29
  #include <cuda/std/__cccl/ptx_isa.h> // IWYU pragma: export
@@ -23,6 +23,7 @@
23
23
 
24
24
  #include <cuda/__cmath/ceil_div.h>
25
25
  #include <cuda/__cmath/ilog.h>
26
+ #include <cuda/__cmath/mul_hi.h>
26
27
  #include <cuda/__cmath/pow2.h>
27
28
  #include <cuda/std/__type_traits/common_type.h>
28
29
  #include <cuda/std/__type_traits/is_integer.h>
@@ -30,7 +31,6 @@
30
31
  #include <cuda/std/__type_traits/make_nbit_int.h>
31
32
  #include <cuda/std/__type_traits/make_unsigned.h>
32
33
  #include <cuda/std/__type_traits/num_bits.h>
33
- #include <cuda/std/__type_traits/promote.h>
34
34
  #include <cuda/std/__utility/pair.h>
35
35
  #include <cuda/std/cstdint>
36
36
  #include <cuda/std/limits>
@@ -39,78 +39,6 @@
39
39
 
40
40
  _CCCL_BEGIN_NAMESPACE_CUDA
41
41
 
42
- /***********************************************************************************************************************
43
- * Extract higher bits after multiplication
44
- **********************************************************************************************************************/
45
-
46
- template <typename _Tp, typename _Lhs>
47
- [[nodiscard]] _CCCL_API constexpr ::cuda::std::common_type_t<_Tp, _Lhs>
48
- __multiply_extract_higher_bits_fallback(_Tp __x, _Lhs __y)
49
- {
50
- using __ret_t = ::cuda::std::common_type_t<_Tp, _Lhs>;
51
- constexpr int __shift = ::cuda::std::__num_bits_v<__ret_t> / 2;
52
- using __half_bits_t = ::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<__ret_t>>;
53
- auto __x_high = static_cast<__half_bits_t>(__x >> __shift);
54
- auto __x_low = static_cast<__half_bits_t>(__x);
55
- auto __y_high = static_cast<__half_bits_t>(__y >> __shift);
56
- auto __y_low = static_cast<__half_bits_t>(__y);
57
- auto __p0 = __x_low * __y_low;
58
- auto __p1 = __x_low * __y_high;
59
- auto __p2 = __x_high * __y_low;
60
- auto __p3 = __x_high * __y_high;
61
- auto __mid = __p1 + __p2;
62
- __half_bits_t __carry = (__mid < __p1);
63
- auto __po_half = __p0 >> __shift;
64
- __mid = __mid + __po_half;
65
- __carry += (__mid < __po_half);
66
- return __p3 + (__mid >> __shift) + (__carry << __shift);
67
- }
68
-
69
- template <typename _Tp, typename _Lhs>
70
- [[nodiscard]] _CCCL_API constexpr ::cuda::std::common_type_t<_Tp, _Lhs> __multiply_extract_higher_bits(_Tp __x, _Lhs __y)
71
- {
72
- using ::cuda::std::__cccl_is_integer_v;
73
- using ::cuda::std::__num_bits_v;
74
- using ::cuda::std::is_signed_v;
75
- static_assert(__cccl_is_integer_v<_Tp>, "__multiply_extract_higher_bits: T is required to be an integer type");
76
- static_assert(__cccl_is_integer_v<_Lhs>, "__multiply_extract_higher_bits: T is required to be an integer type");
77
- if constexpr (is_signed_v<_Tp>)
78
- {
79
- _CCCL_ASSERT(__x >= 0, "__x must be non-negative");
80
- _CCCL_ASSUME(__x >= 0);
81
- }
82
- if constexpr (is_signed_v<_Lhs>)
83
- {
84
- _CCCL_ASSERT(__y >= 0, "__y must be non-negative");
85
- _CCCL_ASSUME(__y >= 0);
86
- }
87
- using __ret_t = ::cuda::std::common_type_t<_Tp, _Lhs>;
88
- if (!::cuda::std::__cccl_default_is_constant_evaluated())
89
- {
90
- if constexpr (sizeof(_Tp) == sizeof(uint32_t) && sizeof(_Lhs) == sizeof(uint32_t))
91
- {
92
- NV_IF_TARGET(NV_IS_DEVICE, (return ::__umulhi(static_cast<uint32_t>(__x), static_cast<uint32_t>(__y));));
93
- }
94
- #if !_CCCL_HAS_INT128()
95
- else if constexpr (sizeof(_Tp) == sizeof(uint64_t) && sizeof(_Lhs) == sizeof(uint64_t))
96
- {
97
- NV_DISPATCH_TARGET(NV_IS_DEVICE, (return ::__umul64hi(static_cast<uint64_t>(__x), static_cast<uint64_t>(__y));));
98
- }
99
- #endif // !_CCCL_HAS_INT128()
100
- }
101
- if constexpr (sizeof(__ret_t) < sizeof(uint64_t) || (sizeof(__ret_t) == sizeof(uint64_t) && _CCCL_HAS_INT128()))
102
- {
103
- constexpr auto __mul_bits = ::cuda::next_power_of_two(__num_bits_v<_Tp> + __num_bits_v<_Lhs>);
104
- using __larger_t = ::cuda::std::__make_nbit_uint_t<__mul_bits>;
105
- auto __ret = (static_cast<__larger_t>(__x) * __y) >> (__mul_bits / 2);
106
- return static_cast<__ret_t>(__ret);
107
- }
108
- else
109
- {
110
- return ::cuda::__multiply_extract_higher_bits_fallback(__x, __y);
111
- }
112
- }
113
-
114
42
  /***********************************************************************************************************************
115
43
  * Fast Modulo/Division based on Precomputation
116
44
  **********************************************************************************************************************/
@@ -184,6 +112,7 @@ public:
184
112
  _CCCL_ASSERT(__dividend >= 0, "dividend must be non-negative");
185
113
  }
186
114
  using __common_t = ::cuda::std::common_type_t<_Tp, _Lhs>;
115
+ using __ucommon_t = ::cuda::std::make_unsigned_t<__common_t>;
187
116
  using _Up = ::cuda::std::make_unsigned_t<_Lhs>;
188
117
  const auto __div = __divisor1.__divisor; // cannot use structure binding because of clang-14
189
118
  const auto __mul = __divisor1.__multiplier;
@@ -205,7 +134,7 @@ public:
205
134
  {
206
135
  return static_cast<__common_t>(__dividend);
207
136
  }
208
- auto __higher_bits = ::cuda::__multiply_extract_higher_bits(__udividend, __mul);
137
+ auto __higher_bits = ::cuda::mul_hi(static_cast<__ucommon_t>(__udividend), static_cast<__ucommon_t>(__mul));
209
138
  auto __quotient = static_cast<__common_t>(__higher_bits >> __shift_);
210
139
  _CCCL_ASSERT(__quotient == static_cast<__common_t>(__dividend / __div), "wrong __quotient");
211
140
  return __quotient;
@@ -0,0 +1,146 @@
1
+ //===----------------------------------------------------------------------===//
2
+ //
3
+ // Part of libcu++, the C++ Standard Library for your entire system,
4
+ // under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+
11
+ #ifndef _CUDA___CMATH_MUL_HI_H
12
+ #define _CUDA___CMATH_MUL_HI_H
13
+
14
+ #include <cuda/std/detail/__config>
15
+
16
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
+ # pragma GCC system_header
18
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19
+ # pragma clang system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21
+ # pragma system_header
22
+ #endif // no system header
23
+
24
+ #include <cuda/std/__type_traits/is_constant_evaluated.h>
25
+ #include <cuda/std/__type_traits/is_integer.h>
26
+ #include <cuda/std/__type_traits/is_signed.h>
27
+ #include <cuda/std/__type_traits/make_nbit_int.h>
28
+ #include <cuda/std/__type_traits/make_unsigned.h>
29
+ #include <cuda/std/__type_traits/num_bits.h>
30
+ #include <cuda/std/cstdint>
31
+
32
+ #if _CCCL_COMPILER(MSVC)
33
+ # include <intrin.h>
34
+ #endif // _CCCL_COMPILER(MSVC)
35
+
36
+ #include <cuda/std/__cccl/prologue.h>
37
+
38
+ _CCCL_BEGIN_NAMESPACE_CUDA
39
+
40
+ /***********************************************************************************************************************
41
+ * Extract higher bits after multiplication
42
+ **********************************************************************************************************************/
43
+
44
+ template <typename _Tp>
45
+ [[nodiscard]] _CCCL_API constexpr _Tp __mul_hi_fallback(_Tp __lhs, _Tp __rhs) noexcept
46
+ {
47
+ static_assert(::cuda::std::is_unsigned_v<_Tp>, "__mul_hi_fallback: T is required to be a unsigned integer type");
48
+ constexpr int __half_bits = ::cuda::std::__num_bits_v<_Tp> / 2;
49
+ using __half_bits_t = ::cuda::std::__make_nbit_uint_t<__half_bits>;
50
+ const auto __lhs_low = static_cast<__half_bits_t>(__lhs); // 32-bit
51
+ const auto __lhs_high = static_cast<__half_bits_t>(__lhs >> __half_bits); // 32-bit
52
+ const auto __rhs_low = static_cast<__half_bits_t>(__rhs); // 32-bit
53
+ const auto __rhs_high = static_cast<__half_bits_t>(__rhs >> __half_bits); // 32-bit
54
+ const auto __po_half = (static_cast<_Tp>(__lhs_low) * __rhs_low) >> __half_bits;
55
+ const auto __p1 = static_cast<_Tp>(__lhs_low) * __rhs_high; // 64-bit
56
+ const auto __p2 = static_cast<_Tp>(__lhs_high) * __rhs_low; // 64-bit
57
+ const auto __p3 = static_cast<_Tp>(__lhs_high) * __rhs_high; // 64-bit
58
+ const auto __p1_half = static_cast<__half_bits_t>(__p1); // 32-bit
59
+ const auto __p2_half = static_cast<__half_bits_t>(__p2); // 32-bit
60
+ const auto __carry = (__po_half + __p1_half + __p2_half) >> __half_bits; // 64-bit
61
+ return __p3 + (__p1 >> __half_bits) + (__p2 >> __half_bits) + __carry;
62
+ }
63
+
64
+ _CCCL_TEMPLATE(typename _Tp)
65
+ _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>)
66
+ [[nodiscard]]
67
+ _CCCL_API constexpr _Tp mul_hi(_Tp __lhs, _Tp __rhs) noexcept
68
+ {
69
+ using ::cuda::std::int64_t;
70
+ using ::cuda::std::is_signed_v;
71
+ if (!::cuda::std::__cccl_default_is_constant_evaluated())
72
+ {
73
+ if constexpr (sizeof(_Tp) == sizeof(int))
74
+ {
75
+ if constexpr (is_signed_v<_Tp>)
76
+ {
77
+ [[maybe_unused]] const auto __lhs1 = static_cast<int>(__lhs);
78
+ [[maybe_unused]] const auto __rhs1 = static_cast<int>(__rhs);
79
+ NV_IF_TARGET(NV_IS_DEVICE, (return ::__mulhi(__lhs1, __rhs1);));
80
+ }
81
+ else // is_unsigned_v<_Tp>
82
+ {
83
+ [[maybe_unused]] const auto __lhs1 = static_cast<unsigned>(__lhs);
84
+ [[maybe_unused]] const auto __rhs1 = static_cast<unsigned>(__rhs);
85
+ NV_IF_TARGET(NV_IS_DEVICE, (return ::__umulhi(__lhs1, __rhs1);));
86
+ }
87
+ }
88
+ else if constexpr (sizeof(_Tp) == sizeof(int64_t))
89
+ {
90
+ if constexpr (is_signed_v<_Tp>)
91
+ {
92
+ [[maybe_unused]] const auto __lhs1 = static_cast<long long>(__lhs);
93
+ [[maybe_unused]] const auto __rhs1 = static_cast<long long>(__rhs);
94
+ NV_IF_TARGET(NV_IS_DEVICE, (return ::__mul64hi(__lhs1, __rhs1);));
95
+ #if _CCCL_COMPILER(MSVC)
96
+ NV_IF_TARGET(NV_IS_HOST, (return ::__mulh(__lhs1, __rhs1);));
97
+ #endif // _CCCL_COMPILER(MSVC)
98
+ }
99
+ else // is_unsigned_v<_Tp>
100
+ {
101
+ [[maybe_unused]] const auto __lhs1 = static_cast<unsigned long long>(__lhs);
102
+ [[maybe_unused]] const auto __rhs1 = static_cast<unsigned long long>(__rhs);
103
+ NV_IF_TARGET(NV_IS_DEVICE, (return ::__umul64hi(__lhs1, __rhs1);));
104
+ #if _CCCL_COMPILER(MSVC)
105
+ NV_IF_TARGET(NV_IS_HOST, (return ::__umulh(__lhs1, __rhs1);));
106
+ #endif // _CCCL_COMPILER(MSVC)
107
+ }
108
+ }
109
+ }
110
+ if constexpr (sizeof(_Tp) < sizeof(int64_t) || (sizeof(_Tp) == sizeof(int64_t) && _CCCL_HAS_INT128()))
111
+ {
112
+ constexpr auto __bits = ::cuda::std::__num_bits_v<_Tp>;
113
+ using __larger_t = ::cuda::std::__make_nbit_int_t<__bits * 2, is_signed_v<_Tp>>;
114
+ const auto __ret = (static_cast<__larger_t>(__lhs) * __rhs) >> __bits;
115
+ return static_cast<_Tp>(__ret);
116
+ }
117
+ else // sizeof(_Tp) > sizeof(int64_t) || (sizeof(_Tp) == sizeof(int64_t) && !_CCCL_HAS_INT128())
118
+ {
119
+ if constexpr (is_signed_v<_Tp>)
120
+ {
121
+ using _Up = ::cuda::std::make_unsigned_t<_Tp>;
122
+ const auto __lhs1 = static_cast<_Up>(__lhs);
123
+ const auto __rhs1 = static_cast<_Up>(__rhs);
124
+ auto __hi = ::cuda::__mul_hi_fallback(__lhs1, __rhs1);
125
+ if (__lhs < 0)
126
+ {
127
+ __hi -= __rhs1;
128
+ }
129
+ if (__rhs < 0)
130
+ {
131
+ __hi -= __lhs1;
132
+ }
133
+ return static_cast<_Tp>(__hi);
134
+ }
135
+ else
136
+ {
137
+ return ::cuda::__mul_hi_fallback(__lhs, __rhs);
138
+ }
139
+ }
140
+ }
141
+
142
+ _CCCL_END_NAMESPACE_CUDA
143
+
144
+ #include <cuda/std/__cccl/epilogue.h>
145
+
146
+ #endif // _CUDA___CMATH_MUL_HI_H
@@ -24,10 +24,6 @@
24
24
  #include <cuda/__fwd/complex.h>
25
25
  #include <cuda/std/__fwd/complex.h>
26
26
 
27
- #if !_CCCL_COMPILER(NVRTC)
28
- # include <complex>
29
- #endif // !_CCCL_COMPILER(NVRTC)
30
-
31
27
  #include <cuda/std/__cccl/prologue.h>
32
28
 
33
29
  _CCCL_BEGIN_NAMESPACE_CUDA
@@ -0,0 +1,176 @@
1
+ //===----------------------------------------------------------------------===//
2
+ //
3
+ // Part of libcu++, the C++ Standard Library for your entire system,
4
+ // under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+
11
+ #ifndef _CUDA___DEVICE_ARCH_ID_H
12
+ #define _CUDA___DEVICE_ARCH_ID_H
13
+
14
+ #include <cuda/std/detail/__config>
15
+
16
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
+ # pragma GCC system_header
18
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19
+ # pragma clang system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21
+ # pragma system_header
22
+ #endif // no system header
23
+
24
+ #include <cuda/__device/compute_capability.h>
25
+ #include <cuda/__fwd/devices.h>
26
+ #include <cuda/std/__type_traits/always_false.h>
27
+ #include <cuda/std/__utility/to_underlying.h>
28
+
29
+ #include <cuda/std/__cccl/prologue.h>
30
+
31
+ _CCCL_BEGIN_NAMESPACE_CUDA
32
+
33
+ //! @brief Architecture identifier
34
+ //! This type identifies an architecture. It has more possible entries than just numeric values of the compute
35
+ //! capability. For example, sm_90 and sm_90a have the same compute capability, but the identifier is different.
36
+ enum class arch_id : int
37
+ {
38
+ sm_60 = 60,
39
+ sm_61 = 61,
40
+ sm_70 = 70,
41
+ sm_75 = 75,
42
+ sm_80 = 80,
43
+ sm_86 = 86,
44
+ sm_87 = 87,
45
+ sm_88 = 88,
46
+ sm_89 = 89,
47
+ sm_90 = 90,
48
+ sm_100 = 100,
49
+ sm_103 = 103,
50
+ sm_110 = 110,
51
+ sm_120 = 120,
52
+ sm_121 = 121,
53
+ sm_90a = 90 * __arch_specific_id_multiplier,
54
+ sm_100a = 100 * __arch_specific_id_multiplier,
55
+ sm_103a = 103 * __arch_specific_id_multiplier,
56
+ sm_110a = 110 * __arch_specific_id_multiplier,
57
+ sm_120a = 120 * __arch_specific_id_multiplier,
58
+ sm_121a = 121 * __arch_specific_id_multiplier,
59
+ };
60
+
61
+ [[nodiscard]] _CCCL_API constexpr bool __has_known_arch(compute_capability __cc) noexcept
62
+ {
63
+ switch (__cc.get())
64
+ {
65
+ case ::cuda::std::to_underlying(arch_id::sm_60):
66
+ case ::cuda::std::to_underlying(arch_id::sm_61):
67
+ case ::cuda::std::to_underlying(arch_id::sm_70):
68
+ case ::cuda::std::to_underlying(arch_id::sm_75):
69
+ case ::cuda::std::to_underlying(arch_id::sm_80):
70
+ case ::cuda::std::to_underlying(arch_id::sm_86):
71
+ case ::cuda::std::to_underlying(arch_id::sm_87):
72
+ case ::cuda::std::to_underlying(arch_id::sm_88):
73
+ case ::cuda::std::to_underlying(arch_id::sm_89):
74
+ case ::cuda::std::to_underlying(arch_id::sm_90):
75
+ case ::cuda::std::to_underlying(arch_id::sm_100):
76
+ case ::cuda::std::to_underlying(arch_id::sm_103):
77
+ case ::cuda::std::to_underlying(arch_id::sm_110):
78
+ case ::cuda::std::to_underlying(arch_id::sm_120):
79
+ case ::cuda::std::to_underlying(arch_id::sm_121):
80
+ return true;
81
+ default:
82
+ return false;
83
+ }
84
+ }
85
+
86
+ [[nodiscard]] _CCCL_API constexpr bool __has_known_specific_arch(compute_capability __cc) noexcept
87
+ {
88
+ switch (__cc.get() * __arch_specific_id_multiplier)
89
+ {
90
+ case ::cuda::std::to_underlying(arch_id::sm_90a):
91
+ case ::cuda::std::to_underlying(arch_id::sm_100a):
92
+ case ::cuda::std::to_underlying(arch_id::sm_103a):
93
+ case ::cuda::std::to_underlying(arch_id::sm_110a):
94
+ case ::cuda::std::to_underlying(arch_id::sm_120a):
95
+ case ::cuda::std::to_underlying(arch_id::sm_121a):
96
+ return true;
97
+ default:
98
+ return false;
99
+ }
100
+ }
101
+
102
+ //! @brief Converts the compute capability to the architecture id.
103
+ //!
104
+ //! @param __cc The compute capability. Must have a corresponding architecture id.
105
+ //!
106
+ //! @returns The architecture id.
107
+ [[nodiscard]] _CCCL_API constexpr arch_id to_arch_id(compute_capability __cc) noexcept
108
+ {
109
+ _CCCL_ASSERT(::cuda::__has_known_arch(__cc), "this compute capability cannot be converted to arch id");
110
+ return static_cast<arch_id>(__cc.get());
111
+ }
112
+
113
+ //! @brief Converts the compute capability to the architecture specific id.
114
+ //!
115
+ //! @param __cc The compute capability. Must have a corresponding architecture specific id.
116
+ //!
117
+ //! @returns The architecture specific id.
118
+ [[nodiscard]] _CCCL_API constexpr arch_id to_arch_specific_id(compute_capability __cc) noexcept
119
+ {
120
+ _CCCL_ASSERT(::cuda::__has_known_specific_arch(__cc),
121
+ "this compute capability cannot be converted to arch specific id");
122
+ return static_cast<arch_id>(__cc.get() * __arch_specific_id_multiplier);
123
+ }
124
+
125
+ _CCCL_END_NAMESPACE_CUDA
126
+
127
+ #if _CCCL_CUDA_COMPILATION()
128
+
129
+ _CCCL_BEGIN_NAMESPACE_CUDA_DEVICE
130
+
131
+ //! @brief This function should cause a link error. If it happens, you are trying to compile the code for an unsupported
132
+ //! architecture (too new/old).
133
+ _CCCL_DEVICE_API ::cuda::arch_id __unknown_cuda_architecture();
134
+
135
+ //! @brief Returns the \c cuda::arch_id that is currently being compiled.
136
+ //!
137
+ //! If the current architecture is not a known architecture from \c cuda::arch_id enumeration, the compilation
138
+ //! will fail.
139
+ //!
140
+ //! @note This API cannot be used in constexpr context when compiling with nvc++ in CUDA mode.
141
+ template <class _Dummy = void>
142
+ [[nodiscard]] _CCCL_DEVICE_API _CCCL_TARGET_CONSTEXPR ::cuda::arch_id current_arch_id() noexcept
143
+ {
144
+ # if _CCCL_CUDA_COMPILER(NVHPC)
145
+ const auto __cc = ::cuda::device::current_compute_capability();
146
+ if (::cuda::__is_known_arch_of(__cc))
147
+ {
148
+ return ::cuda::to_arch_id(__cc);
149
+ }
150
+ else
151
+ {
152
+ return ::cuda::device::__unknown_cuda_architecture();
153
+ }
154
+ # elif _CCCL_DEVICE_COMPILATION()
155
+ constexpr auto __cc = ::cuda::device::current_compute_capability();
156
+ # if defined(__CUDA_ARCH_SPECIFIC__)
157
+ constexpr auto __is_known_cc = ::cuda::std::__always_false_v<_Dummy> || ::cuda::__has_known_specific_arch(__cc);
158
+ static_assert(__is_known_cc, "unknown CUDA specific architecture");
159
+ return ::cuda::to_arch_specific_id(__cc);
160
+ # else // ^^^ __CUDA_ARCH_SPECIFIC__ ^^^ / vvv !__CUDA_ARCH_SPECIFIC__ vvv
161
+ constexpr auto __is_known_cc = ::cuda::std::__always_false_v<_Dummy> || ::cuda::__has_known_arch(__cc);
162
+ static_assert(__is_known_cc, "unknown CUDA architecture");
163
+ return ::cuda::to_arch_id(__cc);
164
+ # endif // ^^^ __CUDA_ARCH_SPECIFIC__ ^^^
165
+ # else
166
+ return {};
167
+ # endif // ^^^ single-pass cuda compiler ^^^
168
+ }
169
+
170
+ _CCCL_END_NAMESPACE_CUDA_DEVICE
171
+
172
+ #endif // _CCCL_CUDA_COMPILATION()
173
+
174
+ #include <cuda/std/__cccl/epilogue.h>
175
+
176
+ #endif // _CUDA___DEVICE_ARCH_ID_H