cuda-cccl 0.1.3.2.0.dev438__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.1__cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of cuda-cccl might be problematic.

Files changed (177)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  20. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  21. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  22. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  23. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  24. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
  25. cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
  26. cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
  27. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
  30. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  31. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
  32. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  33. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  34. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
  35. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
  36. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  37. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  38. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
  39. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  46. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  49. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
  52. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  55. cuda/cccl/headers/include/cub/util_device.cuh +51 -35
  56. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  57. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  58. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  59. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  60. cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
  61. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  62. cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
  63. cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
  64. cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
  65. cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
  66. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  67. cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
  68. cuda/cccl/headers/include/cuda/__event/event.h +8 -8
  69. cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
  70. cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
  71. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  72. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  73. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
  74. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
  75. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
  76. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  77. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
  78. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  79. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
  80. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  81. cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
  82. cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
  83. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
  84. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
  85. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  86. cuda/cccl/headers/include/cuda/algorithm +1 -1
  87. cuda/cccl/headers/include/cuda/devices +10 -0
  88. cuda/cccl/headers/include/cuda/iterator +1 -0
  89. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  90. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  91. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  92. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
  93. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
  94. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  95. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  96. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  97. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
  98. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
  99. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
  100. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
  101. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
  102. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
  103. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
  104. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  105. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
  106. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  107. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  108. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  109. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
  110. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  111. cuda/cccl/headers/include/cuda/std/version +1 -4
  112. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  113. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  114. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  115. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
  116. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
  117. cuda/cccl/parallel/experimental/__init__.py +21 -70
  118. cuda/compute/__init__.py +77 -0
  119. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
  120. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
  121. cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
  122. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  123. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  124. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  125. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
  126. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
  127. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  128. cuda/compute/algorithms/_three_way_partition.py +261 -0
  129. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  130. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  131. cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  132. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  133. cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  134. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  135. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  136. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  137. cuda/coop/__init__.py +8 -0
  138. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  139. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  140. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  141. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  142. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  143. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  144. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  145. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  146. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  147. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  148. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  149. cuda/coop/warp/__init__.py +9 -0
  150. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  151. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  152. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  153. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
  154. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
  155. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  156. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  157. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
  158. cuda/cccl/parallel/experimental/.gitignore +0 -4
  159. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  160. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  161. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  162. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  163. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  164. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  165. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  166. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  167. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  168. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  169. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  170. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  171. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  172. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  173. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  174. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  175. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  176. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  177. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh
@@ -84,70 +84,6 @@ CUB_NAMESPACE_BEGIN
 //! @endrst
 struct DeviceSegmentedReduce
 {
-private:
-  template <typename InputIteratorT,
-            typename OutputIteratorT,
-            typename BeginOffsetIteratorT,
-            typename EndOffsetIteratorT,
-            typename OffsetT,
-            typename ReductionOpT,
-            typename InitT,
-            typename... Ts>
-  CUB_RUNTIME_FUNCTION static cudaError_t segmented_reduce(
-    ::cuda::std::false_type,
-    void* d_temp_storage,
-    size_t& temp_storage_bytes,
-    InputIteratorT d_in,
-    OutputIteratorT d_out,
-    ::cuda::std::int64_t num_segments,
-    BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets,
-    ReductionOpT reduction_op,
-    InitT initial_value,
-    cudaStream_t stream);
-
-  template <typename InputIteratorT,
-            typename OutputIteratorT,
-            typename BeginOffsetIteratorT,
-            typename EndOffsetIteratorT,
-            typename OffsetT,
-            typename ReductionOpT,
-            typename InitT,
-            typename... Ts>
-  CUB_RUNTIME_FUNCTION static cudaError_t segmented_reduce(
-    ::cuda::std::true_type,
-    void* d_temp_storage,
-    size_t& temp_storage_bytes,
-    InputIteratorT d_in,
-    OutputIteratorT d_out,
-    ::cuda::std::int64_t num_segments,
-    BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets,
-    ReductionOpT reduction_op,
-    InitT initial_value,
-    cudaStream_t stream)
-  {
-    return DispatchSegmentedReduce<
-      InputIteratorT,
-      OutputIteratorT,
-      BeginOffsetIteratorT,
-      EndOffsetIteratorT,
-      OffsetT,
-      ReductionOpT,
-      InitT,
-      Ts...>::Dispatch(d_temp_storage,
-                       temp_storage_bytes,
-                       d_in,
-                       d_out,
-                       num_segments,
-                       d_begin_offsets,
-                       d_end_offsets,
-                       reduction_op,
-                       initial_value,
-                       stream);
-  }
-
-public:
   //! @rst
   //! Computes a device-wide segmented reduction using the specified
   //! binary ``reduction_op`` functor.
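The private tag-dispatch helpers deleted above are not replaced one for one: the hunks that follow inline each DispatchSegmentedReduce call behind a static_assert plus `if constexpr` guard. A minimal standalone sketch of that pattern (hypothetical names, plain std:: traits in place of cuda::std::):

#include <cstdio>
#include <type_traits>

// The static_assert emits one clear diagnostic when OffsetT is not
// integral, while `if constexpr` keeps the dispatch body from being
// instantiated at all, so the misuse does not also trigger a cascade
// of template errors from the dispatch machinery.
template <typename OffsetT>
int dispatch_sketch(OffsetT num_items)
{
  static_assert(std::is_integral_v<OffsetT>, "Offset type should be integral.");
  if constexpr (std::is_integral_v<OffsetT>)
  {
    return static_cast<int>(num_items); // stand-in for DispatchSegmentedReduce<...>::Dispatch(...)
  }
  return -1; // never reached; the real code marks this path _CCCL_UNREACHABLE()
}

int main()
{
  std::printf("%d\n", dispatch_sketch(42)); // prints 42
  // dispatch_sketch(3.5); // would fail the static_assert with a single message
  return 0;
}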
@@ -261,24 +197,29 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Reduce");

-    // Integer type for global offsets
-    using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
-    using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
-
-    static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
-
-    return segmented_reduce<InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT, ReductionOpT>(
-      integral_offset_check{},
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in,
-      d_out,
-      num_segments,
-      d_begin_offsets,
-      d_end_offsets,
-      reduction_op,
-      initial_value, // zero-initialize
-      stream);
+    using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
+    static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+    if constexpr (::cuda::std::is_integral_v<OffsetT>)
+    {
+      return DispatchSegmentedReduce<
+        InputIteratorT,
+        OutputIteratorT,
+        BeginOffsetIteratorT,
+        EndOffsetIteratorT,
+        OffsetT,
+        ReductionOpT,
+        T>::Dispatch(d_temp_storage,
+                     temp_storage_bytes,
+                     d_in,
+                     d_out,
+                     num_segments,
+                     d_begin_offsets,
+                     d_end_offsets,
+                     reduction_op,
+                     initial_value, // zero-initialize
+                     stream);
+    }
+    _CCCL_UNREACHABLE();
   }

   //! @rst
@@ -465,32 +406,31 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Sum");

-    // Integer type for global offsets
     using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
-
-    // The output value type
-    using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
-    using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
-
-    static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
-
-    return segmented_reduce<InputIteratorT,
-                            OutputIteratorT,
-                            BeginOffsetIteratorT,
-                            EndOffsetIteratorT,
-                            OffsetT,
-                            ::cuda::std::plus<>>(
-      integral_offset_check{},
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in,
-      d_out,
-      num_segments,
-      d_begin_offsets,
-      d_end_offsets,
-      ::cuda::std::plus<>{},
-      OutputT(), // zero-initialize
-      stream);
+    using OutputT = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;
+    using init_t  = OutputT;
+    static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+    if constexpr (::cuda::std::is_integral_v<OffsetT>)
+    {
+      return DispatchSegmentedReduce<
+        InputIteratorT,
+        OutputIteratorT,
+        BeginOffsetIteratorT,
+        EndOffsetIteratorT,
+        OffsetT,
+        ::cuda::std::plus<>,
+        init_t>::Dispatch(d_temp_storage,
+                          temp_storage_bytes,
+                          d_in,
+                          d_out,
+                          num_segments,
+                          d_begin_offsets,
+                          d_end_offsets,
+                          ::cuda::std::plus<>{},
+                          init_t{}, // zero-initialize
+                          stream);
+    }
+    _CCCL_UNREACHABLE();
   }

   //! @rst
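For orientation, the public call signature of Sum is unchanged by this refactor; only the internals moved from tag dispatch to `if constexpr`. A usage sketch of the standard two-phase call (device buffers assumed allocated and populated by the caller):

#include <cub/device/device_segmented_reduce.cuh>

// Sums each segment of d_in into d_out[segment]. d_offsets holds
// num_segments + 1 offsets, e.g. {0, 3, 3, 7} for three segments of
// sizes 3, 0, and 4; the empty segment is zero-initialized.
cudaError_t segmented_sum(
  const int* d_in, int* d_out, const int* d_offsets, int num_segments, cudaStream_t stream)
{
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  // First call with d_temp_storage == nullptr only queries the size.
  cub::DeviceSegmentedReduce::Sum(
    d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_offsets, d_offsets + 1, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Second call runs the reduction.
  cudaError_t status = cub::DeviceSegmentedReduce::Sum(
    d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_offsets, d_offsets + 1, stream);
  cudaFree(d_temp_storage);
  return status;
}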
@@ -556,9 +496,7 @@ public:
     // `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
     // integral constant or larger integral types
     using offset_t = int;
-
-    // The output value type
-    using output_t = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
+    using output_t = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;

     return detail::reduce::
       DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::std::plus<>, output_t>::Dispatch(
@@ -673,32 +611,31 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Min");

-    // Integer type for global offsets
     using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
-
-    // The input value type
-    using InputT = cub::detail::it_value_t<InputIteratorT>;
-    using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
-
-    static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
-
-    return segmented_reduce<InputIteratorT,
-                            OutputIteratorT,
-                            BeginOffsetIteratorT,
-                            EndOffsetIteratorT,
-                            OffsetT,
-                            ::cuda::minimum<>>(
-      integral_offset_check{},
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in,
-      d_out,
-      num_segments,
-      d_begin_offsets,
-      d_end_offsets,
-      ::cuda::minimum<>{},
-      ::cuda::std::numeric_limits<InputT>::max(),
-      stream);
+    using InputT = detail::it_value_t<InputIteratorT>;
+    using init_t = InputT;
+    static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+    if constexpr (::cuda::std::is_integral_v<OffsetT>)
+    {
+      return DispatchSegmentedReduce<
+        InputIteratorT,
+        OutputIteratorT,
+        BeginOffsetIteratorT,
+        EndOffsetIteratorT,
+        OffsetT,
+        ::cuda::minimum<>,
+        init_t>::Dispatch(d_temp_storage,
+                          temp_storage_bytes,
+                          d_in,
+                          d_out,
+                          num_segments,
+                          d_begin_offsets,
+                          d_end_offsets,
+                          ::cuda::minimum<>{},
+                          ::cuda::std::numeric_limits<init_t>::max(),
+                          stream);
+    }
+    _CCCL_UNREACHABLE();
   }

   //! @rst
@@ -769,9 +706,7 @@ public:
     // `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
     // integral constant or larger integral types
     using offset_t = int;
-
-    // The input value type
-    using input_t = cub::detail::it_value_t<InputIteratorT>;
+    using input_t = detail::it_value_t<InputIteratorT>;

     return detail::reduce::
       DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::minimum<>, input_t>::Dispatch(
@@ -890,54 +825,45 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMin");

-    // Integer type for global offsets
     // Using common iterator value type is a breaking change, see:
     // https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615
     using OffsetT = int; // detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;

-    // The input type
-    using InputValueT = cub::detail::it_value_t<InputIteratorT>;
-
-    // The output tuple type
-    using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
-
-    // The output value type
+    using InputValueT  = detail::it_value_t<InputIteratorT>;
+    using OutputTupleT = detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
     using OutputValueT = typename OutputTupleT::Value;
-
-    using AccumT = OutputTupleT;
-
-    using InitT = detail::reduce::empty_problem_init_t<AccumT>;
+    using AccumT = OutputTupleT;
+    using InitT  = detail::reduce::empty_problem_init_t<AccumT>;

     // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
     using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
-
     ArgIndexInputIteratorT d_indexed_in(d_in);

-    // Initial value
     InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};

-    using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
-    static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
-
-    return segmented_reduce<ArgIndexInputIteratorT,
-                            OutputIteratorT,
-                            BeginOffsetIteratorT,
-                            EndOffsetIteratorT,
-                            OffsetT,
-                            cub::ArgMin,
-                            InitT,
-                            AccumT>(
-      integral_offset_check{},
-      d_temp_storage,
-      temp_storage_bytes,
-      d_indexed_in,
-      d_out,
-      num_segments,
-      d_begin_offsets,
-      d_end_offsets,
-      cub::ArgMin(),
-      initial_value,
-      stream);
+    static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+    if constexpr (::cuda::std::is_integral_v<OffsetT>)
+    {
+      return DispatchSegmentedReduce<
+        ArgIndexInputIteratorT,
+        OutputIteratorT,
+        BeginOffsetIteratorT,
+        EndOffsetIteratorT,
+        OffsetT,
+        cub::ArgMin,
+        InitT,
+        AccumT>::Dispatch(d_temp_storage,
+                          temp_storage_bytes,
+                          d_indexed_in,
+                          d_out,
+                          num_segments,
+                          d_begin_offsets,
+                          d_end_offsets,
+                          cub::ArgMin{},
+                          initial_value,
+                          stream);
+    }
+    _CCCL_UNREACHABLE();
   }

   //! @rst
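The initial_value visible in this hunk doubles as the result for empty segments. A small sketch of the output side (hypothetical kernel; float input assumed):

#include <cub/device/device_segmented_reduce.cuh>

#include <cuda/std/limits>

// Each ArgMin result is a cub::KeyValuePair: `key` is the index of the
// minimum within its segment, `value` the minimum itself. Matching the
// initial_value above, an empty segment yields
// {1, cuda::std::numeric_limits<float>::max()}.
using arg_min_result_t = cub::KeyValuePair<int, float>;

__global__ void flag_empty_segments(const arg_min_result_t* d_results, bool* d_empty, int num_segments)
{
  const int seg = blockIdx.x * blockDim.x + threadIdx.x;
  if (seg < num_segments)
  {
    const arg_min_result_t r = d_results[seg];
    d_empty[seg] = (r.key == 1) && (r.value == ::cuda::std::numeric_limits<float>::max());
  }
}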
@@ -1144,27 +1070,32 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Max");

-    // Integer type for global offsets
     using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
-
-    // The input value type
-    using InputT = cub::detail::it_value_t<InputIteratorT>;
-
-    using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
-    static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
-
-    return segmented_reduce<InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT>(
-      integral_offset_check{},
-      d_temp_storage,
-      temp_storage_bytes,
-      d_in,
-      d_out,
-      num_segments,
-      d_begin_offsets,
-      d_end_offsets,
-      ::cuda::maximum<>{},
-      ::cuda::std::numeric_limits<InputT>::lowest(),
-      stream);
+    using InputT = cub::detail::it_value_t<InputIteratorT>;
+    using init_t = InputT;
+
+    static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+    if constexpr (::cuda::std::is_integral_v<OffsetT>)
+    {
+      return DispatchSegmentedReduce<
+        InputIteratorT,
+        OutputIteratorT,
+        BeginOffsetIteratorT,
+        EndOffsetIteratorT,
+        OffsetT,
+        ::cuda::maximum<>,
+        init_t>::Dispatch(d_temp_storage,
+                          temp_storage_bytes,
+                          d_in,
+                          d_out,
+                          num_segments,
+                          d_begin_offsets,
+                          d_end_offsets,
+                          ::cuda::maximum<>{},
+                          ::cuda::std::numeric_limits<init_t>::lowest(),
+                          stream);
+    }
+    _CCCL_UNREACHABLE();
   }

   //! @rst
@@ -1229,9 +1160,7 @@ public:
     // `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
     // integral constant or larger integral types
     using offset_t = int;
-
-    // The input value type
-    using input_t = cub::detail::it_value_t<InputIteratorT>;
+    using input_t = detail::it_value_t<InputIteratorT>;

     return detail::reduce::
       DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::maximum<>, input_t>::Dispatch(
@@ -1353,54 +1282,45 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMax");

-    // Integer type for global offsets
     // Using common iterator value type is a breaking change, see:
     // https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615
     using OffsetT = int; // detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;

-    // The input type
-    using InputValueT = cub::detail::it_value_t<InputIteratorT>;
-
-    // The output tuple type
+    using InputValueT  = cub::detail::it_value_t<InputIteratorT>;
     using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
-
-    using AccumT = OutputTupleT;
-
-    using InitT = detail::reduce::empty_problem_init_t<AccumT>;
-
-    // The output value type
+    using AccumT = OutputTupleT;
+    using InitT  = detail::reduce::empty_problem_init_t<AccumT>;
     using OutputValueT = typename OutputTupleT::Value;

     // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
     using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
-
     ArgIndexInputIteratorT d_indexed_in(d_in);

-    // Initial value
     InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};

-    using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
-    static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
-
-    return segmented_reduce<ArgIndexInputIteratorT,
-                            OutputIteratorT,
-                            BeginOffsetIteratorT,
-                            EndOffsetIteratorT,
-                            OffsetT,
-                            cub::ArgMax,
-                            InitT,
-                            AccumT>(
-      integral_offset_check{},
-      d_temp_storage,
-      temp_storage_bytes,
-      d_indexed_in,
-      d_out,
-      num_segments,
-      d_begin_offsets,
-      d_end_offsets,
-      cub::ArgMax(),
-      initial_value,
-      stream);
+    static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+    if constexpr (::cuda::std::is_integral_v<OffsetT>)
+    {
+      return DispatchSegmentedReduce<
+        ArgIndexInputIteratorT,
+        OutputIteratorT,
+        BeginOffsetIteratorT,
+        EndOffsetIteratorT,
+        OffsetT,
+        cub::ArgMax,
+        InitT,
+        AccumT>::Dispatch(d_temp_storage,
+                          temp_storage_bytes,
+                          d_indexed_in,
+                          d_out,
+                          num_segments,
+                          d_begin_offsets,
+                          d_end_offsets,
+                          cub::ArgMax{},
+                          initial_value,
+                          stream);
+    }
+    _CCCL_UNREACHABLE();
   }

   //! @rst
@@ -1476,34 +1396,25 @@ public:
     // integral constant or larger integral types
     using input_t = int;

-    // The input type
-    using input_value_t = cub::detail::it_value_t<InputIteratorT>;
-
-    // The output tuple type
-    using output_tuple_t = cub::detail::non_void_value_t<OutputIteratorT, ::cuda::std::pair<input_t, input_value_t>>;
-
-    using accum_t = output_tuple_t;
-
-    using init_t = detail::reduce::empty_problem_init_t<accum_t>;
-
-    // The output value type
+    using input_value_t  = detail::it_value_t<InputIteratorT>;
+    using output_tuple_t = detail::non_void_value_t<OutputIteratorT, ::cuda::std::pair<input_t, input_value_t>>;
+    using accum_t        = output_tuple_t;
+    using init_t         = detail::reduce::empty_problem_init_t<accum_t>;
     using output_value_t = typename output_tuple_t::second_type;

     // Wrapped input iterator to produce index-value <input_t, InputT> tuples
     auto d_indexed_in = THRUST_NS_QUALIFIER::make_transform_iterator(
       THRUST_NS_QUALIFIER::counting_iterator<::cuda::std::int64_t>{0},
       detail::reduce::generate_idx_value<InputIteratorT, output_value_t>(d_in, segment_size));
-
     using arg_index_input_iterator_t = decltype(d_indexed_in);

-    // Initial value
     init_t initial_value{accum_t(1, ::cuda::std::numeric_limits<input_value_t>::lowest())};

     return detail::reduce::DispatchFixedSizeSegmentedReduce<
       arg_index_input_iterator_t,
       OutputIteratorT,
       input_t,
-      cub::detail::arg_max,
+      detail::arg_max,
       init_t,
       accum_t>::Dispatch(d_temp_storage,
                          temp_storage_bytes,
@@ -1511,7 +1422,7 @@ public:
                          d_out,
                          num_segments,
                          segment_size,
-                         cub::detail::arg_max(),
+                         detail::arg_max(),
                          initial_value,
                          stream);
   }
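The fixed-size ArgMax path above builds its index-value pairs by transforming a counting iterator rather than using ArgIndexInputIterator. A standalone sketch of that idiom with plain Thrust types (generate_idx_value is internal; the functor below is a hypothetical stand-in that indexes within a segment of known size):

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include <cuda/std/utility>

// Maps a flat element index i to an (index-within-segment, value) pair,
// mirroring the <input_t, input_value_t> tuples in the hunk above.
struct idx_value_of
{
  const float* data;
  long long segment_size;

  __host__ __device__ cuda::std::pair<int, float> operator()(long long i) const
  {
    return {static_cast<int>(i % segment_size), data[i]};
  }
};

// Usage: an iterator yielding {0, data[0]}, {1, data[1]}, ... that a
// reduction with an arg-max operator can consume directly.
inline auto make_indexed(const float* d_data, long long segment_size)
{
  return thrust::make_transform_iterator(
    thrust::counting_iterator<long long>{0}, idx_value_of{d_data, segment_size});
}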
cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh (new file)
@@ -0,0 +1,79 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+//! @file
+#pragma once
+
+#include <cub/config.cuh>
+
+#include <cuda/std/__type_traits/is_same.h>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cub/device/device_for.cuh>
+#include <cub/device/device_transform.cuh>
+#include <cub/util_debug.cuh>
+
+#include <cuda/std/functional>
+#include <cuda/std/mdspan>
+
+CUB_NAMESPACE_BEGIN
+
+namespace detail::copy_mdspan
+{
+
+template <typename MdspanIn, typename MdspanOut>
+struct copy_mdspan_t
+{
+  MdspanIn mdspan_in;
+  MdspanOut mdspan_out;
+
+  _CCCL_API copy_mdspan_t(MdspanIn mdspan_in, MdspanOut mdspan_out)
+      : mdspan_in{mdspan_in}
+      , mdspan_out{mdspan_out}
+  {}
+
+  template <typename Idx, typename... Indices>
+  _CCCL_DEVICE_API _CCCL_FORCEINLINE void operator()(Idx, Indices... indices)
+  {
+    mdspan_out(indices...) = mdspan_in(indices...);
+  }
+};
+
+template <typename T_In,
+          typename E_In,
+          typename L_In,
+          typename A_In,
+          typename T_Out,
+          typename E_Out,
+          typename L_Out,
+          typename A_Out>
+[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
+copy(::cuda::std::mdspan<T_In, E_In, L_In, A_In> mdspan_in,
+     ::cuda::std::mdspan<T_Out, E_Out, L_Out, A_Out> mdspan_out,
+     ::cudaStream_t stream)
+{
+  if (mdspan_in.is_exhaustive() && mdspan_out.is_exhaustive()
+      && detail::have_same_strides(mdspan_in.mapping(), mdspan_out.mapping()))
+  {
+    return cub::DeviceTransform::Transform(
+      mdspan_in.data_handle(),
+      mdspan_out.data_handle(),
+      mdspan_in.size(),
+      ::cuda::proclaim_copyable_arguments(::cuda::std::identity{}),
+      stream);
+  }
+  // TODO (fbusato): add ForEachInLayout when mdspan_in and mdspan_out have compatible layouts
+  // Compatible layouts could use more efficient iteration patterns
+  return cub::DeviceFor::ForEachInExtents(mdspan_in.extents(), copy_mdspan_t{mdspan_in, mdspan_out}, stream);
+}
+
+} // namespace detail::copy_mdspan
+
+CUB_NAMESPACE_END
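A usage sketch of the new helper, assuming contiguous row-major device buffers. Note that detail::copy_mdspan::copy is an internal entry point, not a stable API; the public surface is presumably the mdspan support added to device_copy.cuh (+116 -27 in the file list above):

#include <cub/device/dispatch/dispatch_copy_mdspan.cuh>

#include <cuda/std/mdspan>

// Copies a rows x cols matrix. Both views are exhaustive with identical
// strides, so this takes the DeviceTransform fast path; views with
// mismatched layouts fall back to ForEachInExtents with copy_mdspan_t.
cudaError_t copy_matrix(const float* d_src, float* d_dst, int rows, int cols, cudaStream_t stream)
{
  using extents_t = cuda::std::dextents<int, 2>;
  cuda::std::mdspan<const float, extents_t> src{d_src, rows, cols};
  cuda::std::mdspan<float, extents_t> dst{d_dst, rows, cols};
  return cub::detail::copy_mdspan::copy(src, dst, stream);
}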
cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh
@@ -144,11 +144,11 @@ __launch_bounds__(
     auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage);
     MergeAgent{
       temp_storage.Alias(),
-      try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(keys1),
-      try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(items1),
+      keys1,
+      items1,
       num_keys1,
-      try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(keys2),
-      try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(items2),
+      keys2,
+      items2,
       num_keys2,
       keys_result,
       items_result,
cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -44,7 +44,6 @@
 #  pragma system_header
 #endif // no system header

-#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
 #include <cub/device/dispatch/kernels/radix_sort.cuh>
 #include <cub/device/dispatch/tuning/tuning_radix_sort.cuh>
 #include <cub/util_debug.cuh>
@@ -1379,14 +1378,6 @@ struct DispatchSegmentedRadixSort
     // Number of radix sort invocations until all segments have been processed
     const auto num_invocations = ::cuda::ceil_div(num_segments, max_num_segments_per_invocation);

-    // If d_begin_offsets and d_end_offsets do not support operator+ then we can't have more than
-    // max_num_segments_per_invocation segments per invocation
-    if (num_invocations > 1
-        && !detail::all_iterators_support_add_assign_operator(::cuda::std::int64_t{}, d_begin_offsets, d_end_offsets))
-    {
-      return cudaErrorInvalidValue;
-    }
-
     BeginOffsetIteratorT begin_offsets_current_it = d_begin_offsets;
     EndOffsetIteratorT end_offsets_current_it     = d_end_offsets;

@@ -1435,8 +1426,8 @@

     if (invocation_index + 1 < num_invocations)
     {
-      detail::advance_iterators_inplace_if_supported(begin_offsets_current_it, num_current_segments);
-      detail::advance_iterators_inplace_if_supported(end_offsets_current_it, num_current_segments);
+      begin_offsets_current_it += num_current_segments;
+      end_offsets_current_it += num_current_segments;
     }

     // Sync the stream if specified to flush runtime errors
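The two hunks above remove the conditional-advance machinery outright: the batching loop now advances the offset iterators with plain compound addition. A minimal illustration of what that requires of BeginOffsetIteratorT and EndOffsetIteratorT (hypothetical helper; raw pointers and counting iterators qualify):

#include <cstdint>

// Offset iterators passed to the segmented radix sort must now support
// operator+= on the multi-invocation path. An iterator without it fails
// to compile here, where the removed code previously returned
// cudaErrorInvalidValue at runtime when num_invocations > 1.
template <typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
void advance_offsets(BeginOffsetIteratorT& begin_it, EndOffsetIteratorT& end_it, std::int64_t num_current_segments)
{
  begin_it += num_current_segments;
  end_it += num_current_segments;
}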