cuda-cccl 0.1.3.2.0.dev438__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (177) hide show
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  20. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  21. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  22. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  23. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  24. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
  25. cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
  26. cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
  27. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
  30. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  31. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
  32. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  33. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  34. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
  35. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
  36. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  37. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  38. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
  39. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  46. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  49. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
  52. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  55. cuda/cccl/headers/include/cub/util_device.cuh +51 -35
  56. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  57. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  58. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  59. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  60. cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
  61. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  62. cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
  63. cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
  64. cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
  65. cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
  66. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  67. cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
  68. cuda/cccl/headers/include/cuda/__event/event.h +8 -8
  69. cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
  70. cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
  71. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  72. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  73. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
  74. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
  75. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
  76. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  77. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
  78. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  79. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
  80. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  81. cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
  82. cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
  83. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
  84. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
  85. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  86. cuda/cccl/headers/include/cuda/algorithm +1 -1
  87. cuda/cccl/headers/include/cuda/devices +10 -0
  88. cuda/cccl/headers/include/cuda/iterator +1 -0
  89. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  90. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  91. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  92. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
  93. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
  94. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  95. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  96. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  97. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
  98. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
  99. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
  100. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
  101. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
  102. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
  103. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
  104. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  105. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
  106. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  107. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  108. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  109. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
  110. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  111. cuda/cccl/headers/include/cuda/std/version +1 -4
  112. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  113. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  114. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  115. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
  116. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
  117. cuda/cccl/parallel/experimental/__init__.py +21 -70
  118. cuda/compute/__init__.py +77 -0
  119. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
  120. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
  121. cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
  122. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  123. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  124. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  125. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
  126. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
  127. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  128. cuda/compute/algorithms/_three_way_partition.py +261 -0
  129. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  130. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  131. cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  132. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  133. cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  134. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  135. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  136. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  137. cuda/coop/__init__.py +8 -0
  138. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  139. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  140. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  141. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  142. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  143. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  144. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  145. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  146. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  147. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  148. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  149. cuda/coop/warp/__init__.py +9 -0
  150. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  151. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  152. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  153. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
  154. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
  155. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  156. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  157. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
  158. cuda/cccl/parallel/experimental/.gitignore +0 -4
  159. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  160. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  161. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  162. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  163. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  164. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  165. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  166. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  167. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  168. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  169. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  170. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  171. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  172. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  173. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  174. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  175. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  176. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  177. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -47,9 +47,7 @@
47
47
 
48
48
  CUB_NAMESPACE_BEGIN
49
49
 
50
- namespace detail
51
- {
52
- namespace reduce
50
+ namespace detail::reduce
53
51
  {
54
52
 
55
53
  /**
@@ -172,6 +170,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
172
170
  AccumT,
173
171
  TransformOpT>;
174
172
 
173
+ static_assert(sizeof(typename AgentReduceT::TempStorage) <= max_smem_per_block,
174
+ "cub::DeviceReduce ran out of CUDA shared memory, which we judged to be extremely unlikely. Please "
175
+ "file an issue at: https://github.com/NVIDIA/cccl/issues");
176
+
175
177
  // Shared memory storage
176
178
  __shared__ typename AgentReduceT::TempStorage temp_storage;
177
179
 
@@ -253,6 +255,10 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
253
255
  AccumT,
254
256
  TransformOpT>;
255
257
 
258
+ static_assert(sizeof(typename AgentReduceT::TempStorage) <= max_smem_per_block,
259
+ "cub::DeviceReduce ran out of CUDA shared memory, which we judged to be extremely unlikely. Please "
260
+ "file an issue at: https://github.com/NVIDIA/cccl/issues");
261
+
256
262
  // Shared memory storage
257
263
  __shared__ typename AgentReduceT::TempStorage temp_storage;
258
264
 
@@ -572,7 +578,6 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
572
578
  }
573
579
  }
574
580
 
575
- } // namespace reduce
576
- } // namespace detail
581
+ } // namespace detail::reduce
577
582
 
578
583
  CUB_NAMESPACE_END
@@ -42,9 +42,7 @@
42
42
 
43
43
  CUB_NAMESPACE_BEGIN
44
44
 
45
- namespace detail
46
- {
47
- namespace scan
45
+ namespace detail::scan
48
46
  {
49
47
 
50
48
  /******************************************************************************
@@ -186,7 +184,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
186
184
  AgentScanT(temp_storage, d_in, d_out, scan_op, real_init_value).ConsumeRange(num_items, tile_state, start_tile);
187
185
  }
188
186
 
189
- } // namespace scan
190
- } // namespace detail
187
+ } // namespace detail::scan
191
188
 
192
189
  CUB_NAMESPACE_END
@@ -43,9 +43,7 @@
43
43
 
44
44
  CUB_NAMESPACE_BEGIN
45
45
 
46
- namespace detail
47
- {
48
- namespace reduce
46
+ namespace detail::reduce
49
47
  {
50
48
 
51
49
  /// Normalize input iterator to segment offset
@@ -318,7 +316,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
318
316
  }
319
317
  }
320
318
 
321
- } // namespace reduce
322
- } // namespace detail
319
+ } // namespace detail::reduce
323
320
 
324
321
  CUB_NAMESPACE_END
@@ -29,6 +29,56 @@ using local_segment_index_t = ::cuda::std::uint32_t;
29
29
  // Type used for total number of segments and to index within segments globally
30
30
  using global_segment_offset_t = ::cuda::std::int64_t;
31
31
 
32
+ template <typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
33
+ struct LargeSegmentsSelectorT
34
+ {
35
+ OffsetT value{};
36
+ BeginOffsetIteratorT d_offset_begin{};
37
+ EndOffsetIteratorT d_offset_end{};
38
+ global_segment_offset_t base_segment_offset{};
39
+
40
+ #if !_CCCL_COMPILER(NVRTC)
41
+ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE
42
+ LargeSegmentsSelectorT(OffsetT value, BeginOffsetIteratorT d_offset_begin, EndOffsetIteratorT d_offset_end)
43
+ : value(value)
44
+ , d_offset_begin(d_offset_begin)
45
+ , d_offset_end(d_offset_end)
46
+ {}
47
+ #endif
48
+
49
+ _CCCL_DEVICE _CCCL_FORCEINLINE bool operator()(local_segment_index_t segment_id) const
50
+ {
51
+ const OffsetT segment_size =
52
+ d_offset_end[base_segment_offset + segment_id] - d_offset_begin[base_segment_offset + segment_id];
53
+ return segment_size > value;
54
+ }
55
+ };
56
+
57
+ template <typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
58
+ struct SmallSegmentsSelectorT
59
+ {
60
+ OffsetT value{};
61
+ BeginOffsetIteratorT d_offset_begin{};
62
+ EndOffsetIteratorT d_offset_end{};
63
+ global_segment_offset_t base_segment_offset{};
64
+
65
+ #if !_CCCL_COMPILER(NVRTC)
66
+ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE
67
+ SmallSegmentsSelectorT(OffsetT value, BeginOffsetIteratorT d_offset_begin, EndOffsetIteratorT d_offset_end)
68
+ : value(value)
69
+ , d_offset_begin(d_offset_begin)
70
+ , d_offset_end(d_offset_end)
71
+ {}
72
+ #endif
73
+
74
+ _CCCL_DEVICE _CCCL_FORCEINLINE bool operator()(local_segment_index_t segment_id) const
75
+ {
76
+ const OffsetT segment_size =
77
+ d_offset_end[base_segment_offset + segment_id] - d_offset_begin[base_segment_offset + segment_id];
78
+ return segment_size < value;
79
+ }
80
+ };
81
+
32
82
  /**
33
83
  * @brief Fallback kernel, in case there's not enough segments to
34
84
  * take advantage of partitioning.
@@ -89,7 +139,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD
89
139
  {
90
140
  using ActivePolicyT = typename ChainedPolicyT::ActivePolicy;
91
141
  using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy;
92
- using MediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT::MediumPolicyT;
142
+ using MediumPolicyT = typename ActivePolicyT::MediumSegmentPolicy;
93
143
 
94
144
  const auto segment_id = static_cast<local_segment_index_t>(blockIdx.x);
95
145
  OffsetT segment_begin = d_begin_offsets[segment_id];
@@ -253,7 +303,7 @@ template <SortOrder Order,
253
303
  typename BeginOffsetIteratorT,
254
304
  typename EndOffsetIteratorT,
255
305
  typename OffsetT>
256
- __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolicyT::BLOCK_THREADS)
306
+ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallSegmentPolicy::BLOCK_THREADS)
257
307
  CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelSmall(
258
308
  local_segment_index_t small_segments,
259
309
  local_segment_index_t medium_segments,
@@ -272,10 +322,9 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic
272
322
  const local_segment_index_t tid = threadIdx.x;
273
323
  const local_segment_index_t bid = blockIdx.x;
274
324
 
275
- using ActivePolicyT = typename ChainedPolicyT::ActivePolicy;
276
- using SmallAndMediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT;
277
- using MediumPolicyT = typename SmallAndMediumPolicyT::MediumPolicyT;
278
- using SmallPolicyT = typename SmallAndMediumPolicyT::SmallPolicyT;
325
+ using ActivePolicyT = typename ChainedPolicyT::ActivePolicy;
326
+ using SmallPolicyT = typename ActivePolicyT::SmallSegmentPolicy;
327
+ using MediumPolicyT = typename ActivePolicyT::MediumSegmentPolicy;
279
328
 
280
329
  constexpr auto threads_per_medium_segment = static_cast<local_segment_index_t>(MediumPolicyT::WARP_THREADS);
281
330
  constexpr auto threads_per_small_segment = static_cast<local_segment_index_t>(SmallPolicyT::WARP_THREADS);
@@ -286,11 +335,9 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic
286
335
  using SmallAgentWarpMergeSortT =
287
336
  sub_warp_merge_sort::AgentSubWarpSort<Order == SortOrder::Descending, SmallPolicyT, KeyT, ValueT, OffsetT>;
288
337
 
289
- constexpr auto segments_per_medium_block =
290
- static_cast<local_segment_index_t>(SmallAndMediumPolicyT::SEGMENTS_PER_MEDIUM_BLOCK);
338
+ constexpr auto segments_per_medium_block = static_cast<local_segment_index_t>(MediumPolicyT::SEGMENTS_PER_BLOCK);
291
339
 
292
- constexpr auto segments_per_small_block =
293
- static_cast<local_segment_index_t>(SmallAndMediumPolicyT::SEGMENTS_PER_SMALL_BLOCK);
340
+ constexpr auto segments_per_small_block = static_cast<local_segment_index_t>(SmallPolicyT::SEGMENTS_PER_BLOCK);
294
341
 
295
342
  __shared__ union
296
343
  {
@@ -202,14 +202,18 @@ _CCCL_HOST_DEVICE _CCCL_CONSTEVAL auto load_store_type()
202
202
  }
203
203
  }
204
204
 
205
- template <typename VectorizedPolicy, typename Offset, typename F, typename RandomAccessIteratorOut, typename... InputT>
205
+ template <typename VectorizedPolicy,
206
+ typename Offset,
207
+ typename F,
208
+ typename RandomAccessIteratorOut,
209
+ typename... RandomAccessIteratorsIn>
206
210
  _CCCL_DEVICE void transform_kernel_vectorized(
207
211
  Offset num_items,
208
212
  int num_elem_per_thread_prefetch,
209
213
  bool can_vectorize,
210
214
  F f,
211
215
  RandomAccessIteratorOut out,
212
- const InputT*... ins)
216
+ RandomAccessIteratorsIn... ins)
213
217
  {
214
218
  constexpr int block_dim = VectorizedPolicy::block_threads;
215
219
  constexpr int items_per_thread = VectorizedPolicy::items_per_thread_vectorized;
@@ -240,9 +244,12 @@ _CCCL_DEVICE void transform_kernel_vectorized(
240
244
  constexpr int load_store_size = VectorizedPolicy::load_store_word_size;
241
245
  using load_store_t = decltype(load_store_type<load_store_size>());
242
246
  using output_t = it_value_t<RandomAccessIteratorOut>;
243
- using result_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<F, const InputT&...>>;
247
+ using result_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<F, const it_value_t<RandomAccessIteratorsIn>&...>>;
244
248
  // picks output type size if there are no inputs
245
- constexpr int element_size = int{first_item(sizeof(InputT)..., size_of<output_t>)};
249
+ constexpr int element_size = int{first_nonzero_value(
250
+ (sizeof(it_value_t<RandomAccessIteratorsIn>)
251
+ * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
252
+ size_of<output_t>)};
246
253
  constexpr int load_store_count = (items_per_thread * element_size) / load_store_size;
247
254
 
248
255
  static_assert((items_per_thread * element_size) % load_store_size == 0);
@@ -258,18 +265,35 @@ _CCCL_DEVICE void transform_kernel_vectorized(
258
265
 
259
266
  auto provide_array = [&](auto... inputs) {
260
267
  // load inputs
261
- // TODO(bgruber): we could support fancy iterators for loading here as well (and only vectorize some inputs)
262
- [[maybe_unused]] auto load_tile_vectorized = [&](auto* in, auto& input) {
263
- auto in_vec = reinterpret_cast<const load_store_t*>(in);
264
- auto input_vec = reinterpret_cast<load_store_t*>(input.data());
265
- _CCCL_PRAGMA_UNROLL_FULL()
266
- for (int i = 0; i < load_store_count; ++i)
268
+ [[maybe_unused]] auto load_tile = [](auto in, auto& input) {
269
+ if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<decltype(in)>)
267
270
  {
268
- input_vec[i] = in_vec[i * VectorizedPolicy::block_threads + threadIdx.x];
271
+ auto in_vec = reinterpret_cast<const load_store_t*>(in) + threadIdx.x;
272
+ auto input_vec = reinterpret_cast<load_store_t*>(input.data());
273
+ _CCCL_PRAGMA_UNROLL_FULL()
274
+ for (int i = 0; i < load_store_count; ++i)
275
+ {
276
+ input_vec[i] = in_vec[i * VectorizedPolicy::block_threads];
277
+ }
278
+ }
279
+ else
280
+ {
281
+ constexpr int elems = load_store_size / element_size;
282
+ in += threadIdx.x * elems;
283
+ _CCCL_PRAGMA_UNROLL_FULL()
284
+ for (int i = 0; i < load_store_count; ++i)
285
+ {
286
+ _CCCL_PRAGMA_UNROLL_FULL()
287
+ for (int j = 0; j < elems; ++j)
288
+ {
289
+ input[i * elems + j] = in[i * elems * VectorizedPolicy::block_threads + j];
290
+ }
291
+ }
269
292
  }
270
293
  };
271
294
  _CCCL_PDL_GRID_DEPENDENCY_SYNC();
272
- (load_tile_vectorized(ins, inputs), ...);
295
+ (load_tile(ins, inputs), ...);
296
+
273
297
  // Benchmarks showed up to 38% slowdown on H200 (some improvements as well), so omitted. See #5249 for details.
274
298
  // _CCCL_PDL_TRIGGER_NEXT_LAUNCH();
275
299
 
@@ -280,7 +304,7 @@ _CCCL_DEVICE void transform_kernel_vectorized(
280
304
  output[i] = f(inputs[i]...);
281
305
  }
282
306
  };
283
- provide_array(uninitialized_array<InputT, items_per_thread>{}...);
307
+ provide_array(uninitialized_array<it_value_t<RandomAccessIteratorsIn>, items_per_thread>{}...);
284
308
 
285
309
  // write output
286
310
  if constexpr (can_vectorize_store)
@@ -43,9 +43,7 @@
43
43
 
44
44
  CUB_NAMESPACE_BEGIN
45
45
 
46
- namespace detail
47
- {
48
- namespace adjacent_difference
46
+ namespace detail::adjacent_difference
49
47
  {
50
48
  template <typename InputIteratorT, bool MayAlias>
51
49
  struct policy_hub
@@ -64,7 +62,6 @@ struct policy_hub
64
62
 
65
63
  using MaxPolicy = Policy500;
66
64
  };
67
- } // namespace adjacent_difference
68
- } // namespace detail
65
+ } // namespace detail::adjacent_difference
69
66
 
70
67
  CUB_NAMESPACE_END
@@ -43,9 +43,7 @@
43
43
 
44
44
  CUB_NAMESPACE_BEGIN
45
45
 
46
- namespace detail
47
- {
48
- namespace batch_memcpy
46
+ namespace detail::batch_memcpy
49
47
  {
50
48
  /**
51
49
  * Parameterizable tuning policy type for AgentBatchMemcpy
@@ -115,7 +113,6 @@ struct policy_hub
115
113
 
116
114
  using MaxPolicy = Policy700;
117
115
  };
118
- } // namespace batch_memcpy
119
- } // namespace detail
116
+ } // namespace detail::batch_memcpy
120
117
 
121
118
  CUB_NAMESPACE_END
@@ -42,9 +42,7 @@
42
42
 
43
43
  CUB_NAMESPACE_BEGIN
44
44
 
45
- namespace detail
46
- {
47
- namespace for_each
45
+ namespace detail::for_each
48
46
  {
49
47
 
50
48
  struct policy_hub_t
@@ -57,7 +55,6 @@ struct policy_hub_t
57
55
  using MaxPolicy = policy_500_t;
58
56
  };
59
57
 
60
- } // namespace for_each
61
- } // namespace detail
58
+ } // namespace detail::for_each
62
59
 
63
60
  CUB_NAMESPACE_END
@@ -46,9 +46,7 @@
46
46
 
47
47
  CUB_NAMESPACE_BEGIN
48
48
 
49
- namespace detail
50
- {
51
- namespace histogram
49
+ namespace detail::histogram
52
50
  {
53
51
  enum class primitive_sample
54
52
  {
@@ -272,7 +270,6 @@ struct policy_hub
272
270
 
273
271
  using MaxPolicy = Policy1000;
274
272
  };
275
- } // namespace histogram
276
- } // namespace detail
273
+ } // namespace detail::histogram
277
274
 
278
275
  CUB_NAMESPACE_END
@@ -42,9 +42,7 @@
42
42
 
43
43
  CUB_NAMESPACE_BEGIN
44
44
 
45
- namespace detail
46
- {
47
- namespace merge
45
+ namespace detail::merge
48
46
  {
49
47
  template <typename KeyT, typename ValueT>
50
48
  struct policy_hub
@@ -73,7 +71,6 @@ struct policy_hub
73
71
 
74
72
  using max_policy = policy600;
75
73
  };
76
- } // namespace merge
77
- } // namespace detail
74
+ } // namespace detail::merge
78
75
 
79
76
  CUB_NAMESPACE_END
@@ -62,6 +62,14 @@ struct MergeSortPolicyWrapper<StaticPolicyT, ::cuda::std::void_t<decltype(Static
62
62
  {}
63
63
 
64
64
  CUB_DEFINE_SUB_POLICY_GETTER(MergeSort);
65
+
66
+ #if defined(CUB_ENABLE_POLICY_PTX_JSON)
67
+ _CCCL_DEVICE static constexpr auto EncodedPolicy()
68
+ {
69
+ using namespace ptx_json;
70
+ return object<key<"MergeSortPolicy">() = MergeSort().EncodedPolicy()>();
71
+ }
72
+ #endif
65
73
  };
66
74
 
67
75
  template <typename PolicyT>
@@ -46,9 +46,7 @@
46
46
 
47
47
  CUB_NAMESPACE_BEGIN
48
48
 
49
- namespace detail
50
- {
51
- namespace radix
49
+ namespace detail::radix
52
50
  {
53
51
  // sm90 default
54
52
  template <size_t KeySize, size_t ValueSize, size_t OffsetSize>
@@ -1062,7 +1060,6 @@ struct policy_hub
1062
1060
  using MaxPolicy = Policy1000;
1063
1061
  };
1064
1062
 
1065
- } // namespace radix
1066
- } // namespace detail
1063
+ } // namespace detail::radix
1067
1064
 
1068
1065
  CUB_NAMESPACE_END
@@ -50,9 +50,7 @@
50
50
 
51
51
  CUB_NAMESPACE_BEGIN
52
52
 
53
- namespace detail
54
- {
55
- namespace reduce_by_key
53
+ namespace detail::reduce_by_key
56
54
  {
57
55
  enum class primitive_key
58
56
  {
@@ -939,7 +937,6 @@ struct policy_hub
939
937
  };
940
938
  using MaxPolicy = Policy1000;
941
939
  };
942
- } // namespace reduce_by_key
943
- } // namespace detail
940
+ } // namespace detail::reduce_by_key
944
941
 
945
942
  CUB_NAMESPACE_END
@@ -52,9 +52,7 @@
52
52
 
53
53
  CUB_NAMESPACE_BEGIN
54
54
 
55
- namespace detail
56
- {
57
- namespace rle
55
+ namespace detail::rle
58
56
  {
59
57
  enum class primitive_key
60
58
  {
@@ -670,7 +668,6 @@ struct policy_hub
670
668
  using MaxPolicy = Policy1000;
671
669
  };
672
670
  } // namespace non_trivial_runs
673
- } // namespace rle
674
- } // namespace detail
671
+ } // namespace detail::rle
675
672
 
676
673
  CUB_NAMESPACE_END
@@ -53,9 +53,7 @@
53
53
 
54
54
  CUB_NAMESPACE_BEGIN
55
55
 
56
- namespace detail
57
- {
58
- namespace scan
56
+ namespace detail::scan
59
57
  {
60
58
  enum class keep_rejects
61
59
  {
@@ -615,7 +613,6 @@ struct policy_hub
615
613
 
616
614
  using MaxPolicy = Policy1000;
617
615
  };
618
- } // namespace scan
619
- } // namespace detail
616
+ } // namespace detail::scan
620
617
 
621
618
  CUB_NAMESPACE_END
@@ -49,9 +49,7 @@
49
49
 
50
50
  CUB_NAMESPACE_BEGIN
51
51
 
52
- namespace detail
53
- {
54
- namespace scan_by_key
52
+ namespace detail::scan_by_key
55
53
  {
56
54
  enum class primitive_accum
57
55
  {
@@ -1007,7 +1005,6 @@ struct policy_hub
1007
1005
 
1008
1006
  using MaxPolicy = Policy1000;
1009
1007
  };
1010
- } // namespace scan_by_key
1011
- } // namespace detail
1008
+ } // namespace detail::scan_by_key
1012
1009
 
1013
1010
  CUB_NAMESPACE_END