cuda-cccl 0.2.1__cp312-cp312-manylinux_2_26_x86_64.whl → 0.3.0__cp312-cp312-manylinux_2_26_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (322)
  1. cuda/cccl/cooperative/experimental/_common.py +3 -1
  2. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +68 -62
  3. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +23 -0
  4. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +2 -18
  5. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +23 -0
  6. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +22 -14
  7. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +14 -0
  8. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
  9. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +94 -13
  10. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  11. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +20 -6
  12. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +0 -2
  13. cuda/cccl/headers/include/cub/cub.cuh +8 -0
  14. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +13 -32
  15. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +61 -0
  16. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +3 -3
  17. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
  18. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +3 -2
  19. cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
  20. cuda/cccl/headers/include/cub/device/device_for.cuh +2 -10
  21. cuda/cccl/headers/include/cub/device/device_histogram.cuh +8 -8
  22. cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
  23. cuda/cccl/headers/include/cub/device/device_reduce.cuh +775 -163
  24. cuda/cccl/headers/include/cub/device/device_scan.cuh +306 -0
  25. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1 -0
  26. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  27. cuda/cccl/headers/include/cub/device/device_transform.cuh +11 -2
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +43 -44
  30. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1 -1
  31. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +321 -262
  32. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +100 -171
  33. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +629 -0
  34. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +8 -1
  35. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +29 -24
  36. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +4 -10
  37. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
  38. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +80 -0
  39. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +39 -15
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +3 -15
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +10 -0
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +203 -51
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +36 -0
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
  46. cuda/cccl/headers/include/cub/util_device.cuh +72 -51
  47. cuda/cccl/headers/include/cub/util_ptx.cuh +8 -8
  48. cuda/cccl/headers/include/cub/util_type.cuh +15 -20
  49. cuda/cccl/headers/include/cuda/__algorithm/copy.h +63 -10
  50. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +3 -1
  51. cuda/cccl/headers/include/cuda/__device/all_devices.h +3 -6
  52. cuda/cccl/headers/include/cuda/__device/arch_traits.h +3 -3
  53. cuda/cccl/headers/include/cuda/__device/attributes.h +7 -7
  54. cuda/cccl/headers/include/cuda/__device/device_ref.h +12 -10
  55. cuda/cccl/headers/include/cuda/__driver/driver_api.h +260 -30
  56. cuda/cccl/headers/include/cuda/__event/event.h +7 -8
  57. cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
  58. cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -5
  59. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +97 -52
  60. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +5 -6
  61. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +5 -0
  62. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +5 -0
  63. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +6 -4
  64. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +5 -0
  65. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +6 -1
  66. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +28 -27
  67. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +39 -33
  68. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +18 -11
  69. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +10 -0
  70. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +83 -44
  71. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
  72. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +1 -1
  73. cuda/cccl/headers/include/cuda/__memory/address_space.h +28 -12
  74. cuda/cccl/headers/include/cuda/__memory/check_address.h +34 -29
  75. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  76. cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
  77. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -12
  78. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +161 -92
  79. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +3 -2
  80. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
  81. cuda/cccl/headers/include/cuda/pipeline +2 -1
  82. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +0 -6
  83. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +3 -3
  84. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +1 -1
  85. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +3 -3
  86. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +2 -2
  87. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +1 -1
  88. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  89. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -1
  90. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +2 -3
  91. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +2 -3
  92. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +2 -2
  93. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  94. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +6 -8
  95. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +3 -3
  96. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +4 -4
  97. cuda/cccl/headers/include/cuda/std/__atomic/order.h +1 -1
  98. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +1 -1
  99. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +18 -7
  100. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +4 -115
  101. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
  102. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +8 -5
  103. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +22 -3
  104. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +4 -4
  105. cuda/cccl/headers/include/cuda/std/__cccl/os.h +6 -0
  106. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
  107. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +25 -0
  108. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
  109. cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
  110. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
  111. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
  112. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
  113. cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
  114. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
  115. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
  116. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +258 -0
  117. cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
  118. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
  119. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
  120. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +67 -0
  121. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +1 -4
  122. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +12 -9
  123. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
  124. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
  125. cuda/cccl/headers/include/cuda/std/__expected/expected.h +31 -38
  126. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
  127. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
  128. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +4 -4
  129. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +2 -2
  130. cuda/cccl/headers/include/cuda/std/__functional/bind.h +2 -2
  131. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +4 -4
  132. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +2 -2
  133. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__functional/function.h +10 -11
  135. cuda/cccl/headers/include/cuda/std/__functional/hash.h +5 -6
  136. cuda/cccl/headers/include/cuda/std/__functional/identity.h +4 -8
  137. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +2 -4
  138. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +16 -18
  139. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +2 -3
  140. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +2 -3
  141. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +3 -3
  142. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +2 -2
  143. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +16 -25
  144. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +3 -3
  145. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +3 -3
  146. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +1 -1
  147. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +21 -28
  148. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +1 -1
  149. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +28 -39
  150. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +3 -4
  151. cuda/cccl/headers/include/cuda/std/__iterator/next.h +2 -2
  152. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +2 -3
  153. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +12 -41
  154. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +2 -2
  155. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +3 -4
  156. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +31 -31
  157. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +45 -45
  158. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +3 -2
  159. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +1 -1
  160. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +1 -1
  161. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +1 -1
  162. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
  163. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +5 -2
  164. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +30 -30
  165. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +1 -1
  166. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +66 -86
  167. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +2 -2
  168. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +3 -3
  169. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +5 -2
  170. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +30 -45
  171. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +8 -12
  172. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +21 -23
  173. cuda/cccl/headers/include/cuda/std/__new/launder.h +4 -0
  174. cuda/cccl/headers/include/cuda/std/__optional/hash.h +2 -2
  175. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +2 -1
  176. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +2 -1
  177. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +1 -1
  178. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +1 -1
  179. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +1 -1
  180. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +1 -1
  181. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +4 -13
  182. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +12 -22
  183. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +9 -18
  184. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +2 -2
  185. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +3 -4
  186. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +2 -2
  187. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +7 -8
  188. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +4 -13
  189. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +1 -1
  190. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +2 -0
  191. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +3 -5
  192. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +1 -1
  193. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +3 -44
  194. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +2 -28
  195. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +9 -5
  196. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +3 -3
  197. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +1 -4
  198. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +3 -34
  199. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +3 -29
  200. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +2 -16
  201. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +1 -1
  202. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +4 -21
  203. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +1 -1
  204. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +1 -1
  205. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +3 -3
  206. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +4 -24
  207. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +3 -24
  208. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +13 -9
  209. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +3 -18
  210. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +21 -20
  211. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +3 -17
  212. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +4 -31
  213. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +3 -42
  214. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +5 -19
  215. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +3 -19
  216. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +3 -17
  217. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +2 -15
  218. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +13 -28
  219. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +2 -17
  220. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +2 -16
  221. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +3 -18
  222. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +1 -1
  223. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +1 -1
  224. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +4 -3
  225. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +2 -2
  226. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +2 -16
  227. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +2 -2
  228. cuda/cccl/headers/include/cuda/std/__utility/declval.h +17 -4
  229. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +1 -1
  230. cuda/cccl/headers/include/cuda/std/__utility/forward.h +1 -1
  231. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +29 -0
  232. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +2 -2
  233. cuda/cccl/headers/include/cuda/std/__utility/move.h +1 -1
  234. cuda/cccl/headers/include/cuda/std/__utility/pair.h +8 -9
  235. cuda/cccl/headers/include/cuda/std/array +2 -2
  236. cuda/cccl/headers/include/cuda/std/atomic +20 -28
  237. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  238. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +1 -32
  239. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +3 -3
  240. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +16 -1137
  241. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +12 -12
  242. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +4 -4
  243. cuda/cccl/headers/include/cuda/std/inplace_vector +9 -9
  244. cuda/cccl/headers/include/cuda/std/ratio +3 -4
  245. cuda/cccl/headers/include/cuda/std/version +2 -4
  246. cuda/cccl/headers/include/thrust/advance.h +6 -8
  247. cuda/cccl/headers/include/thrust/detail/execution_policy.h +61 -21
  248. cuda/cccl/headers/include/thrust/detail/internal_functional.h +37 -2
  249. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +1 -1
  250. cuda/cccl/headers/include/thrust/detail/reference.h +10 -16
  251. cuda/cccl/headers/include/thrust/detail/seq.h +37 -25
  252. cuda/cccl/headers/include/thrust/detail/vector_base.h +2 -4
  253. cuda/cccl/headers/include/thrust/detail/vector_base.inl +2 -2
  254. cuda/cccl/headers/include/thrust/distance.h +3 -3
  255. cuda/cccl/headers/include/thrust/execution_policy.h +202 -335
  256. cuda/cccl/headers/include/thrust/functional.h +1 -2
  257. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +6 -1
  258. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +1 -1
  259. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +24 -23
  260. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +66 -44
  261. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +4 -99
  262. cuda/cccl/headers/include/thrust/system/cuda/config.h +7 -4
  263. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +1 -1
  264. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +151 -40
  265. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +1 -1
  266. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +1 -1
  267. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +199 -48
  268. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +2 -2
  269. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +1 -1
  270. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +2 -2
  271. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +1 -1
  272. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +15 -13
  273. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +22 -19
  274. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +12 -42
  275. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +1 -1
  276. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +16 -4
  277. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +30 -30
  278. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +0 -1
  279. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
  280. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +1 -1
  281. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +29 -15
  282. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +1 -1
  283. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +1 -1
  284. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +1 -1
  285. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +1 -1
  286. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +1 -1
  287. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +7 -5
  288. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +3 -27
  289. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +10 -2
  290. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +16 -35
  291. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +75 -61
  292. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +4 -99
  293. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +68 -51
  294. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +2 -2
  295. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +4 -99
  296. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
  297. cuda/cccl/parallel/experimental/__init__.py +4 -0
  298. cuda/cccl/parallel/experimental/_bindings.py +38 -15
  299. cuda/cccl/parallel/experimental/_bindings.pyi +28 -0
  300. cuda/cccl/parallel/experimental/_bindings_impl.pyx +176 -9
  301. cuda/cccl/parallel/experimental/_cccl_interop.py +3 -3
  302. cuda/cccl/parallel/experimental/algorithms/__init__.py +4 -0
  303. cuda/cccl/parallel/experimental/algorithms/_reduce.py +0 -2
  304. cuda/cccl/parallel/experimental/algorithms/_scan.py +0 -2
  305. cuda/cccl/parallel/experimental/algorithms/_three_way_partition.py +261 -0
  306. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  307. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  308. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  309. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  310. {cuda_cccl-0.2.1.dist-info → cuda_cccl-0.3.0.dist-info}/METADATA +2 -3
  311. {cuda_cccl-0.2.1.dist-info → cuda_cccl-0.3.0.dist-info}/RECORD +313 -303
  312. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
  313. cuda/cccl/headers/include/thrust/detail/util/align.h +0 -59
  314. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +0 -59
  315. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +0 -204
  316. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +0 -92
  317. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +0 -237
  318. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +0 -95
  319. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +0 -62
  320. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +0 -62
  321. {cuda_cccl-0.2.1.dist-info → cuda_cccl-0.3.0.dist-info}/WHEEL +0 -0
  322. {cuda_cccl-0.2.1.dist-info → cuda_cccl-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -58,7 +58,9 @@ def make_binary_tempfile(content: bytes, suffix: str) -> BinaryIO:
58
58
 
59
59
  :return: A binary file-like object representing the temporary file.
60
60
  """
61
- tmp = tempfile.NamedTemporaryFile(mode="w+b", suffix=suffix, buffering=0)
61
+ tmp = tempfile.NamedTemporaryFile(
62
+ mode="w+b", suffix=suffix, buffering=0, delete=False
63
+ )
62
64
  tmp.write(content)
63
65
  return tmp
64
66
 
@@ -25,22 +25,15 @@
25
25
  #include <cuda/std/__algorithm/min.h>
26
26
 
27
27
  CUB_NAMESPACE_BEGIN
28
- namespace detail
28
+ namespace detail::merge
29
29
  {
30
- namespace merge
31
- {
32
- template <int ThreadsPerBlock,
33
- int ItemsPerThread,
34
- BlockLoadAlgorithm LoadAlgorithm,
35
- CacheLoadModifier LoadCacheModifier,
36
- BlockStoreAlgorithm StoreAlgorithm>
30
+ template <int ThreadsPerBlock, int ItemsPerThread, CacheLoadModifier LoadCacheModifier, BlockStoreAlgorithm StoreAlgorithm>
37
31
  struct agent_policy_t
38
32
  {
39
33
  // do not change data member names, policy_wrapper_t depends on it
40
34
  static constexpr int BLOCK_THREADS = ThreadsPerBlock;
41
35
  static constexpr int ITEMS_PER_THREAD = ItemsPerThread;
42
36
  static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD;
43
- static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm;
44
37
  static constexpr CacheLoadModifier LOAD_MODIFIER = LoadCacheModifier;
45
38
  static constexpr BlockStoreAlgorithm STORE_ALGORITHM = StoreAlgorithm;
46
39
  };
@@ -68,34 +61,27 @@ struct agent_t
68
61
  using items_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt1>;
69
62
  using items_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt2>;
70
63
 
71
- using block_load_keys1 = typename BlockLoadType<Policy, keys_load_it1>::type;
72
- using block_load_keys2 = typename BlockLoadType<Policy, keys_load_it2>::type;
73
- using block_load_items1 = typename BlockLoadType<Policy, items_load_it1>::type;
74
- using block_load_items2 = typename BlockLoadType<Policy, items_load_it2>::type;
75
-
76
64
  using block_store_keys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
77
65
  using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;
78
66
 
67
+ static constexpr int items_per_thread = Policy::ITEMS_PER_THREAD;
68
+ static constexpr int threads_per_block = Policy::BLOCK_THREADS;
69
+ static constexpr int items_per_tile = Policy::ITEMS_PER_TILE;
70
+
79
71
  union temp_storages
80
72
  {
81
- typename block_load_keys1::TempStorage load_keys1;
82
- typename block_load_keys2::TempStorage load_keys2;
83
- typename block_load_items1::TempStorage load_items1;
84
- typename block_load_items2::TempStorage load_items2;
85
73
  typename block_store_keys::TempStorage store_keys;
86
74
  typename block_store_items::TempStorage store_items;
87
75
 
88
- key_type keys_shared[Policy::ITEMS_PER_TILE + 1];
89
- item_type items_shared[Policy::ITEMS_PER_TILE + 1];
76
+ // We could change SerialMerge to avoid reading one item out of bounds and drop the + 1 here. But that would
77
+ // introduce more branches (about 10% slower on 2^16 problem sizes on RTX 5090 in a first attempt)
78
+ key_type keys_shared[items_per_tile + 1];
79
+ item_type items_shared[items_per_tile + 1];
90
80
  };
91
81
 
92
82
  struct TempStorage : Uninitialized<temp_storages>
93
83
  {};
94
84
 
95
- static constexpr int items_per_thread = Policy::ITEMS_PER_THREAD;
96
- static constexpr int threads_per_block = Policy::BLOCK_THREADS;
97
- static constexpr Offset items_per_tile = Policy::ITEMS_PER_TILE;
98
-
99
85
  // Per thread data
100
86
  temp_storages& storage;
101
87
  keys_load_it1 keys1_in;
@@ -107,61 +93,86 @@ struct agent_t
107
93
  KeysOutputIt keys_out;
108
94
  ItemsOutputIt items_out;
109
95
  CompareOp compare_op;
110
- Offset* merge_partitions;
96
+ Offset* key1_beg_offsets;
111
97
 
112
98
  template <bool IsFullTile>
113
99
  _CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(Offset tile_idx, Offset tile_base, int num_remaining)
114
100
  {
115
- const Offset partition_beg = merge_partitions[tile_idx + 0];
116
- const Offset partition_end = merge_partitions[tile_idx + 1];
117
-
118
101
  const Offset diag0 = items_per_tile * tile_idx;
119
- const Offset diag1 = (::cuda::std::min) (keys1_count + keys2_count, diag0 + items_per_tile);
102
+ Offset diag1 = diag0 + items_per_tile;
103
+ if constexpr (IsFullTile)
104
+ {
105
+ _CCCL_ASSERT(diag1 <= keys1_count + keys2_count, "");
106
+ }
107
+ else
108
+ {
109
+ diag1 = keys1_count + keys2_count;
110
+ }
120
111
 
121
112
  // compute bounding box for keys1 & keys2
122
- const Offset keys1_beg = partition_beg;
123
- const Offset keys1_end = partition_end;
113
+ const Offset keys1_beg = key1_beg_offsets[tile_idx + 0];
114
+ const Offset keys1_end = key1_beg_offsets[tile_idx + 1];
124
115
  const Offset keys2_beg = diag0 - keys1_beg;
125
116
  const Offset keys2_end = diag1 - keys1_end;
126
117
 
127
118
  // number of keys per tile
128
- const int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
129
- const int num_keys2 = static_cast<int>(keys2_end - keys2_beg);
119
+ const int keys1_count_tile = static_cast<int>(keys1_end - keys1_beg);
120
+ const int keys2_count_tile = static_cast<int>(keys2_end - keys2_beg);
121
+ if constexpr (IsFullTile)
122
+ {
123
+ _CCCL_ASSERT(keys1_count_tile + keys2_count_tile == items_per_tile, "");
124
+ }
125
+ else
126
+ {
127
+ _CCCL_ASSERT(keys1_count_tile + keys2_count_tile == num_remaining, "");
128
+ }
130
129
 
131
130
  key_type keys_loc[items_per_thread];
132
131
  merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
133
- keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2);
132
+ keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, keys1_count_tile, keys2_count_tile);
134
133
  merge_sort::reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
135
134
  __syncthreads();
136
135
 
137
- // use binary search in shared memory to find merge path for each of thread.
136
+ // now find the merge path for each of thread.
138
137
  // we can use int type here, because the number of items in shared memory is limited
139
- const int diag0_loc = (::cuda::std::min) (num_keys1 + num_keys2, static_cast<int>(items_per_thread * threadIdx.x));
138
+ int diag0_thread = items_per_thread * static_cast<int>(threadIdx.x);
139
+ if constexpr (IsFullTile)
140
+ {
141
+ _CCCL_ASSERT(num_remaining == items_per_tile, "");
142
+ _CCCL_ASSERT(diag0_thread < num_remaining, "");
143
+ }
144
+ else
145
+ { // for partial tiles, clamp the thread diagonal to the valid items
146
+ diag0_thread = (::cuda::std::min) (diag0_thread, num_remaining);
147
+ }
140
148
 
141
- const int keys1_beg_loc =
142
- MergePath(&storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_loc, compare_op);
143
- const int keys1_end_loc = num_keys1;
144
- const int keys2_beg_loc = diag0_loc - keys1_beg_loc;
145
- const int keys2_end_loc = num_keys2;
149
+ const int keys1_beg_thread = MergePath(
150
+ &storage.keys_shared[0],
151
+ &storage.keys_shared[keys1_count_tile],
152
+ keys1_count_tile,
153
+ keys2_count_tile,
154
+ diag0_thread,
155
+ compare_op);
156
+ const int keys2_beg_thread = diag0_thread - keys1_beg_thread;
146
157
 
147
- const int num_keys1_loc = keys1_end_loc - keys1_beg_loc;
148
- const int num_keys2_loc = keys2_end_loc - keys2_beg_loc;
158
+ const int keys1_count_thread = keys1_count_tile - keys1_beg_thread;
159
+ const int keys2_count_thread = keys2_count_tile - keys2_beg_thread;
149
160
 
150
161
  // perform serial merge
151
162
  int indices[items_per_thread];
152
- cub::SerialMerge(
163
+ SerialMerge(
153
164
  &storage.keys_shared[0],
154
- keys1_beg_loc,
155
- keys2_beg_loc + num_keys1,
156
- num_keys1_loc,
157
- num_keys2_loc,
165
+ keys1_beg_thread,
166
+ keys2_beg_thread + keys1_count_tile,
167
+ keys1_count_thread,
168
+ keys2_count_thread,
158
169
  keys_loc,
159
170
  indices,
160
171
  compare_op);
161
- __syncthreads();
162
172
 
163
173
  // write keys
164
- if (IsFullTile)
174
+ __syncthreads(); // sync after reading from SMEM before so block store can use SMEM again
175
+ if constexpr (IsFullTile)
165
176
  {
166
177
  block_store_keys{storage.store_keys}.Store(keys_out + tile_base, keys_loc);
167
178
  }
@@ -176,9 +187,8 @@ struct agent_t
176
187
  {
177
188
  item_type items_loc[items_per_thread];
178
189
  merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
179
- items_loc, items1_in + keys1_beg, items2_in + keys2_beg, num_keys1, num_keys2);
180
- __syncthreads(); // block_store_keys above uses shared memory, so make sure all threads are done before we write
181
- // to it
190
+ items_loc, items1_in + keys1_beg, items2_in + keys2_beg, keys1_count_tile, keys2_count_tile);
191
+ __syncthreads(); // block_store_keys above uses SMEM, so make sure all threads are done before we write to it
182
192
  merge_sort::reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
183
193
  __syncthreads();
184
194
 
@@ -191,7 +201,7 @@ struct agent_t
191
201
  __syncthreads();
192
202
 
193
203
  // write from reg to gmem
194
- if (IsFullTile)
204
+ if constexpr (IsFullTile)
195
205
  {
196
206
  block_store_items{storage.store_items}.Store(items_out + tile_base, items_loc);
197
207
  }
@@ -204,23 +214,19 @@ struct agent_t
204
214
 
205
215
  _CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
206
216
  {
207
- // XXX with 8.5 changing type to Offset (or long long) results in error!
208
- // TODO(bgruber): is the above still true?
209
- const int tile_idx = static_cast<int>(blockIdx.x);
217
+ const Offset tile_idx = blockIdx.x;
210
218
  const Offset tile_base = tile_idx * items_per_tile;
211
- // TODO(bgruber): random mixing of int and Offset
212
219
  const int items_in_tile =
213
220
  static_cast<int>((::cuda::std::min) (static_cast<Offset>(items_per_tile), keys1_count + keys2_count - tile_base));
214
221
  if (items_in_tile == items_per_tile)
215
222
  {
216
- consume_tile<true>(tile_idx, tile_base, items_per_tile); // full tile
223
+ consume_tile</* IsFullTile */ true>(tile_idx, tile_base, items_per_tile);
217
224
  }
218
225
  else
219
226
  {
220
- consume_tile<false>(tile_idx, tile_base, items_in_tile); // partial tile
227
+ consume_tile</* IsFullTile */ false>(tile_idx, tile_base, items_in_tile);
221
228
  }
222
229
  }
223
230
  };
224
- } // namespace merge
225
- } // namespace detail
231
+ } // namespace detail::merge
226
232
  CUB_NAMESPACE_END
@@ -51,6 +51,7 @@
51
51
  #include <cub/block/radix_rank_sort_operations.cuh>
52
52
  #include <cub/iterator/cache_modified_input_iterator.cuh>
53
53
  #include <cub/thread/thread_load.cuh>
54
+ #include <cub/util_device.cuh>
54
55
  #include <cub/util_type.cuh>
55
56
 
56
57
  #include <cuda/std/cstdint>
@@ -119,6 +120,28 @@ struct AgentRadixSortDownsweepPolicy : ScalingType
119
120
  static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
120
121
  };
121
122
 
123
+ #if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
124
+ namespace detail
125
+ {
126
+ // Only define this when needed.
127
+ // Because of overload woes, this depends on C++20 concepts. util_device.cuh checks that concepts are available when
128
+ // either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
129
+ // version is always defined, and that's the only one needed for regular CUB operations.
130
+ //
131
+ // TODO: enable this unconditionally once concepts are always available
132
+ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
133
+ RadixSortDownsweepAgentPolicy,
134
+ (GenericAgentPolicy),
135
+ (BLOCK_THREADS, BlockThreads, int),
136
+ (ITEMS_PER_THREAD, ItemsPerThread, int),
137
+ (RADIX_BITS, RadixBits, int),
138
+ (LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
139
+ (LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
140
+ (RANK_ALGORITHM, RankAlgorithm, cub::RadixRankAlgorithm),
141
+ (SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
142
+ } // namespace detail
143
+ #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
144
+
122
145
  /******************************************************************************
123
146
  * Thread block abstractions
124
147
  ******************************************************************************/
@@ -50,10 +50,10 @@
50
50
  #include <cub/util_device.cuh>
51
51
  #include <cub/util_type.cuh>
52
52
 
53
- #include <cuda/__memory/is_aligned.h>
54
53
  #include <cuda/std/__algorithm/min.h>
55
54
  #include <cuda/std/__functional/identity.h>
56
55
  #include <cuda/std/__functional/operations.h>
56
+ #include <cuda/std/__memory/is_sufficiently_aligned.h>
57
57
  #include <cuda/std/__type_traits/conditional.h>
58
58
  #include <cuda/std/__type_traits/is_pointer.h>
59
59
 
@@ -175,9 +175,6 @@ namespace detail::reduce
175
175
  * @tparam InputIteratorT
176
176
  * Random-access iterator type for input
177
177
  *
178
- * @tparam OutputIteratorT
179
- * Random-access iterator type for output
180
- *
181
178
  * @tparam OffsetT
182
179
  * Signed integer type for global offsets
183
180
  *
@@ -202,7 +199,6 @@ namespace detail::reduce
202
199
  */
203
200
  template <typename AgentReducePolicy,
204
201
  typename InputIteratorT,
205
- typename OutputIteratorT,
206
202
  typename OffsetT,
207
203
  typename ReductionOp,
208
204
  typename AccumT,
@@ -274,7 +270,7 @@ struct AgentReduceImpl
274
270
  {
275
271
  if constexpr (AttemptVectorization)
276
272
  {
277
- return ::cuda::is_aligned(d_in, sizeof(VectorT));
273
+ return ::cuda::std::is_sufficiently_aligned<alignof(VectorT)>(d_in);
278
274
  }
279
275
  else
280
276
  {
@@ -506,9 +502,6 @@ private:
506
502
  * @tparam InputIteratorT
507
503
  * Random-access iterator type for input
508
504
  *
509
- * @tparam OutputIteratorT
510
- * Random-access iterator type for output
511
- *
512
505
  * @tparam OffsetT
513
506
  * Signed integer type for global offsets
514
507
  *
@@ -524,7 +517,6 @@ private:
524
517
  */
525
518
  template <typename AgentReducePolicy,
526
519
  typename InputIteratorT,
527
- typename OutputIteratorT,
528
520
  typename OffsetT,
529
521
  typename ReductionOp,
530
522
  typename AccumT,
@@ -532,7 +524,6 @@ template <typename AgentReducePolicy,
532
524
  struct AgentReduce
533
525
  : AgentReduceImpl<AgentReducePolicy,
534
526
  InputIteratorT,
535
- OutputIteratorT,
536
527
  OffsetT,
537
528
  ReductionOp,
538
529
  AccumT,
@@ -543,7 +534,6 @@ struct AgentReduce
543
534
  using base_t =
544
535
  AgentReduceImpl<AgentReducePolicy,
545
536
  InputIteratorT,
546
- OutputIteratorT,
547
537
  OffsetT,
548
538
  ReductionOp,
549
539
  AccumT,
@@ -574,9 +564,6 @@ struct AgentReduce
574
564
  * @tparam InputIteratorT
575
565
  * Random-access iterator type for input
576
566
  *
577
- * @tparam OutputIteratorT
578
- * Random-access iterator type for output
579
- *
580
567
  * @tparam OffsetT
581
568
  * Signed integer type for global offsets
582
569
  *
@@ -592,7 +579,6 @@ struct AgentReduce
592
579
  */
593
580
  template <typename AgentReducePolicy,
594
581
  typename InputIteratorT,
595
- typename OutputIteratorT,
596
582
  typename OffsetT,
597
583
  typename ReductionOp,
598
584
  typename AccumT,
@@ -600,7 +586,6 @@ template <typename AgentReducePolicy,
600
586
  struct AgentWarpReduce
601
587
  : AgentReduceImpl<AgentReducePolicy,
602
588
  InputIteratorT,
603
- OutputIteratorT,
604
589
  OffsetT,
605
590
  ReductionOp,
606
591
  AccumT,
@@ -612,7 +597,6 @@ struct AgentWarpReduce
612
597
  using base_t =
613
598
  AgentReduceImpl<AgentReducePolicy,
614
599
  InputIteratorT,
615
- OutputIteratorT,
616
600
  OffsetT,
617
601
  ReductionOp,
618
602
  AccumT,
@@ -47,7 +47,9 @@
47
47
  #include <cub/block/block_load.cuh>
48
48
  #include <cub/block/block_scan.cuh>
49
49
  #include <cub/block/block_store.cuh>
50
+ #include <cub/grid/grid_queue.cuh>
50
51
  #include <cub/iterator/cache_modified_input_iterator.cuh>
52
+ #include <cub/util_device.cuh>
51
53
 
52
54
  #include <cuda/std/__type_traits/conditional.h>
53
55
  #include <cuda/std/__type_traits/is_pointer.h>
@@ -110,6 +112,27 @@ struct AgentScanPolicy : ScalingType
110
112
  };
111
113
  };
112
114
 
115
+ #if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
116
+ namespace detail
117
+ {
118
+ // Only define this when needed.
119
+ // Because of overload woes, this depends on C++20 concepts. util_device.cuh checks that concepts are available when
120
+ // either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
121
+ // version is always defined, and that's the only one needed for regular CUB operations.
122
+ //
123
+ // TODO: enable this unconditionally once concepts are always available
124
+ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
125
+ ScanAgentPolicy,
126
+ (GenericAgentPolicy),
127
+ (BLOCK_THREADS, BlockThreads, int),
128
+ (ITEMS_PER_THREAD, ItemsPerThread, int),
129
+ (LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
130
+ (LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
131
+ (STORE_ALGORITHM, StoreAlgorithm, cub::BlockStoreAlgorithm),
132
+ (SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
133
+ } // namespace detail
134
+ #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
135
+
113
136
  /******************************************************************************
114
137
  * Thread block abstractions
115
138
  ******************************************************************************/
@@ -48,33 +48,41 @@
48
48
 
49
49
  CUB_NAMESPACE_BEGIN
50
50
 
51
- template <int WARP_THREADS_ARG,
51
+ template <int BLOCK_THREADS_ARG,
52
+ int WARP_THREADS_ARG,
52
53
  int ITEMS_PER_THREAD_ARG,
53
54
  cub::WarpLoadAlgorithm LOAD_ALGORITHM_ARG = cub::WARP_LOAD_DIRECT,
54
55
  cub::CacheLoadModifier LOAD_MODIFIER_ARG = cub::LOAD_LDG,
55
56
  cub::WarpStoreAlgorithm STORE_ALGORITHM_ARG = cub::WARP_STORE_DIRECT>
56
57
  struct AgentSubWarpMergeSortPolicy
57
58
  {
58
- static constexpr int WARP_THREADS = WARP_THREADS_ARG;
59
- static constexpr int ITEMS_PER_THREAD = ITEMS_PER_THREAD_ARG;
60
- static constexpr int ITEMS_PER_TILE = WARP_THREADS * ITEMS_PER_THREAD;
59
+ static constexpr int BLOCK_THREADS = BLOCK_THREADS_ARG;
60
+ static constexpr int WARP_THREADS = WARP_THREADS_ARG;
61
+ static constexpr int ITEMS_PER_THREAD = ITEMS_PER_THREAD_ARG;
62
+ static constexpr int ITEMS_PER_TILE = WARP_THREADS * ITEMS_PER_THREAD;
63
+ static constexpr int SEGMENTS_PER_BLOCK = BLOCK_THREADS / WARP_THREADS;
61
64
 
62
65
  static constexpr cub::WarpLoadAlgorithm LOAD_ALGORITHM = LOAD_ALGORITHM_ARG;
63
66
  static constexpr cub::CacheLoadModifier LOAD_MODIFIER = LOAD_MODIFIER_ARG;
64
67
  static constexpr cub::WarpStoreAlgorithm STORE_ALGORITHM = STORE_ALGORITHM_ARG;
65
68
  };
66
69
 
67
- template <int BLOCK_THREADS_ARG, typename SmallPolicy, typename MediumPolicy>
68
- struct AgentSmallAndMediumSegmentedSortPolicy
70
+ #if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
71
+ namespace detail
69
72
  {
70
- static constexpr int BLOCK_THREADS = BLOCK_THREADS_ARG;
71
- using SmallPolicyT = SmallPolicy;
72
- using MediumPolicyT = MediumPolicy;
73
-
74
- static constexpr int SEGMENTS_PER_MEDIUM_BLOCK = BLOCK_THREADS / MediumPolicyT::WARP_THREADS;
75
-
76
- static constexpr int SEGMENTS_PER_SMALL_BLOCK = BLOCK_THREADS / SmallPolicyT::WARP_THREADS;
77
- };
73
+ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
74
+ SubWarpMergeSortAgentPolicy,
75
+ (GenericAgentPolicy),
76
+ (BLOCK_THREADS, BlockThreads, int),
77
+ (WARP_THREADS, WarpThreads, int),
78
+ (ITEMS_PER_THREAD, ItemsPerThread, int),
79
+ (ITEMS_PER_TILE, ItemsPerTile, int),
80
+ (SEGMENTS_PER_BLOCK, SegmentsPerBlock, int),
81
+ (LOAD_ALGORITHM, LoadAlgorithm, cub::WarpLoadAlgorithm),
82
+ (LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
83
+ (STORE_ALGORITHM, StoreAlgorithm, cub::WarpStoreAlgorithm))
84
+ } // namespace detail
85
+ #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
78
86
 
79
87
  namespace detail
80
88
  {
@@ -44,6 +44,7 @@
44
44
  #include <cub/block/block_scan.cuh>
45
45
  #include <cub/block/block_store.cuh>
46
46
  #include <cub/iterator/cache_modified_input_iterator.cuh>
47
+ #include <cub/util_device.cuh>
47
48
 
48
49
  #include <cuda/std/__functional/operations.h>
49
50
  #include <cuda/std/__type_traits/conditional.h>
@@ -76,9 +77,22 @@ struct AgentThreeWayPartitionPolicy
76
77
  };
77
78
  };
78
79
 
80
+ #if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
79
81
  namespace detail
80
82
  {
83
+ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
84
+ ThreeWayPartitionAgentPolicy,
85
+ (GenericAgentPolicy),
86
+ (BLOCK_THREADS, BlockThreads, int),
87
+ (ITEMS_PER_THREAD, ItemsPerThread, int),
88
+ (LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
89
+ (LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
90
+ (SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
91
+ } // namespace detail
92
+ #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
81
93
 
94
+ namespace detail
95
+ {
82
96
  namespace three_way_partition
83
97
  {
84
98