cuda-cccl 0.1.3.2.0.dev271__cp312-cp312-manylinux_2_26_x86_64.whl → 0.1.3.2.0.dev438__cp312-cp312-manylinux_2_26_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (556)
  1. cuda/cccl/_cuda_version_utils.py +0 -22
  2. cuda/cccl/cooperative/experimental/_common.py +3 -1
  3. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +6 -2
  4. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +3 -1
  5. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +68 -62
  6. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +5 -2
  7. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +4 -2
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +3 -2
  9. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +7 -20
  10. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +4 -2
  11. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +6 -3
  12. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +25 -1
  13. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +8 -2
  15. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +18 -1
  16. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
  17. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +99 -17
  18. cuda/cccl/headers/include/cub/block/block_exchange.cuh +3 -1
  19. cuda/cccl/headers/include/cub/block/block_histogram.cuh +1 -1
  20. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +43 -30
  21. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +5 -3
  22. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +5 -4
  23. cuda/cccl/headers/include/cub/block/block_reduce.cuh +2 -1
  24. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +0 -3
  25. cuda/cccl/headers/include/cub/block/block_scan.cuh +2 -1
  26. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +9 -5
  27. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +1 -1
  28. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +2 -2
  29. cuda/cccl/headers/include/cub/cub.cuh +8 -0
  30. cuda/cccl/headers/include/cub/detail/array_utils.cuh +6 -5
  31. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +8 -2
  32. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +13 -32
  33. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +61 -0
  34. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +11 -4
  35. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +7 -3
  36. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +3 -3
  37. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +4 -4
  38. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +5 -4
  39. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +1 -1
  40. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +0 -18
  41. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +7 -5
  42. cuda/cccl/headers/include/cub/detail/rfa.cuh +9 -2
  43. cuda/cccl/headers/include/cub/detail/type_traits.cuh +15 -7
  44. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +3 -2
  45. cuda/cccl/headers/include/cub/device/device_for.cuh +7 -12
  46. cuda/cccl/headers/include/cub/device/device_histogram.cuh +11 -9
  47. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +1 -1
  48. cuda/cccl/headers/include/cub/device/device_merge.cuh +2 -1
  49. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3 -1
  50. cuda/cccl/headers/include/cub/device/device_reduce.cuh +785 -164
  51. cuda/cccl/headers/include/cub/device/device_scan.cuh +306 -0
  52. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +10 -2
  53. cuda/cccl/headers/include/cub/device/device_select.cuh +5 -1
  54. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  55. cuda/cccl/headers/include/cub/device/device_transform.cuh +118 -26
  56. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +2 -1
  57. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +3 -1
  58. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +4 -3
  59. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +0 -2
  60. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +2 -1
  61. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +7 -3
  62. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +43 -44
  63. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +3 -2
  64. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +6 -2
  65. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +14 -3
  66. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +0 -1
  67. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +79 -40
  68. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +4 -3
  69. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +5 -2
  70. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +6 -3
  71. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +6 -2
  72. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +5 -4
  73. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +6 -2
  74. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +3 -2
  75. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +106 -172
  76. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +629 -0
  77. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +15 -12
  78. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +3 -1
  79. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +11 -3
  80. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +5 -1
  81. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +28 -41
  82. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +0 -2
  83. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +4 -10
  84. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +80 -0
  85. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +28 -6
  86. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +3 -15
  87. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +3 -2
  88. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +14 -2
  89. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +2 -1
  90. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +36 -0
  91. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
  92. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +10 -4
  93. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +3 -2
  94. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +3 -5
  95. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +6 -7
  96. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +4 -6
  97. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +4 -6
  98. cuda/cccl/headers/include/cub/thread/thread_load.cuh +3 -1
  99. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +7 -3
  100. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +11 -17
  101. cuda/cccl/headers/include/cub/thread/thread_search.cuh +2 -1
  102. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +7 -13
  103. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +3 -2
  104. cuda/cccl/headers/include/cub/thread/thread_store.cuh +2 -1
  105. cuda/cccl/headers/include/cub/util_device.cuh +30 -25
  106. cuda/cccl/headers/include/cub/util_macro.cuh +0 -2
  107. cuda/cccl/headers/include/cub/util_math.cuh +4 -1
  108. cuda/cccl/headers/include/cub/util_ptx.cuh +8 -8
  109. cuda/cccl/headers/include/cub/util_type.cuh +33 -49
  110. cuda/cccl/headers/include/cub/util_vsmem.cuh +5 -3
  111. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +1 -1
  112. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +1 -1
  113. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +9 -2
  114. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -1
  115. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +8 -6
  116. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +5 -3
  117. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +1 -1
  118. cuda/cccl/headers/include/cub/warp/warp_load.cuh +1 -1
  119. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +2 -2
  120. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +5 -3
  121. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +3 -2
  122. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -1
  123. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +3 -3
  124. cuda/cccl/headers/include/cuda/__algorithm/copy.h +63 -10
  125. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +3 -1
  126. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  127. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +93 -0
  128. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  129. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  130. cuda/cccl/headers/include/cuda/__device/device_ref.h +9 -0
  131. cuda/cccl/headers/include/cuda/__driver/driver_api.h +51 -13
  132. cuda/cccl/headers/include/cuda/__event/timed_event.h +1 -1
  133. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
  134. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +97 -52
  135. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +5 -6
  136. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  137. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +103 -60
  138. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +136 -113
  139. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +46 -36
  140. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +92 -60
  141. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +71 -29
  142. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +129 -64
  143. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +71 -62
  144. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +117 -120
  145. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +129 -124
  146. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +115 -106
  147. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +24 -6
  148. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +275 -141
  149. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +1 -1
  150. cuda/cccl/headers/include/cuda/__memory/address_space.h +28 -12
  151. cuda/cccl/headers/include/cuda/__memory/check_address.h +34 -29
  152. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +2 -2
  153. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  154. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +161 -92
  155. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +3 -2
  156. cuda/cccl/headers/include/cuda/pipeline +2 -1
  157. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +0 -6
  158. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +3 -3
  159. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +1 -1
  160. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +3 -3
  161. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +2 -2
  162. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +1 -2
  163. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  164. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  165. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +2 -3
  166. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +2 -3
  167. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +2 -2
  168. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  169. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +6 -8
  170. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +3 -3
  171. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +4 -4
  172. cuda/cccl/headers/include/cuda/std/__atomic/order.h +1 -1
  173. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +1 -1
  174. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +18 -7
  175. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +4 -115
  176. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +8 -5
  177. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +22 -3
  178. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +4 -4
  179. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +27 -0
  180. cuda/cccl/headers/include/cuda/std/__cccl/os.h +6 -0
  181. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +8 -0
  182. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +3 -3
  183. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +25 -0
  184. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
  185. cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
  186. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
  187. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
  188. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
  189. cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
  190. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
  191. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
  192. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +258 -0
  193. cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
  194. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
  195. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +60 -0
  196. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
  197. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +15 -0
  198. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +105 -153
  199. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  200. cuda/cccl/headers/include/cuda/std/__complex/complex.h +5 -7
  201. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +1 -0
  202. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +186 -119
  203. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +67 -0
  204. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +1 -4
  205. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +12 -9
  206. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +1 -1
  207. cuda/cccl/headers/include/cuda/std/__expected/expected.h +31 -38
  208. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +5 -3
  209. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  210. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -0
  211. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +6 -0
  212. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +4 -4
  213. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +2 -2
  214. cuda/cccl/headers/include/cuda/std/__functional/bind.h +2 -2
  215. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +4 -4
  216. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +2 -2
  217. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +2 -2
  218. cuda/cccl/headers/include/cuda/std/__functional/function.h +10 -11
  219. cuda/cccl/headers/include/cuda/std/__functional/hash.h +5 -6
  220. cuda/cccl/headers/include/cuda/std/__functional/identity.h +4 -8
  221. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +2 -4
  222. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +16 -18
  223. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +2 -3
  224. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +2 -3
  225. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +3 -3
  226. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +2 -2
  227. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +16 -25
  228. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +32 -0
  229. cuda/cccl/headers/include/cuda/std/__internal/features.h +6 -0
  230. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +3 -3
  231. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +3 -3
  232. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +3 -4
  233. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +1 -1
  234. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +21 -28
  235. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +1 -1
  236. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +41 -126
  237. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +3 -4
  238. cuda/cccl/headers/include/cuda/std/__iterator/next.h +2 -2
  239. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +2 -3
  240. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +12 -41
  241. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +2 -2
  242. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +4 -6
  243. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +31 -31
  244. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +45 -45
  245. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +6 -0
  246. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +3 -2
  247. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +1 -1
  248. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +1 -1
  249. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +1 -1
  250. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +10 -12
  251. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +5 -2
  252. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +30 -30
  253. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +1 -1
  254. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +66 -86
  255. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +2 -2
  256. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +3 -3
  257. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +5 -2
  258. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +30 -45
  259. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +8 -12
  260. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +21 -23
  261. cuda/cccl/headers/include/cuda/std/__new/launder.h +4 -0
  262. cuda/cccl/headers/include/cuda/std/__optional/hash.h +2 -2
  263. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +8 -8
  264. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +2 -1
  265. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +2 -1
  266. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +1 -1
  267. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +1 -1
  268. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +1 -1
  269. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +1 -1
  270. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +4 -13
  271. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +12 -22
  272. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +9 -18
  273. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +2 -2
  274. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +0 -1
  275. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +3 -4
  276. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +2 -2
  277. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +7 -8
  278. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +4 -13
  279. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +1 -1
  280. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +2 -0
  281. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +3 -5
  282. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +1 -1
  283. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +3 -44
  284. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +2 -28
  285. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +9 -5
  286. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +3 -3
  287. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +1 -4
  288. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +3 -34
  289. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +3 -29
  290. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +0 -2
  291. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +2 -16
  292. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +1 -1
  293. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +4 -21
  294. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +1 -1
  295. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +1 -1
  296. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +3 -3
  297. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +4 -24
  298. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +3 -24
  299. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +13 -9
  300. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +3 -18
  301. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +21 -20
  302. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +3 -17
  303. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +4 -31
  304. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +3 -42
  305. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +5 -19
  306. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +3 -19
  307. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +3 -17
  308. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +2 -15
  309. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +13 -28
  310. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +2 -17
  311. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +2 -16
  312. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +3 -18
  313. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +1 -1
  314. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +1 -1
  315. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +0 -1
  316. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +1 -2
  317. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +2 -2
  318. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +2 -16
  319. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +2 -2
  320. cuda/cccl/headers/include/cuda/std/__utility/declval.h +17 -4
  321. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +1 -1
  322. cuda/cccl/headers/include/cuda/std/__utility/forward.h +1 -1
  323. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +29 -0
  324. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +2 -2
  325. cuda/cccl/headers/include/cuda/std/__utility/move.h +1 -1
  326. cuda/cccl/headers/include/cuda/std/__utility/pair.h +8 -9
  327. cuda/cccl/headers/include/cuda/std/array +2 -2
  328. cuda/cccl/headers/include/cuda/std/atomic +20 -28
  329. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  330. cuda/cccl/headers/include/cuda/std/cmath +63 -1
  331. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +1 -32
  332. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +3 -4
  333. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +16 -1137
  334. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +12 -12
  335. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +4 -4
  336. cuda/cccl/headers/include/cuda/std/inplace_vector +9 -9
  337. cuda/cccl/headers/include/cuda/std/numbers +0 -1
  338. cuda/cccl/headers/include/cuda/std/ratio +3 -4
  339. cuda/cccl/headers/include/cuda/std/version +2 -4
  340. cuda/cccl/headers/include/thrust/advance.h +6 -8
  341. cuda/cccl/headers/include/thrust/detail/execution_policy.h +61 -21
  342. cuda/cccl/headers/include/thrust/detail/internal_functional.h +37 -2
  343. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +1 -1
  344. cuda/cccl/headers/include/thrust/detail/pointer.h +1 -1
  345. cuda/cccl/headers/include/thrust/detail/reference.h +10 -16
  346. cuda/cccl/headers/include/thrust/detail/seq.h +37 -25
  347. cuda/cccl/headers/include/thrust/detail/vector_base.h +2 -4
  348. cuda/cccl/headers/include/thrust/detail/vector_base.inl +2 -4
  349. cuda/cccl/headers/include/thrust/distance.h +3 -3
  350. cuda/cccl/headers/include/thrust/execution_policy.h +202 -335
  351. cuda/cccl/headers/include/thrust/functional.h +1 -2
  352. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +9 -0
  353. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +4 -1
  354. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +39 -56
  355. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +6 -1
  356. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +1 -1
  357. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +6 -10
  358. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +1 -2
  359. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +36 -24
  360. cuda/cccl/headers/include/thrust/iterator/{detail/iterator_traversal_tags.h → iterator_traversal_tags.h} +14 -0
  361. cuda/cccl/headers/include/thrust/iterator/retag.h +5 -5
  362. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +7 -7
  363. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +2 -2
  364. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +77 -107
  365. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +70 -51
  366. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +4 -99
  367. cuda/cccl/headers/include/thrust/system/cpp/memory.h +2 -5
  368. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +2 -5
  369. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +2 -5
  370. cuda/cccl/headers/include/thrust/system/cpp/vector.h +2 -5
  371. cuda/cccl/headers/include/thrust/system/cuda/config.h +7 -4
  372. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +1 -1
  373. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +151 -40
  374. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +1 -1
  375. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +1 -1
  376. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +0 -16
  377. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +199 -48
  378. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +36 -18
  379. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +1 -1
  380. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +2 -2
  381. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +26 -51
  382. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +25 -14
  383. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +1 -1
  384. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +16 -13
  385. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +40 -40
  386. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +12 -42
  387. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +1 -2
  388. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +16 -4
  389. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +49 -53
  390. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +0 -12
  391. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +1 -1
  392. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +29 -15
  393. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +1 -1
  394. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +8 -5
  395. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +16 -2
  396. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +1 -1
  397. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +1 -2
  398. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +7 -5
  399. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +3 -27
  400. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +2 -5
  401. cuda/cccl/headers/include/thrust/system/detail/errno.h +2 -7
  402. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +2 -8
  403. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +2 -8
  404. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +2 -8
  405. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +3 -10
  406. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +2 -8
  407. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +2 -8
  408. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +2 -8
  409. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +2 -8
  410. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +2 -8
  411. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +2 -8
  412. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +2 -8
  413. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +2 -8
  414. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +2 -8
  415. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +2 -8
  416. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +2 -8
  417. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +2 -8
  418. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +2 -8
  419. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +2 -8
  420. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +2 -8
  421. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +2 -8
  422. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +2 -8
  423. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +2 -8
  424. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +2 -8
  425. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +2 -8
  426. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +2 -8
  427. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +2 -8
  428. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +2 -8
  429. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +2 -8
  430. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +2 -8
  431. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +2 -8
  432. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +2 -8
  433. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +2 -8
  434. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +2 -8
  435. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +2 -8
  436. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +2 -8
  437. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +2 -8
  438. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +2 -8
  439. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +2 -8
  440. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +2 -8
  441. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +2 -8
  442. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +2 -8
  443. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +2 -8
  444. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +2 -8
  445. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +2 -17
  446. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +2 -17
  447. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +2 -8
  448. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +2 -8
  449. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +2 -8
  450. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +2 -8
  451. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +2 -8
  452. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +2 -8
  453. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +10 -2
  454. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +2 -8
  455. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +2 -8
  456. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +2 -8
  457. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +2 -8
  458. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +2 -8
  459. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +2 -8
  460. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +2 -8
  461. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +2 -8
  462. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +2 -8
  463. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +2 -8
  464. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +2 -8
  465. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +2 -8
  466. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +2 -8
  467. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +2 -8
  468. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +2 -8
  469. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +2 -8
  470. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +2 -8
  471. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +2 -8
  472. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +2 -8
  473. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +2 -8
  474. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +2 -8
  475. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +2 -8
  476. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +2 -8
  477. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +2 -8
  478. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +2 -8
  479. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +2 -8
  480. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +2 -8
  481. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +2 -8
  482. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +2 -8
  483. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +2 -8
  484. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +2 -8
  485. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +2 -8
  486. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +2 -8
  487. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +18 -44
  488. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +2 -8
  489. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +2 -8
  490. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +2 -8
  491. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +2 -8
  492. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +2 -8
  493. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +2 -8
  494. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +2 -8
  495. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +3 -9
  496. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +2 -8
  497. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +2 -8
  498. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +2 -10
  499. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +2 -8
  500. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +2 -8
  501. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +2 -8
  502. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +2 -8
  503. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +2 -8
  504. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +2 -8
  505. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +2 -8
  506. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +2 -8
  507. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +2 -8
  508. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +2 -8
  509. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +3 -9
  510. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +2 -8
  511. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +3 -9
  512. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +2 -8
  513. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +2 -8
  514. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +2 -8
  515. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +75 -61
  516. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +4 -99
  517. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +68 -51
  518. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +2 -2
  519. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +4 -99
  520. cuda/cccl/headers/include_paths.py +6 -9
  521. cuda/cccl/parallel/experimental/__init__.py +2 -4
  522. cuda/cccl/parallel/experimental/_bindings.py +38 -15
  523. cuda/cccl/parallel/experimental/_bindings_impl.pyx +36 -9
  524. cuda/cccl/parallel/experimental/_cccl_interop.py +56 -30
  525. cuda/cccl/parallel/experimental/algorithms/_histogram.py +2 -2
  526. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +4 -4
  527. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +4 -4
  528. cuda/cccl/parallel/experimental/algorithms/_reduce.py +2 -2
  529. cuda/cccl/parallel/experimental/algorithms/_scan.py +4 -4
  530. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +4 -4
  531. cuda/cccl/parallel/experimental/algorithms/_transform.py +5 -5
  532. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +5 -5
  533. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  534. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  535. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  536. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  537. cuda/cccl/parallel/experimental/iterators/__init__.py +2 -4
  538. cuda/cccl/parallel/experimental/iterators/_factories.py +28 -51
  539. cuda/cccl/parallel/experimental/iterators/_iterators.py +189 -204
  540. cuda/cccl/parallel/experimental/iterators/_zip_iterator.py +4 -12
  541. cuda/cccl/parallel/experimental/numba_utils.py +47 -0
  542. {cuda_cccl-0.1.3.2.0.dev271.dist-info → cuda_cccl-0.1.3.2.0.dev438.dist-info}/METADATA +8 -6
  543. {cuda_cccl-0.1.3.2.0.dev271.dist-info → cuda_cccl-0.1.3.2.0.dev438.dist-info}/RECORD +545 -530
  544. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +0 -520
  545. cuda/cccl/headers/include/thrust/detail/mpl/math.h +0 -164
  546. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +0 -44
  547. cuda/cccl/headers/include/thrust/detail/util/align.h +0 -59
  548. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +0 -62
  549. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +0 -204
  550. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +0 -92
  551. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +0 -237
  552. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +0 -95
  553. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +0 -62
  554. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +0 -62
  555. {cuda_cccl-0.1.3.2.0.dev271.dist-info → cuda_cccl-0.1.3.2.0.dev438.dist-info}/WHEEL +0 -0
  556. {cuda_cccl-0.1.3.2.0.dev271.dist-info → cuda_cccl-0.1.3.2.0.dev438.dist-info}/licenses/LICENSE +0 -0
@@ -6,9 +6,6 @@
6
6
  CUDA version detection utilities shared across the cccl package.
7
7
  """
8
8
 
9
- import os
10
- import shutil
11
- from pathlib import Path
12
9
  from typing import Optional
13
10
 
14
11
  import cuda.bindings
@@ -19,25 +16,6 @@ def detect_cuda_version() -> Optional[int]:
19
16
  return int(cuda_version.split(".")[0])
20
17
 
21
18
 
22
- def get_cuda_path() -> Optional[Path]:
23
- """Get the CUDA installation path."""
24
- cuda_path_str = os.environ.get("CUDA_PATH")
25
- if cuda_path_str:
26
- cuda_path = Path(cuda_path_str)
27
- if cuda_path.exists():
28
- return cuda_path
29
-
30
- nvcc_path = shutil.which("nvcc")
31
- if nvcc_path:
32
- return Path(nvcc_path).parent.parent
33
-
34
- default_path = Path("/usr/local/cuda")
35
- if default_path.exists():
36
- return default_path
37
-
38
- return None
39
-
40
-
41
19
  def get_recommended_extra(cuda_version: Optional[int]) -> str:
42
20
  """Get the recommended pip extra for the detected CUDA version."""
43
21
  if cuda_version == 13:
@@ -58,7 +58,9 @@ def make_binary_tempfile(content: bytes, suffix: str) -> BinaryIO:
58
58
 
59
59
  :return: A binary file-like object representing the temporary file.
60
60
  """
61
- tmp = tempfile.NamedTemporaryFile(mode="w+b", suffix=suffix, buffering=0)
61
+ tmp = tempfile.NamedTemporaryFile(
62
+ mode="w+b", suffix=suffix, buffering=0, delete=False
63
+ )
62
64
  tmp.write(content)
63
65
  return tmp
64
66
 
@@ -52,9 +52,13 @@
52
52
  #include <cub/util_ptx.cuh>
53
53
  #include <cub/util_type.cuh>
54
54
 
55
- #include <cuda/cmath>
55
+ #include <cuda/__cmath/ceil_div.h>
56
+ #include <cuda/__cmath/round_up.h>
57
+ #include <cuda/std/__functional/operations.h>
58
+ #include <cuda/std/__type_traits/conditional.h>
59
+ #include <cuda/std/__type_traits/enable_if.h>
60
+ #include <cuda/std/__type_traits/type_identity.h>
56
61
  #include <cuda/std/cstdint>
57
- #include <cuda/std/type_traits>
58
62
 
59
63
  CUB_NAMESPACE_BEGIN
60
64
 
@@ -49,7 +49,9 @@
49
49
  #include <cub/iterator/cache_modified_input_iterator.cuh>
50
50
  #include <cub/util_type.cuh>
51
51
 
52
- #include <cuda/std/type_traits>
52
+ #include <cuda/std/__type_traits/conditional.h>
53
+ #include <cuda/std/__type_traits/integral_constant.h>
54
+ #include <cuda/std/__type_traits/is_pointer.h>
53
55
 
54
56
  CUB_NAMESPACE_BEGIN
55
57
 
@@ -25,22 +25,15 @@
25
25
  #include <cuda/std/__algorithm/min.h>
26
26
 
27
27
  CUB_NAMESPACE_BEGIN
28
- namespace detail
28
+ namespace detail::merge
29
29
  {
30
- namespace merge
31
- {
32
- template <int ThreadsPerBlock,
33
- int ItemsPerThread,
34
- BlockLoadAlgorithm LoadAlgorithm,
35
- CacheLoadModifier LoadCacheModifier,
36
- BlockStoreAlgorithm StoreAlgorithm>
30
+ template <int ThreadsPerBlock, int ItemsPerThread, CacheLoadModifier LoadCacheModifier, BlockStoreAlgorithm StoreAlgorithm>
37
31
  struct agent_policy_t
38
32
  {
39
33
  // do not change data member names, policy_wrapper_t depends on it
40
34
  static constexpr int BLOCK_THREADS = ThreadsPerBlock;
41
35
  static constexpr int ITEMS_PER_THREAD = ItemsPerThread;
42
36
  static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD;
43
- static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm;
44
37
  static constexpr CacheLoadModifier LOAD_MODIFIER = LoadCacheModifier;
45
38
  static constexpr BlockStoreAlgorithm STORE_ALGORITHM = StoreAlgorithm;
46
39
  };
@@ -68,34 +61,27 @@ struct agent_t
68
61
  using items_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt1>;
69
62
  using items_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt2>;
70
63
 
71
- using block_load_keys1 = typename BlockLoadType<Policy, keys_load_it1>::type;
72
- using block_load_keys2 = typename BlockLoadType<Policy, keys_load_it2>::type;
73
- using block_load_items1 = typename BlockLoadType<Policy, items_load_it1>::type;
74
- using block_load_items2 = typename BlockLoadType<Policy, items_load_it2>::type;
75
-
76
64
  using block_store_keys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
77
65
  using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;
78
66
 
67
+ static constexpr int items_per_thread = Policy::ITEMS_PER_THREAD;
68
+ static constexpr int threads_per_block = Policy::BLOCK_THREADS;
69
+ static constexpr int items_per_tile = Policy::ITEMS_PER_TILE;
70
+
79
71
  union temp_storages
80
72
  {
81
- typename block_load_keys1::TempStorage load_keys1;
82
- typename block_load_keys2::TempStorage load_keys2;
83
- typename block_load_items1::TempStorage load_items1;
84
- typename block_load_items2::TempStorage load_items2;
85
73
  typename block_store_keys::TempStorage store_keys;
86
74
  typename block_store_items::TempStorage store_items;
87
75
 
88
- key_type keys_shared[Policy::ITEMS_PER_TILE + 1];
89
- item_type items_shared[Policy::ITEMS_PER_TILE + 1];
76
+ // We could change SerialMerge to avoid reading one item out of bounds and drop the + 1 here. But that would
77
+ // introduce more branches (about 10% slower on 2^16 problem sizes on RTX 5090 in a first attempt)
78
+ key_type keys_shared[items_per_tile + 1];
79
+ item_type items_shared[items_per_tile + 1];
90
80
  };
91
81
 
92
82
  struct TempStorage : Uninitialized<temp_storages>
93
83
  {};
94
84
 
95
- static constexpr int items_per_thread = Policy::ITEMS_PER_THREAD;
96
- static constexpr int threads_per_block = Policy::BLOCK_THREADS;
97
- static constexpr Offset items_per_tile = Policy::ITEMS_PER_TILE;
98
-
99
85
  // Per thread data
100
86
  temp_storages& storage;
101
87
  keys_load_it1 keys1_in;
@@ -107,61 +93,86 @@ struct agent_t
107
93
  KeysOutputIt keys_out;
108
94
  ItemsOutputIt items_out;
109
95
  CompareOp compare_op;
110
- Offset* merge_partitions;
96
+ Offset* key1_beg_offsets;
111
97
 
112
98
  template <bool IsFullTile>
113
99
  _CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(Offset tile_idx, Offset tile_base, int num_remaining)
114
100
  {
115
- const Offset partition_beg = merge_partitions[tile_idx + 0];
116
- const Offset partition_end = merge_partitions[tile_idx + 1];
117
-
118
101
  const Offset diag0 = items_per_tile * tile_idx;
119
- const Offset diag1 = (::cuda::std::min) (keys1_count + keys2_count, diag0 + items_per_tile);
102
+ Offset diag1 = diag0 + items_per_tile;
103
+ if constexpr (IsFullTile)
104
+ {
105
+ _CCCL_ASSERT(diag1 <= keys1_count + keys2_count, "");
106
+ }
107
+ else
108
+ {
109
+ diag1 = keys1_count + keys2_count;
110
+ }
120
111
 
121
112
  // compute bounding box for keys1 & keys2
122
- const Offset keys1_beg = partition_beg;
123
- const Offset keys1_end = partition_end;
113
+ const Offset keys1_beg = key1_beg_offsets[tile_idx + 0];
114
+ const Offset keys1_end = key1_beg_offsets[tile_idx + 1];
124
115
  const Offset keys2_beg = diag0 - keys1_beg;
125
116
  const Offset keys2_end = diag1 - keys1_end;
126
117
 
127
118
  // number of keys per tile
128
- const int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
129
- const int num_keys2 = static_cast<int>(keys2_end - keys2_beg);
119
+ const int keys1_count_tile = static_cast<int>(keys1_end - keys1_beg);
120
+ const int keys2_count_tile = static_cast<int>(keys2_end - keys2_beg);
121
+ if constexpr (IsFullTile)
122
+ {
123
+ _CCCL_ASSERT(keys1_count_tile + keys2_count_tile == items_per_tile, "");
124
+ }
125
+ else
126
+ {
127
+ _CCCL_ASSERT(keys1_count_tile + keys2_count_tile == num_remaining, "");
128
+ }
130
129
 
131
130
  key_type keys_loc[items_per_thread];
132
131
  merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
133
- keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2);
132
+ keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, keys1_count_tile, keys2_count_tile);
134
133
  merge_sort::reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
135
134
  __syncthreads();
136
135
 
137
- // use binary search in shared memory to find merge path for each of thread.
136
+ // now find the merge path for each of thread.
138
137
  // we can use int type here, because the number of items in shared memory is limited
139
- const int diag0_loc = (::cuda::std::min) (num_keys1 + num_keys2, static_cast<int>(items_per_thread * threadIdx.x));
138
+ int diag0_thread = items_per_thread * static_cast<int>(threadIdx.x);
139
+ if constexpr (IsFullTile)
140
+ {
141
+ _CCCL_ASSERT(num_remaining == items_per_tile, "");
142
+ _CCCL_ASSERT(diag0_thread < num_remaining, "");
143
+ }
144
+ else
145
+ { // for partial tiles, clamp the thread diagonal to the valid items
146
+ diag0_thread = (::cuda::std::min) (diag0_thread, num_remaining);
147
+ }
140
148
 
141
- const int keys1_beg_loc =
142
- MergePath(&storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_loc, compare_op);
143
- const int keys1_end_loc = num_keys1;
144
- const int keys2_beg_loc = diag0_loc - keys1_beg_loc;
145
- const int keys2_end_loc = num_keys2;
149
+ const int keys1_beg_thread = MergePath(
150
+ &storage.keys_shared[0],
151
+ &storage.keys_shared[keys1_count_tile],
152
+ keys1_count_tile,
153
+ keys2_count_tile,
154
+ diag0_thread,
155
+ compare_op);
156
+ const int keys2_beg_thread = diag0_thread - keys1_beg_thread;
146
157
 
147
- const int num_keys1_loc = keys1_end_loc - keys1_beg_loc;
148
- const int num_keys2_loc = keys2_end_loc - keys2_beg_loc;
158
+ const int keys1_count_thread = keys1_count_tile - keys1_beg_thread;
159
+ const int keys2_count_thread = keys2_count_tile - keys2_beg_thread;
149
160
 
150
161
  // perform serial merge
151
162
  int indices[items_per_thread];
152
- cub::SerialMerge(
163
+ SerialMerge(
153
164
  &storage.keys_shared[0],
154
- keys1_beg_loc,
155
- keys2_beg_loc + num_keys1,
156
- num_keys1_loc,
157
- num_keys2_loc,
165
+ keys1_beg_thread,
166
+ keys2_beg_thread + keys1_count_tile,
167
+ keys1_count_thread,
168
+ keys2_count_thread,
158
169
  keys_loc,
159
170
  indices,
160
171
  compare_op);
161
- __syncthreads();
162
172
 
163
173
  // write keys
164
- if (IsFullTile)
174
+ __syncthreads(); // sync after reading from SMEM before so block store can use SMEM again
175
+ if constexpr (IsFullTile)
165
176
  {
166
177
  block_store_keys{storage.store_keys}.Store(keys_out + tile_base, keys_loc);
167
178
  }
@@ -176,9 +187,8 @@ struct agent_t
176
187
  {
177
188
  item_type items_loc[items_per_thread];
178
189
  merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
179
- items_loc, items1_in + keys1_beg, items2_in + keys2_beg, num_keys1, num_keys2);
180
- __syncthreads(); // block_store_keys above uses shared memory, so make sure all threads are done before we write
181
- // to it
190
+ items_loc, items1_in + keys1_beg, items2_in + keys2_beg, keys1_count_tile, keys2_count_tile);
191
+ __syncthreads(); // block_store_keys above uses SMEM, so make sure all threads are done before we write to it
182
192
  merge_sort::reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
183
193
  __syncthreads();
184
194
 
@@ -191,7 +201,7 @@ struct agent_t
191
201
  __syncthreads();
192
202
 
193
203
  // write from reg to gmem
194
- if (IsFullTile)
204
+ if constexpr (IsFullTile)
195
205
  {
196
206
  block_store_items{storage.store_items}.Store(items_out + tile_base, items_loc);
197
207
  }
@@ -204,23 +214,19 @@ struct agent_t
204
214
 
205
215
  _CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
206
216
  {
207
- // XXX with 8.5 changing type to Offset (or long long) results in error!
208
- // TODO(bgruber): is the above still true?
209
- const int tile_idx = static_cast<int>(blockIdx.x);
217
+ const Offset tile_idx = blockIdx.x;
210
218
  const Offset tile_base = tile_idx * items_per_tile;
211
- // TODO(bgruber): random mixing of int and Offset
212
219
  const int items_in_tile =
213
220
  static_cast<int>((::cuda::std::min) (static_cast<Offset>(items_per_tile), keys1_count + keys2_count - tile_base));
214
221
  if (items_in_tile == items_per_tile)
215
222
  {
216
- consume_tile<true>(tile_idx, tile_base, items_per_tile); // full tile
223
+ consume_tile</* IsFullTile */ true>(tile_idx, tile_base, items_per_tile);
217
224
  }
218
225
  else
219
226
  {
220
- consume_tile<false>(tile_idx, tile_base, items_in_tile); // partial tile
227
+ consume_tile</* IsFullTile */ false>(tile_idx, tile_base, items_in_tile);
221
228
  }
222
229
  }
223
230
  };
224
- } // namespace merge
225
- } // namespace detail
231
+ } // namespace detail::merge
226
232
  CUB_NAMESPACE_END
@@ -50,8 +50,11 @@
50
50
  #include <cub/util_math.cuh>
51
51
  #include <cub/util_type.cuh>
52
52
 
53
- #include <cuda/ptx>
54
- #include <cuda/std/__algorithm_>
53
+ #include <cuda/__cmath/ceil_div.h>
54
+ #include <cuda/__ptx/instructions/get_sreg.h>
55
+ #include <cuda/std/__algorithm/max.h>
56
+ #include <cuda/std/__algorithm/min.h>
57
+ #include <cuda/std/__functional/operations.h>
55
58
 
56
59
  CUB_NAMESPACE_BEGIN
57
60
 
@@ -49,8 +49,10 @@
49
49
  #include <cub/util_ptx.cuh>
50
50
  #include <cub/util_type.cuh>
51
51
 
52
- #include <cuda/ptx>
53
- #include <cuda/std/type_traits>
52
+ #include <cuda/__ptx/instructions/get_sreg.h>
53
+ #include <cuda/std/__type_traits/conditional.h>
54
+ #include <cuda/std/__type_traits/integral_constant.h>
55
+ #include <cuda/std/__type_traits/is_same.h>
54
56
 
55
57
  CUB_NAMESPACE_BEGIN
56
58
 
@@ -52,8 +52,9 @@
52
52
  #include <cub/util_type.cuh>
53
53
  #include <cub/warp/warp_reduce.cuh>
54
54
 
55
- #include <cuda/ptx>
56
- #include <cuda/std/__algorithm_>
55
+ #include <cuda/__ptx/instructions/get_sreg.h>
56
+ #include <cuda/std/__algorithm/max.h>
57
+ #include <cuda/std/__algorithm/min.h>
57
58
 
58
59
  CUB_NAMESPACE_BEGIN
59
60
 
@@ -50,9 +50,12 @@
50
50
  #include <cub/util_device.cuh>
51
51
  #include <cub/util_type.cuh>
52
52
 
53
- #include <cuda/memory>
54
- #include <cuda/std/functional>
55
- #include <cuda/std/type_traits>
53
+ #include <cuda/std/__algorithm/min.h>
54
+ #include <cuda/std/__functional/identity.h>
55
+ #include <cuda/std/__functional/operations.h>
56
+ #include <cuda/std/__memory/is_sufficiently_aligned.h>
57
+ #include <cuda/std/__type_traits/conditional.h>
58
+ #include <cuda/std/__type_traits/is_pointer.h>
56
59
 
57
60
  CUB_NAMESPACE_BEGIN
58
61
 
@@ -172,9 +175,6 @@ namespace detail::reduce
172
175
  * @tparam InputIteratorT
173
176
  * Random-access iterator type for input
174
177
  *
175
- * @tparam OutputIteratorT
176
- * Random-access iterator type for output
177
- *
178
178
  * @tparam OffsetT
179
179
  * Signed integer type for global offsets
180
180
  *
@@ -199,7 +199,6 @@ namespace detail::reduce
199
199
  */
200
200
  template <typename AgentReducePolicy,
201
201
  typename InputIteratorT,
202
- typename OutputIteratorT,
203
202
  typename OffsetT,
204
203
  typename ReductionOp,
205
204
  typename AccumT,
@@ -271,7 +270,7 @@ struct AgentReduceImpl
271
270
  {
272
271
  if constexpr (AttemptVectorization)
273
272
  {
274
- return ::cuda::is_aligned(d_in, sizeof(VectorT));
273
+ return ::cuda::std::is_sufficiently_aligned<alignof(VectorT)>(d_in);
275
274
  }
276
275
  else
277
276
  {
@@ -503,9 +502,6 @@ private:
503
502
  * @tparam InputIteratorT
504
503
  * Random-access iterator type for input
505
504
  *
506
- * @tparam OutputIteratorT
507
- * Random-access iterator type for output
508
- *
509
505
  * @tparam OffsetT
510
506
  * Signed integer type for global offsets
511
507
  *
@@ -521,7 +517,6 @@ private:
521
517
  */
522
518
  template <typename AgentReducePolicy,
523
519
  typename InputIteratorT,
524
- typename OutputIteratorT,
525
520
  typename OffsetT,
526
521
  typename ReductionOp,
527
522
  typename AccumT,
@@ -529,7 +524,6 @@ template <typename AgentReducePolicy,
529
524
  struct AgentReduce
530
525
  : AgentReduceImpl<AgentReducePolicy,
531
526
  InputIteratorT,
532
- OutputIteratorT,
533
527
  OffsetT,
534
528
  ReductionOp,
535
529
  AccumT,
@@ -540,7 +534,6 @@ struct AgentReduce
540
534
  using base_t =
541
535
  AgentReduceImpl<AgentReducePolicy,
542
536
  InputIteratorT,
543
- OutputIteratorT,
544
537
  OffsetT,
545
538
  ReductionOp,
546
539
  AccumT,
@@ -571,9 +564,6 @@ struct AgentReduce
571
564
  * @tparam InputIteratorT
572
565
  * Random-access iterator type for input
573
566
  *
574
- * @tparam OutputIteratorT
575
- * Random-access iterator type for output
576
- *
577
567
  * @tparam OffsetT
578
568
  * Signed integer type for global offsets
579
569
  *
@@ -589,7 +579,6 @@ struct AgentReduce
589
579
  */
590
580
  template <typename AgentReducePolicy,
591
581
  typename InputIteratorT,
592
- typename OutputIteratorT,
593
582
  typename OffsetT,
594
583
  typename ReductionOp,
595
584
  typename AccumT,
@@ -597,7 +586,6 @@ template <typename AgentReducePolicy,
597
586
  struct AgentWarpReduce
598
587
  : AgentReduceImpl<AgentReducePolicy,
599
588
  InputIteratorT,
600
- OutputIteratorT,
601
589
  OffsetT,
602
590
  ReductionOp,
603
591
  AccumT,
@@ -609,7 +597,6 @@ struct AgentWarpReduce
609
597
  using base_t =
610
598
  AgentReduceImpl<AgentReducePolicy,
611
599
  InputIteratorT,
612
- OutputIteratorT,
613
600
  OffsetT,
614
601
  ReductionOp,
615
602
  AccumT,
@@ -50,8 +50,10 @@
50
50
  #include <cub/block/block_store.cuh>
51
51
  #include <cub/iterator/cache_modified_input_iterator.cuh>
52
52
 
53
- #include <cuda/std/type_traits>
54
-
53
+ #include <cuda/std/__functional/operations.h>
54
+ #include <cuda/std/__type_traits/conditional.h>
55
+ #include <cuda/std/__type_traits/is_pointer.h>
56
+ #include <cuda/std/__type_traits/is_same.h>
55
57
  CUB_NAMESPACE_BEGIN
56
58
 
57
59
  /******************************************************************************
@@ -50,11 +50,14 @@
50
50
  #include <cub/block/block_load.cuh>
51
51
  #include <cub/block/block_scan.cuh>
52
52
  #include <cub/block/block_store.cuh>
53
- #include <cub/grid/grid_queue.cuh>
54
53
  #include <cub/iterator/cache_modified_input_iterator.cuh>
55
54
 
56
- #include <cuda/ptx>
57
- #include <cuda/std/type_traits>
55
+ #include <cuda/__ptx/instructions/get_sreg.h>
56
+ #include <cuda/std/__functional/operations.h>
57
+ #include <cuda/std/__type_traits/conditional.h>
58
+ #include <cuda/std/__type_traits/integral_constant.h>
59
+ #include <cuda/std/__type_traits/is_pointer.h>
60
+ #include <cuda/std/__type_traits/is_same.h>
58
61
 
59
62
  CUB_NAMESPACE_BEGIN
60
63
 
@@ -49,8 +49,11 @@
49
49
  #include <cub/block/block_store.cuh>
50
50
  #include <cub/grid/grid_queue.cuh>
51
51
  #include <cub/iterator/cache_modified_input_iterator.cuh>
52
+ #include <cub/util_device.cuh>
52
53
 
53
- #include <cuda/std/type_traits>
54
+ #include <cuda/std/__type_traits/conditional.h>
55
+ #include <cuda/std/__type_traits/is_pointer.h>
56
+ #include <cuda/std/__type_traits/is_same.h>
54
57
 
55
58
  CUB_NAMESPACE_BEGIN
56
59
 
@@ -109,6 +112,27 @@ struct AgentScanPolicy : ScalingType
109
112
  };
110
113
  };
111
114
 
115
+ #if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
116
+ namespace detail
117
+ {
118
+ // Only define this when needed.
119
+ // Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
120
+ // either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
121
+ // version is always defined, and that's the only one needed for regular CUB operations.
122
+ //
123
+ // TODO: enable this unconditionally once concepts are always available
124
+ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
125
+ ScanAgentPolicy,
126
+ (GenericAgentPolicy),
127
+ (BLOCK_THREADS, BlockThreads, int),
128
+ (ITEMS_PER_THREAD, ItemsPerThread, int),
129
+ (LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
130
+ (LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
131
+ (STORE_ALGORITHM, StoreAlgorithm, cub::BlockStoreAlgorithm),
132
+ (SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
133
+ } // namespace detail
134
+ #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
135
+
112
136
  /******************************************************************************
113
137
  * Thread block abstractions
114
138
  ******************************************************************************/
@@ -50,7 +50,11 @@
50
50
  #include <cub/iterator/cache_modified_input_iterator.cuh>
51
51
  #include <cub/util_type.cuh>
52
52
 
53
- #include <cuda/std/type_traits>
53
+ #include <cuda/std/__type_traits/conditional.h>
54
+ #include <cuda/std/__type_traits/enable_if.h>
55
+ #include <cuda/std/__type_traits/integral_constant.h>
56
+ #include <cuda/std/__type_traits/is_pointer.h>
57
+ #include <cuda/std/__type_traits/is_same.h>
54
58
 
55
59
  CUB_NAMESPACE_BEGIN
56
60
 
@@ -50,11 +50,17 @@
50
50
  #include <cub/block/block_scan.cuh>
51
51
  #include <cub/block/block_store.cuh>
52
52
  #include <cub/device/dispatch/dispatch_common.cuh>
53
- #include <cub/grid/grid_queue.cuh>
54
53
  #include <cub/iterator/cache_modified_input_iterator.cuh>
55
54
  #include <cub/util_type.cuh>
56
55
 
57
- #include <cuda/std/type_traits>
56
+ #include <cuda/std/__functional/operations.h>
57
+ #include <cuda/std/__type_traits/conditional.h>
58
+ #include <cuda/std/__type_traits/enable_if.h>
59
+ #include <cuda/std/__type_traits/integral_constant.h>
60
+ #include <cuda/std/__type_traits/is_callable.h>
61
+ #include <cuda/std/__type_traits/is_pointer.h>
62
+ #include <cuda/std/__type_traits/is_same.h>
63
+ #include <cuda/std/cstdint>
58
64
 
59
65
  CUB_NAMESPACE_BEGIN
60
66
 
@@ -44,8 +44,12 @@
44
44
  #include <cub/block/block_scan.cuh>
45
45
  #include <cub/block/block_store.cuh>
46
46
  #include <cub/iterator/cache_modified_input_iterator.cuh>
47
+ #include <cub/util_device.cuh>
47
48
 
48
- #include <cuda/std/type_traits>
49
+ #include <cuda/std/__functional/operations.h>
50
+ #include <cuda/std/__type_traits/conditional.h>
51
+ #include <cuda/std/__type_traits/enable_if.h>
52
+ #include <cuda/std/__type_traits/is_pointer.h>
49
53
 
50
54
  CUB_NAMESPACE_BEGIN
51
55
 
@@ -73,9 +77,22 @@ struct AgentThreeWayPartitionPolicy
73
77
  };
74
78
  };
75
79
 
80
+ #if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
76
81
  namespace detail
77
82
  {
83
+ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
84
+ ThreeWayPartitionAgentPolicy,
85
+ (GenericAgentPolicy),
86
+ (BLOCK_THREADS, BlockThreads, int),
87
+ (ITEMS_PER_THREAD, ItemsPerThread, int),
88
+ (LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
89
+ (LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
90
+ (SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
91
+ } // namespace detail
92
+ #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
78
93
 
94
+ namespace detail
95
+ {
79
96
  namespace three_way_partition
80
97
  {
81
98