cuda-cccl 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.4__cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (911) hide show
  1. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +12 -38
  2. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +16 -40
  3. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -28
  4. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +24 -56
  5. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +12 -38
  6. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +31 -56
  7. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +31 -35
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +47 -48
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +39 -42
  10. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +33 -60
  11. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +18 -44
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +26 -55
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +22 -49
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +15 -41
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +9 -35
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +20 -49
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +14 -40
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +18 -40
  19. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +0 -2
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +20 -46
  21. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +3 -28
  22. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +7 -31
  23. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +10 -34
  24. cuda/cccl/headers/include/cub/block/block_exchange.cuh +120 -154
  25. cuda/cccl/headers/include/cub/block/block_histogram.cuh +28 -52
  26. cuda/cccl/headers/include/cub/block/block_load.cuh +124 -146
  27. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +0 -16
  28. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +58 -87
  29. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +81 -100
  30. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +92 -156
  31. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +8 -32
  32. cuda/cccl/headers/include/cub/block/block_reduce.cuh +21 -46
  33. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +51 -79
  34. cuda/cccl/headers/include/cub/block/block_scan.cuh +94 -401
  35. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +10 -34
  36. cuda/cccl/headers/include/cub/block/block_store.cuh +73 -97
  37. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +2 -29
  38. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +5 -29
  39. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +25 -49
  40. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +12 -34
  41. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +10 -34
  42. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +3 -27
  43. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +12 -36
  44. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +9 -33
  45. cuda/cccl/headers/include/cub/config.cuh +2 -26
  46. cuda/cccl/headers/include/cub/cub.cuh +3 -27
  47. cuda/cccl/headers/include/cub/detail/array_utils.cuh +2 -26
  48. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +2 -28
  49. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +3 -27
  50. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +0 -2
  51. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -3
  52. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +2 -28
  53. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +0 -2
  54. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +0 -2
  55. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +0 -2
  56. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +0 -2
  57. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +7 -12
  58. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +6 -33
  59. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +13 -36
  60. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +9 -38
  61. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +58 -32
  62. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +51 -51
  63. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +7 -31
  64. cuda/cccl/headers/include/cub/detail/rfa.cuh +2 -27
  65. cuda/cccl/headers/include/cub/detail/strong_load.cuh +3 -29
  66. cuda/cccl/headers/include/cub/detail/strong_store.cuh +3 -29
  67. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +2 -9
  68. cuda/cccl/headers/include/cub/detail/type_traits.cuh +0 -2
  69. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +6 -31
  70. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +2 -25
  71. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +2 -26
  72. cuda/cccl/headers/include/cub/device/device_for.cuh +3 -5
  73. cuda/cccl/headers/include/cub/device/device_histogram.cuh +3 -27
  74. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +2 -26
  75. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +2 -26
  76. cuda/cccl/headers/include/cub/device/device_partition.cuh +3 -27
  77. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3 -27
  78. cuda/cccl/headers/include/cub/device/device_reduce.cuh +10 -31
  79. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +3 -27
  80. cuda/cccl/headers/include/cub/device/device_scan.cuh +16 -34
  81. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +3 -27
  82. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +3 -27
  83. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2 -26
  84. cuda/cccl/headers/include/cub/device/device_select.cuh +3 -27
  85. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +2 -28
  86. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +2 -27
  87. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +0 -2
  88. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +3 -29
  89. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +14 -34
  90. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +5 -30
  91. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +4 -29
  92. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +5 -32
  93. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +3 -29
  94. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -29
  95. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +1 -2
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +47 -59
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +21 -30
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +2 -27
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +3 -27
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +3 -27
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +0 -2
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +51 -36
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +3 -28
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +0 -1
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +27 -55
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +4 -28
  107. cuda/cccl/headers/include/cub/device/dispatch/kernels/{for_each.cuh → kernel_for_each.cuh} +0 -2
  108. cuda/cccl/headers/include/cub/device/dispatch/kernels/{histogram.cuh → kernel_histogram.cuh} +149 -157
  109. cuda/cccl/headers/include/cub/device/dispatch/kernels/{merge_sort.cuh → kernel_merge_sort.cuh} +0 -2
  110. cuda/cccl/headers/include/cub/device/dispatch/kernels/{radix_sort.cuh → kernel_radix_sort.cuh} +0 -2
  111. cuda/cccl/headers/include/cub/device/dispatch/kernels/{reduce.cuh → kernel_reduce.cuh} +2 -28
  112. cuda/cccl/headers/include/cub/device/dispatch/kernels/{scan.cuh → kernel_scan.cuh} +2 -28
  113. cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_reduce.cuh → kernel_segmented_reduce.cuh} +3 -29
  114. cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_sort.cuh → kernel_segmented_sort.cuh} +0 -1
  115. cuda/cccl/headers/include/cub/device/dispatch/kernels/{three_way_partition.cuh → kernel_three_way_partition.cuh} +0 -1
  116. cuda/cccl/headers/include/cub/device/dispatch/kernels/{transform.cuh → kernel_transform.cuh} +11 -11
  117. cuda/cccl/headers/include/cub/device/dispatch/kernels/{unique_by_key.cuh → kernel_unique_by_key.cuh} +0 -1
  118. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -26
  119. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -26
  120. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -28
  121. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +6 -26
  122. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -26
  123. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +5 -31
  124. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +31 -33
  125. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +15 -40
  126. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -26
  127. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -28
  128. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +20 -44
  129. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -26
  130. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +20 -45
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +2 -27
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +11 -36
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +0 -1
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +2 -27
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +14 -40
  136. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +3 -27
  137. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +3 -27
  138. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +3 -27
  139. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +3 -27
  140. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +3 -27
  141. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +3 -27
  142. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +3 -27
  143. cuda/cccl/headers/include/cub/thread/thread_load.cuh +3 -28
  144. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +3 -27
  145. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +3 -26
  146. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +3 -29
  147. cuda/cccl/headers/include/cub/thread/thread_search.cuh +3 -27
  148. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +0 -2
  149. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +2 -26
  150. cuda/cccl/headers/include/cub/thread/thread_store.cuh +3 -27
  151. cuda/cccl/headers/include/cub/util_allocator.cuh +3 -27
  152. cuda/cccl/headers/include/cub/util_arch.cuh +3 -29
  153. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +2 -26
  154. cuda/cccl/headers/include/cub/util_debug.cuh +3 -27
  155. cuda/cccl/headers/include/cub/util_device.cuh +18 -59
  156. cuda/cccl/headers/include/cub/util_macro.cuh +4 -28
  157. cuda/cccl/headers/include/cub/util_math.cuh +2 -28
  158. cuda/cccl/headers/include/cub/util_namespace.cuh +3 -28
  159. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +3 -27
  160. cuda/cccl/headers/include/cub/util_ptx.cuh +6 -30
  161. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +3 -29
  162. cuda/cccl/headers/include/cub/util_type.cuh +5 -32
  163. cuda/cccl/headers/include/cub/util_vsmem.cuh +2 -28
  164. cuda/cccl/headers/include/cub/version.cuh +2 -26
  165. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +10 -35
  166. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +5 -30
  167. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +15 -39
  168. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +5 -35
  169. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +22 -46
  170. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +3 -27
  171. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +2 -26
  172. cuda/cccl/headers/include/cub/warp/warp_load.cuh +4 -27
  173. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +2 -26
  174. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +3 -22
  175. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +3 -27
  176. cuda/cccl/headers/include/cub/warp/warp_store.cuh +4 -27
  177. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +0 -2
  178. cuda/cccl/headers/include/cuda/__barrier/barrier.h +1 -1
  179. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +0 -1
  180. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +277 -235
  181. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +0 -1
  182. cuda/cccl/headers/include/cuda/__driver/driver_api.h +13 -0
  183. cuda/cccl/headers/include/cuda/__execution/determinism.h +0 -2
  184. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +0 -2
  185. cuda/cccl/headers/include/cuda/__functional/maximum.h +25 -7
  186. cuda/cccl/headers/include/cuda/__functional/minimum.h +25 -7
  187. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  188. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +0 -2
  189. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +13 -4
  190. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +4 -2
  191. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +0 -1
  192. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +28 -7
  193. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +1 -1
  194. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  195. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +2 -3
  196. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +1 -7
  197. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +0 -1
  198. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +1 -1
  199. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +126 -0
  200. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +898 -0
  201. cuda/cccl/headers/include/cuda/__memory_resource/device_memory_pool.h +149 -0
  202. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +3 -3
  203. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  204. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +139 -0
  205. cuda/cccl/headers/include/cuda/__memory_resource/managed_memory_pool.h +146 -0
  206. cuda/cccl/headers/include/cuda/__memory_resource/memory_resource_base.h +578 -0
  207. cuda/cccl/headers/include/cuda/__memory_resource/pinned_memory_pool.h +188 -0
  208. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +3 -3
  209. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +37 -3
  210. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +13 -3
  211. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  212. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +2 -2
  213. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +344 -0
  214. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +0 -6
  215. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +1 -1
  216. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  217. cuda/cccl/headers/include/cuda/{std/__cuda → __runtime}/api_wrapper.h +3 -3
  218. cuda/cccl/headers/include/cuda/__stream/get_stream.h +0 -1
  219. cuda/cccl/headers/include/cuda/{__fwd/barrier_native_handle.h → __stream/internal_streams.h} +17 -15
  220. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +2 -2
  221. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +1 -0
  222. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +1 -0
  223. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +2 -1
  224. cuda/cccl/headers/include/cuda/barrier +42 -16
  225. cuda/cccl/headers/include/cuda/memory +1 -0
  226. cuda/cccl/headers/include/cuda/memory_resource +6 -1
  227. cuda/cccl/headers/include/cuda/numeric +2 -0
  228. cuda/cccl/headers/include/cuda/pipeline +3 -2
  229. cuda/cccl/headers/include/cuda/ptx +1 -0
  230. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +0 -2
  231. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +1 -1
  232. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +115 -58
  233. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +844 -378
  234. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +12 -5
  235. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +31 -0
  236. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +10 -0
  237. cuda/cccl/headers/include/cuda/std/__atomic/types.h +2 -3
  238. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +37 -13
  239. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +0 -28
  240. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +7 -0
  241. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +10 -0
  242. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +2 -45
  243. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +0 -2
  244. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +8 -0
  245. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +0 -2
  246. cuda/cccl/headers/include/cuda/std/__chrono/day.h +0 -2
  247. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +13 -17
  248. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +0 -2
  249. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +0 -2
  250. cuda/cccl/headers/include/cuda/std/__chrono/month.h +0 -2
  251. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +0 -2
  252. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +0 -2
  253. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +5 -8
  254. cuda/cccl/headers/include/cuda/std/__chrono/year.h +0 -2
  255. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +4 -0
  256. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +2 -3
  257. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +4 -0
  258. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +4 -0
  259. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +2 -3
  260. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +2 -3
  261. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +2 -3
  262. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +2 -3
  263. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +2 -3
  264. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +2 -3
  265. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +2 -3
  266. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +2 -3
  267. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +2 -3
  268. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +2 -3
  269. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +2 -2
  270. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +4 -0
  271. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +2 -3
  272. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +2 -3
  273. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +4 -0
  274. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +2 -3
  275. cuda/cccl/headers/include/cuda/std/__complex/complex.h +0 -6
  276. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +2 -2
  277. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +27 -1
  278. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +2 -4
  279. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +15 -36
  280. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +93 -0
  281. cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/stdexcept → __exception/throw_error.h} +3 -3
  282. cuda/cccl/headers/include/cuda/std/__expected/expected.h +28 -43
  283. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +2 -10
  284. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +2 -2
  285. cuda/cccl/headers/include/cuda/std/__functional/bind.h +6 -6
  286. cuda/cccl/headers/include/cuda/std/__functional/function.h +2 -6
  287. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +5 -5
  288. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +5 -0
  289. cuda/cccl/headers/include/cuda/std/__fwd/array.h +2 -2
  290. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +12 -0
  291. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  292. cuda/cccl/headers/include/cuda/std/__fwd/get.h +21 -22
  293. cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/iosfwd → __fwd/ios.h} +5 -10
  294. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +19 -10
  295. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +2 -2
  296. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +5 -0
  297. cuda/cccl/headers/include/cuda/std/__fwd/span.h +2 -2
  298. cuda/cccl/headers/include/cuda/std/__fwd/string.h +7 -0
  299. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +18 -0
  300. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +3 -0
  301. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  302. cuda/cccl/headers/include/cuda/std/{__type_traits/is_reference_wrapper.h → __fwd/variant.h} +16 -15
  303. cuda/cccl/headers/include/cuda/std/__internal/features.h +14 -0
  304. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +1 -1
  305. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +1 -1
  306. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +58 -40
  307. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +1 -1
  308. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +1 -1
  309. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +0 -5
  310. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +4 -18
  311. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +1 -2
  312. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +0 -2
  313. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +0 -2
  314. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +0 -4
  315. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +0 -5
  316. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +3 -10
  317. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +4 -15
  318. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +4 -4
  319. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +4 -4
  320. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +2 -4
  321. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +3 -3
  322. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +1 -1
  323. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +1 -0
  324. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +6 -12
  325. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -5
  326. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +7 -2
  327. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +1 -0
  328. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +5 -0
  329. cuda/cccl/headers/include/cuda/std/__new/allocate.h +5 -0
  330. cuda/cccl/headers/include/cuda/{__barrier/barrier_native_handle.h → std/__new/device_new.h} +9 -24
  331. cuda/cccl/headers/include/cuda/std/__new_ +1 -0
  332. cuda/cccl/headers/include/cuda/std/__optional/optional.h +5 -4
  333. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +4 -4
  334. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +1 -1
  335. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  336. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  337. cuda/cccl/headers/include/cuda/std/__random_ +2 -0
  338. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +7 -19
  339. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -4
  340. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +5 -4
  341. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +1 -1
  342. cuda/cccl/headers/include/cuda/std/__string/string_view.h +5 -5
  343. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  344. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  345. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +0 -160
  346. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +123 -129
  347. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  348. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  349. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  350. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  351. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +7 -0
  352. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  353. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +1 -2
  354. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  355. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +1 -1
  356. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  357. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +0 -2
  358. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +4 -24
  359. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +0 -2
  360. cuda/cccl/headers/include/cuda/std/__utility/pair.h +20 -20
  361. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +0 -2
  362. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  363. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  364. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  365. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  366. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  367. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  368. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  369. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  370. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  371. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  372. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  373. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  374. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  375. cuda/cccl/headers/include/cuda/std/array +1 -1
  376. cuda/cccl/headers/include/cuda/std/atomic +1 -1
  377. cuda/cccl/headers/include/cuda/std/bitset +2 -10
  378. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +6 -6
  379. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1 -4
  380. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3 -6
  381. cuda/cccl/headers/include/cuda/std/functional +1 -1
  382. cuda/cccl/headers/include/cuda/std/initializer_list +8 -0
  383. cuda/cccl/headers/include/cuda/std/inplace_vector +6 -5
  384. cuda/cccl/headers/include/cuda/std/iterator +1 -1
  385. cuda/cccl/headers/include/cuda/std/numbers +0 -2
  386. cuda/cccl/headers/include/cuda/std/ratio +2 -2
  387. cuda/cccl/headers/include/cuda/std/span +2 -2
  388. cuda/cccl/headers/include/cuda/std/string_view +24 -42
  389. cuda/cccl/headers/include/cuda/std/tuple +18 -1
  390. cuda/cccl/headers/include/cuda/std/type_traits +0 -1
  391. cuda/cccl/headers/include/cuda/std/variant +8 -1
  392. cuda/cccl/headers/include/nv/target +2 -6
  393. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +15 -2
  394. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +0 -2
  395. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +0 -1
  396. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +0 -1
  397. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +0 -2
  398. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +0 -2
  399. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +0 -2
  400. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +0 -2
  401. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +0 -2
  402. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +0 -4
  403. cuda/cccl/headers/include/thrust/detail/binary_search.inl +14 -2
  404. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +2 -7
  405. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +2 -8
  406. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +2 -8
  407. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +2 -8
  408. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +2 -8
  409. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +2 -8
  410. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +2 -7
  411. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +2 -8
  412. cuda/cccl/headers/include/thrust/detail/complex/clog.h +2 -8
  413. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +2 -8
  414. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +2 -7
  415. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +2 -8
  416. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +2 -8
  417. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +2 -8
  418. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +2 -8
  419. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +2 -8
  420. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +2 -8
  421. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +2 -8
  422. cuda/cccl/headers/include/thrust/detail/config/device_system.h +2 -0
  423. cuda/cccl/headers/include/thrust/detail/config/host_system.h +2 -0
  424. cuda/cccl/headers/include/thrust/detail/config/namespace.h +0 -1
  425. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +0 -2
  426. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +0 -2
  427. cuda/cccl/headers/include/thrust/detail/copy.h +0 -2
  428. cuda/cccl/headers/include/thrust/detail/copy.inl +14 -4
  429. cuda/cccl/headers/include/thrust/detail/copy_if.inl +14 -2
  430. cuda/cccl/headers/include/thrust/detail/count.inl +14 -2
  431. cuda/cccl/headers/include/thrust/detail/equal.inl +14 -2
  432. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +4 -5
  433. cuda/cccl/headers/include/thrust/detail/extrema.inl +14 -2
  434. cuda/cccl/headers/include/thrust/detail/fill.inl +14 -2
  435. cuda/cccl/headers/include/thrust/detail/find.inl +14 -2
  436. cuda/cccl/headers/include/thrust/detail/for_each.inl +14 -2
  437. cuda/cccl/headers/include/thrust/detail/functional/actor.h +2 -5
  438. cuda/cccl/headers/include/thrust/detail/functional/operators.h +2 -5
  439. cuda/cccl/headers/include/thrust/detail/gather.inl +14 -2
  440. cuda/cccl/headers/include/thrust/detail/generate.inl +14 -2
  441. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +0 -2
  442. cuda/cccl/headers/include/thrust/detail/inner_product.inl +14 -2
  443. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -0
  444. cuda/cccl/headers/include/thrust/detail/logical.inl +14 -2
  445. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +13 -1
  446. cuda/cccl/headers/include/thrust/detail/merge.inl +14 -2
  447. cuda/cccl/headers/include/thrust/detail/mismatch.inl +14 -2
  448. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +0 -4
  449. cuda/cccl/headers/include/thrust/detail/partition.inl +14 -2
  450. cuda/cccl/headers/include/thrust/detail/random_bijection.h +0 -2
  451. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +0 -2
  452. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +0 -2
  453. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +0 -6
  454. cuda/cccl/headers/include/thrust/detail/reduce.inl +21 -3
  455. cuda/cccl/headers/include/thrust/detail/reference.h +27 -3
  456. cuda/cccl/headers/include/thrust/detail/remove.inl +14 -2
  457. cuda/cccl/headers/include/thrust/detail/replace.inl +14 -2
  458. cuda/cccl/headers/include/thrust/detail/reverse.inl +14 -2
  459. cuda/cccl/headers/include/thrust/detail/scan.inl +21 -3
  460. cuda/cccl/headers/include/thrust/detail/scatter.inl +14 -2
  461. cuda/cccl/headers/include/thrust/detail/sequence.inl +13 -1
  462. cuda/cccl/headers/include/thrust/detail/set_operations.inl +13 -1
  463. cuda/cccl/headers/include/thrust/detail/sort.inl +13 -1
  464. cuda/cccl/headers/include/thrust/detail/static_assert.h +0 -2
  465. cuda/cccl/headers/include/thrust/detail/static_map.h +0 -3
  466. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +13 -1
  467. cuda/cccl/headers/include/thrust/detail/tabulate.inl +14 -2
  468. cuda/cccl/headers/include/thrust/detail/temporary_array.h +0 -4
  469. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +0 -1
  470. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +14 -3
  471. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +13 -1
  472. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +13 -1
  473. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +0 -2
  474. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +0 -2
  475. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +2 -7
  476. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +0 -2
  477. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +0 -4
  478. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +0 -4
  479. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +14 -2
  480. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +14 -2
  481. cuda/cccl/headers/include/thrust/detail/unique.inl +21 -3
  482. cuda/cccl/headers/include/thrust/detail/vector_base.h +0 -2
  483. cuda/cccl/headers/include/thrust/detail/vector_base.inl +0 -2
  484. cuda/cccl/headers/include/thrust/execution_policy.h +10 -9
  485. cuda/cccl/headers/include/thrust/functional.h +0 -2
  486. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +9 -4
  487. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +8 -4
  488. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +0 -1
  489. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +0 -1
  490. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +0 -1
  491. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +0 -1
  492. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +0 -1
  493. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +2 -6
  494. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +0 -1
  495. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +0 -2
  496. cuda/cccl/headers/include/thrust/mr/allocator.h +0 -2
  497. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +9 -4
  498. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +10 -10
  499. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +0 -2
  500. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +0 -2
  501. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +0 -2
  502. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +8 -4
  503. cuda/cccl/headers/include/thrust/mr/memory_resource.h +0 -2
  504. cuda/cccl/headers/include/thrust/mr/new.h +0 -2
  505. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +0 -2
  506. cuda/cccl/headers/include/thrust/mr/pool.h +10 -10
  507. cuda/cccl/headers/include/thrust/mr/pool_options.h +4 -6
  508. cuda/cccl/headers/include/thrust/mr/sync_pool.h +0 -2
  509. cuda/cccl/headers/include/thrust/mr/tls_pool.h +0 -2
  510. cuda/cccl/headers/include/thrust/mr/validator.h +0 -2
  511. cuda/cccl/headers/include/thrust/per_device_resource.h +13 -1
  512. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +0 -2
  513. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +0 -2
  514. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +2 -9
  515. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +0 -2
  516. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +2 -9
  517. cuda/cccl/headers/include/thrust/random/detail/mod.h +2 -9
  518. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +0 -2
  519. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +2 -7
  520. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +2 -9
  521. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +0 -2
  522. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +0 -2
  523. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +0 -2
  524. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +0 -2
  525. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +0 -2
  526. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +0 -2
  527. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +0 -2
  528. cuda/cccl/headers/include/thrust/random/normal_distribution.h +0 -2
  529. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +0 -2
  530. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +0 -2
  531. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +0 -2
  532. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +0 -2
  533. cuda/cccl/headers/include/thrust/random.h +0 -2
  534. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +15 -11
  535. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +2 -7
  536. cuda/cccl/headers/include/thrust/system/cpp/memory.h +0 -1
  537. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +0 -2
  538. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +0 -2
  539. cuda/cccl/headers/include/thrust/system/cpp/vector.h +0 -1
  540. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +0 -4
  541. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +0 -1
  542. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +0 -4
  543. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +2 -9
  544. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +4 -32
  545. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +2 -9
  546. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +0 -2
  547. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +0 -2
  548. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +23 -2
  549. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +0 -2
  550. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +2 -11
  551. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +2 -0
  552. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +0 -4
  553. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +0 -1
  554. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +0 -5
  555. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +0 -1
  556. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +0 -2
  557. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +0 -2
  558. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +0 -1
  559. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +2 -8
  560. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +0 -2
  561. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +0 -2
  562. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +2 -26
  563. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +7 -142
  564. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +0 -2
  565. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +0 -4
  566. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +0 -2
  567. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +0 -5
  568. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +0 -4
  569. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +0 -2
  570. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +0 -1
  571. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +0 -4
  572. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +0 -4
  573. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +0 -3
  574. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +0 -2
  575. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +3 -5
  576. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +8 -10
  577. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +0 -2
  578. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +0 -1
  579. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +0 -4
  580. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +0 -2
  581. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +1 -7
  582. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +2 -7
  583. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +0 -3
  584. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +0 -4
  585. cuda/cccl/headers/include/thrust/system/cuda/error.h +2 -11
  586. cuda/cccl/headers/include/thrust/system/cuda/memory.h +2 -6
  587. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +2 -9
  588. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +2 -7
  589. cuda/cccl/headers/include/thrust/system/cuda/vector.h +2 -6
  590. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +0 -2
  591. cuda/cccl/headers/include/thrust/system/detail/errno.h +0 -2
  592. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +0 -4
  593. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +0 -2
  594. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +0 -2
  595. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +0 -2
  596. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +0 -2
  597. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +0 -2
  598. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +0 -3
  599. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +0 -2
  600. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +0 -2
  601. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +0 -2
  602. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +0 -3
  603. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +0 -2
  604. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +0 -2
  605. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +0 -2
  606. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +0 -2
  607. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +0 -2
  608. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +0 -3
  609. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +0 -2
  610. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +0 -2
  611. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +0 -2
  612. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +0 -2
  613. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +0 -2
  614. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +0 -2
  615. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +0 -2
  616. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +0 -2
  617. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +0 -2
  618. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +0 -2
  619. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +0 -2
  620. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +0 -3
  621. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +0 -2
  622. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +0 -2
  623. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +0 -2
  624. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +0 -2
  625. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +0 -2
  626. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +0 -2
  627. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +0 -2
  628. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +0 -2
  629. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +0 -2
  630. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +0 -2
  631. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +0 -3
  632. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +0 -2
  633. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +0 -2
  634. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +0 -2
  635. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +0 -3
  636. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +0 -2
  637. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +0 -2
  638. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +0 -2
  639. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +0 -2
  640. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +26 -12
  641. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +0 -2
  642. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +0 -3
  643. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +0 -2
  644. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +0 -2
  645. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +0 -1
  646. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +0 -2
  647. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +0 -2
  648. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +0 -2
  649. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +0 -2
  650. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +0 -2
  651. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +0 -2
  652. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +0 -2
  653. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +0 -3
  654. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +0 -2
  655. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +0 -2
  656. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +0 -2
  657. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +0 -2
  658. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +0 -2
  659. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +0 -2
  660. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +0 -2
  661. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +0 -2
  662. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +0 -2
  663. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +0 -2
  664. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +0 -2
  665. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +2 -4
  666. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +0 -2
  667. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +0 -3
  668. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +0 -2
  669. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +0 -2
  670. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +0 -2
  671. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +0 -2
  672. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +0 -2
  673. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +0 -2
  674. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +0 -2
  675. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +0 -2
  676. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +76 -5
  677. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +0 -2
  678. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +0 -2
  679. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +0 -2
  680. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +0 -2
  681. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +0 -2
  682. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +0 -3
  683. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +0 -2
  684. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +0 -2
  685. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +0 -2
  686. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +0 -2
  687. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +78 -6
  688. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +0 -4
  689. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +0 -2
  690. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +0 -2
  691. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +0 -2
  692. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +0 -2
  693. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +0 -2
  694. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +0 -2
  695. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +67 -6
  696. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +310 -11
  697. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +78 -5
  698. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +543 -7
  699. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +0 -2
  700. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +0 -2
  701. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +0 -2
  702. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +0 -2
  703. cuda/cccl/headers/include/thrust/system/error_code.h +0 -4
  704. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +5 -25
  705. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +2 -15
  706. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +5 -25
  707. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +40 -29
  708. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +11 -28
  709. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +2 -15
  710. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +26 -28
  711. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +2 -15
  712. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +18 -13
  713. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +5 -25
  714. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +2 -15
  715. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +5 -25
  716. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +47 -30
  717. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +2 -15
  718. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +2 -15
  719. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +2 -15
  720. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +2 -15
  721. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +2 -15
  722. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +2 -15
  723. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +2 -15
  724. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +2 -15
  725. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +2 -15
  726. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +26 -31
  727. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +2 -15
  728. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +2 -26
  729. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +35 -27
  730. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +13 -28
  731. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +56 -28
  732. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +26 -31
  733. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +2 -15
  734. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +2 -15
  735. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +176 -17
  736. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +8 -15
  737. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +2 -15
  738. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +2 -15
  739. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +2 -15
  740. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +213 -28
  741. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +2 -15
  742. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +2 -15
  743. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +2 -15
  744. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +2 -15
  745. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +2 -15
  746. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +2 -15
  747. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +2 -15
  748. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +2 -15
  749. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +21 -30
  750. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +17 -29
  751. cuda/cccl/headers/include/thrust/system/omp/memory.h +51 -9
  752. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +3 -7
  753. cuda/cccl/headers/include/thrust/system/omp/pointer.h +3 -7
  754. cuda/cccl/headers/include/thrust/system/omp/vector.h +3 -6
  755. cuda/cccl/headers/include/thrust/system/system_error.h +0 -2
  756. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +4 -25
  757. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +2 -15
  758. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +2 -15
  759. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +38 -29
  760. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +91 -24
  761. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +2 -15
  762. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +2 -15
  763. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +17 -13
  764. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +4 -25
  765. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +2 -15
  766. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +4 -25
  767. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +47 -28
  768. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +2 -15
  769. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +2 -15
  770. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +2 -15
  771. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +2 -15
  772. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +2 -15
  773. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +2 -15
  774. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +2 -15
  775. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +254 -29
  776. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +2 -15
  777. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +25 -31
  778. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +2 -15
  779. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +95 -29
  780. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +345 -28
  781. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +4 -26
  782. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +32 -42
  783. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +2 -15
  784. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +2 -15
  785. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +265 -30
  786. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +7 -17
  787. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +2 -15
  788. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +2 -15
  789. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +2 -15
  790. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +244 -32
  791. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +2 -15
  792. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +2 -15
  793. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +2 -15
  794. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +2 -15
  795. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +2 -15
  796. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +2 -15
  797. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +2 -15
  798. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +2 -15
  799. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +23 -33
  800. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +16 -29
  801. cuda/cccl/headers/include/thrust/system/tbb/memory.h +52 -24
  802. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +4 -22
  803. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +4 -22
  804. cuda/cccl/headers/include/thrust/system/tbb/vector.h +4 -21
  805. cuda/cccl/headers/include/thrust/transform.h +14 -3
  806. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +0 -4
  807. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +0 -1
  808. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +0 -4
  809. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +0 -4
  810. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +0 -4
  811. cuda/cccl/headers/include/thrust/universal_allocator.h +8 -0
  812. cuda/cccl/headers/include/thrust/universal_vector.h +9 -0
  813. cuda/cccl/headers/include/thrust/zip_function.h +2 -28
  814. cuda/compute/__init__.py +4 -0
  815. cuda/compute/_bindings.pyi +26 -3
  816. cuda/compute/_bindings_impl.pyx +143 -1
  817. cuda/compute/algorithms/__init__.py +9 -5
  818. cuda/compute/algorithms/_sort/__init__.py +23 -0
  819. cuda/compute/algorithms/{_merge_sort.py → _sort/_merge_sort.py} +10 -10
  820. cuda/compute/algorithms/{_radix_sort.py → _sort/_radix_sort.py} +9 -58
  821. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  822. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  823. cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  824. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  825. cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  826. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  827. cuda_cccl-0.3.4.dist-info/METADATA +78 -0
  828. {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/RECORD +830 -867
  829. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +0 -652
  830. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +0 -1365
  831. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +0 -2144
  832. cuda/cccl/headers/include/thrust/detail/integer_math.h +0 -113
  833. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +0 -51
  834. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +0 -51
  835. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +0 -51
  836. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +0 -51
  837. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +0 -52
  838. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +0 -51
  839. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +0 -51
  840. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +0 -51
  841. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +0 -51
  842. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +0 -51
  843. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +0 -51
  844. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +0 -51
  845. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +0 -51
  846. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +0 -51
  847. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +0 -51
  848. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +0 -51
  849. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +0 -51
  850. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +0 -51
  851. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +0 -51
  852. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +0 -51
  853. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +0 -51
  854. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +0 -51
  855. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +0 -51
  856. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +0 -51
  857. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +0 -51
  858. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +0 -51
  859. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +0 -51
  860. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +0 -51
  861. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +0 -51
  862. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +0 -51
  863. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +0 -51
  864. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +0 -51
  865. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +0 -51
  866. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +0 -51
  867. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +0 -51
  868. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +0 -51
  869. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +0 -51
  870. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +0 -51
  871. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +0 -51
  872. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +0 -51
  873. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +0 -51
  874. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +0 -51
  875. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +0 -51
  876. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +0 -85
  877. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +0 -119
  878. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +0 -145
  879. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +0 -116
  880. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +0 -356
  881. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +0 -124
  882. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +0 -586
  883. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +0 -74
  884. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +0 -59
  885. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +0 -65
  886. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +0 -87
  887. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +0 -93
  888. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +0 -102
  889. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +0 -78
  890. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +0 -65
  891. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +0 -103
  892. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +0 -87
  893. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +0 -265
  894. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +0 -71
  895. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +0 -75
  896. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +0 -73
  897. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +0 -136
  898. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +0 -91
  899. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +0 -94
  900. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +0 -327
  901. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +0 -98
  902. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +0 -137
  903. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +0 -400
  904. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +0 -87
  905. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +0 -312
  906. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +0 -295
  907. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +0 -71
  908. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +0 -75
  909. cuda_cccl-0.3.2.dist-info/METADATA +0 -42
  910. {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/WHEEL +0 -0
  911. {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/licenses/LICENSE +0 -0
@@ -29,7 +29,6 @@
29
29
  #include <cuda/__barrier/barrier_arrive_tx.h>
30
30
  #include <cuda/__barrier/barrier_block_scope.h>
31
31
  #include <cuda/__barrier/barrier_expect_tx.h>
32
- #include <cuda/__barrier/barrier_native_handle.h>
33
32
  #include <cuda/__barrier/barrier_thread_scope.h>
34
33
  #include <cuda/__memcpy_async/memcpy_async.h>
35
34
  #include <cuda/__memcpy_async/memcpy_async_tx.h>
@@ -69,8 +68,9 @@ _CCCL_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL
69
68
  // device compilation begins.
70
69
  #ifdef __cccl_lib_experimental_ctk12_cp_async_exposure
71
70
 
71
+ //! Deprecated [Since 3.2]
72
72
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
73
- inline _CCCL_DEVICE void cp_async_bulk_global_to_shared(
73
+ CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk instead") inline _CCCL_DEVICE void cp_async_bulk_global_to_shared(
74
74
  void* __dest, const void* __src, ::cuda::std::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
75
75
  {
76
76
  _CCCL_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
@@ -88,8 +88,10 @@ inline _CCCL_DEVICE void cp_async_bulk_global_to_shared(
88
88
  ::cuda::device::barrier_native_handle(__bar));
89
89
  }
90
90
 
91
+ //! Deprecated [Since 3.2]
91
92
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
92
- inline _CCCL_DEVICE void cp_async_bulk_shared_to_global(void* __dest, const void* __src, ::cuda::std::uint32_t __size)
93
+ CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk instead") inline _CCCL_DEVICE void
94
+ cp_async_bulk_shared_to_global(void* __dest, const void* __src, ::cuda::std::uint32_t __size)
93
95
  {
94
96
  _CCCL_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
95
97
  _CCCL_ASSERT(::cuda::device::is_address_from(__dest, ::cuda::device::address_space::global),
@@ -100,8 +102,10 @@ inline _CCCL_DEVICE void cp_async_bulk_shared_to_global(void* __dest, const void
100
102
  ::cuda::ptx::cp_async_bulk(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __dest, __src, __size);
101
103
  }
102
104
 
105
+ //! Deprecated [Since 3.2]
103
106
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
104
- inline _CCCL_DEVICE void cp_async_bulk_tensor_1d_global_to_shared(
107
+ CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
108
+ cp_async_bulk_tensor_1d_global_to_shared(
105
109
  void* __dest, const CUtensorMap* __tensor_map, int __c0, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
106
110
  {
107
111
  const ::cuda::std::int32_t __coords[]{__c0};
@@ -115,8 +119,10 @@ inline _CCCL_DEVICE void cp_async_bulk_tensor_1d_global_to_shared(
115
119
  ::cuda::device::barrier_native_handle(__bar));
116
120
  }
117
121
 
122
+ //! Deprecated [Since 3.2]
118
123
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
119
- inline _CCCL_DEVICE void cp_async_bulk_tensor_2d_global_to_shared(
124
+ CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
125
+ cp_async_bulk_tensor_2d_global_to_shared(
120
126
  void* __dest, const CUtensorMap* __tensor_map, int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
121
127
  {
122
128
  const ::cuda::std::int32_t __coords[]{__c0, __c1};
@@ -130,8 +136,10 @@ inline _CCCL_DEVICE void cp_async_bulk_tensor_2d_global_to_shared(
130
136
  ::cuda::device::barrier_native_handle(__bar));
131
137
  }
132
138
 
139
+ //! Deprecated [Since 3.2]
133
140
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
134
- inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_global_to_shared(
141
+ CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
142
+ cp_async_bulk_tensor_3d_global_to_shared(
135
143
  void* __dest,
136
144
  const CUtensorMap* __tensor_map,
137
145
  int __c0,
@@ -150,8 +158,10 @@ inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_global_to_shared(
150
158
  ::cuda::device::barrier_native_handle(__bar));
151
159
  }
152
160
 
161
+ //! Deprecated [Since 3.2]
153
162
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
154
- inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_global_to_shared(
163
+ CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
164
+ cp_async_bulk_tensor_4d_global_to_shared(
155
165
  void* __dest,
156
166
  const CUtensorMap* __tensor_map,
157
167
  int __c0,
@@ -171,8 +181,10 @@ inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_global_to_shared(
171
181
  ::cuda::device::barrier_native_handle(__bar));
172
182
  }
173
183
 
184
+ //! Deprecated [Since 3.2]
174
185
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
175
- inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_global_to_shared(
186
+ CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
187
+ cp_async_bulk_tensor_5d_global_to_shared(
176
188
  void* __dest,
177
189
  const CUtensorMap* __tensor_map,
178
190
  int __c0,
@@ -193,8 +205,9 @@ inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_global_to_shared(
193
205
  ::cuda::device::barrier_native_handle(__bar));
194
206
  }
195
207
 
208
+ //! Deprecated [Since 3.2]
196
209
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
197
- inline _CCCL_DEVICE void
210
+ CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
198
211
  cp_async_bulk_tensor_1d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, const void* __src)
199
212
  {
200
213
  const ::cuda::std::int32_t __coords[]{__c0};
@@ -202,8 +215,9 @@ cp_async_bulk_tensor_1d_shared_to_global(const CUtensorMap* __tensor_map, int __
202
215
  ::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
203
216
  }
204
217
 
218
+ //! Deprecated [Since 3.2]
205
219
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
206
- inline _CCCL_DEVICE void
220
+ CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
207
221
  cp_async_bulk_tensor_2d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, int __c1, const void* __src)
208
222
  {
209
223
  const ::cuda::std::int32_t __coords[]{__c0, __c1};
@@ -211,8 +225,10 @@ cp_async_bulk_tensor_2d_shared_to_global(const CUtensorMap* __tensor_map, int __
211
225
  ::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
212
226
  }
213
227
 
228
+ //! Deprecated [Since 3.2]
214
229
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
215
- inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_shared_to_global(
230
+ CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
231
+ cp_async_bulk_tensor_3d_shared_to_global(
216
232
  const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, const void* __src)
217
233
  {
218
234
  const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2};
@@ -220,8 +236,10 @@ inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_shared_to_global(
220
236
  ::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
221
237
  }
222
238
 
239
+ //! Deprecated [Since 3.2]
223
240
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
224
- inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_shared_to_global(
241
+ CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
242
+ cp_async_bulk_tensor_4d_shared_to_global(
225
243
  const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, const void* __src)
226
244
  {
227
245
  const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2, __c3};
@@ -229,8 +247,10 @@ inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_shared_to_global(
229
247
  ::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
230
248
  }
231
249
 
250
+ //! Deprecated [Since 3.2]
232
251
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
233
- inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_shared_to_global(
252
+ CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
253
+ cp_async_bulk_tensor_5d_shared_to_global(
234
254
  const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void* __src)
235
255
  {
236
256
  const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};
@@ -238,21 +258,27 @@ inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_shared_to_global(
238
258
  ::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
239
259
  }
240
260
 
261
+ //! Deprecated [Since 3.2]
241
262
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
242
- inline _CCCL_DEVICE void fence_proxy_async_shared_cta()
263
+ CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::fence_proxy_async instead") inline _CCCL_DEVICE void
264
+ fence_proxy_async_shared_cta()
243
265
  {
244
266
  ::cuda::ptx::fence_proxy_async(::cuda::ptx::space_shared);
245
267
  }
246
268
 
269
+ //! Deprecated [Since 3.2]
247
270
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group
248
- inline _CCCL_DEVICE void cp_async_bulk_commit_group()
271
+ CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_commit_group instead") inline _CCCL_DEVICE void
272
+ cp_async_bulk_commit_group()
249
273
  {
250
274
  ::cuda::ptx::cp_async_bulk_commit_group();
251
275
  }
252
276
 
277
+ //! Deprecated [Since 3.2]
253
278
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
254
279
  template <int __n_prior>
255
- inline _CCCL_DEVICE void cp_async_bulk_wait_group_read()
280
+ CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_wait_group_read instead") inline _CCCL_DEVICE void
281
+ cp_async_bulk_wait_group_read()
256
282
  {
257
283
  static_assert(__n_prior <= 63, "cp_async_bulk_wait_group_read: waiting for more than 63 groups is not supported.");
258
284
  ::cuda::ptx::cp_async_bulk_wait_group_read(::cuda::ptx::n32_t<__n_prior>{});
@@ -30,6 +30,7 @@
30
30
  #include <cuda/__memory/is_aligned.h>
31
31
  #include <cuda/__memory/ptr_in_range.h>
32
32
  #include <cuda/__memory/ptr_rebind.h>
33
+ #include <cuda/__memory/ranges_overlap.h>
33
34
  #include <cuda/std/memory>
34
35
 
35
36
  #endif // _CUDA_MEMORY
@@ -26,10 +26,15 @@
26
26
  //!
27
27
  //!@endrst
28
28
 
29
+ #include <cuda/__memory_resource/any_resource.h>
30
+ #include <cuda/__memory_resource/device_memory_pool.h>
29
31
  #include <cuda/__memory_resource/get_memory_resource.h>
30
32
  #include <cuda/__memory_resource/get_property.h>
33
+ #include <cuda/__memory_resource/legacy_managed_memory_resource.h>
34
+ #include <cuda/__memory_resource/legacy_pinned_memory_resource.h>
35
+ #include <cuda/__memory_resource/managed_memory_pool.h>
36
+ #include <cuda/__memory_resource/pinned_memory_pool.h>
31
37
  #include <cuda/__memory_resource/properties.h>
32
38
  #include <cuda/__memory_resource/resource.h>
33
- #include <cuda/__memory_resource/resource_ref.h>
34
39
 
35
40
  #endif //_CCCL_BEGIN_NAMESPACE_CUDA
@@ -21,9 +21,11 @@
21
21
  #endif // no system header
22
22
 
23
23
  #include <cuda/__numeric/add_overflow.h>
24
+ #include <cuda/__numeric/div_overflow.h>
24
25
  #include <cuda/__numeric/narrow.h>
25
26
  #include <cuda/__numeric/overflow_cast.h>
26
27
  #include <cuda/__numeric/overflow_result.h>
28
+ #include <cuda/__numeric/sub_overflow.h>
27
29
  #include <cuda/std/numeric>
28
30
 
29
31
  #endif // _CUDA_NUMERIC
@@ -29,6 +29,7 @@
29
29
  #include <cuda/atomic>
30
30
  #include <cuda/barrier>
31
31
  #include <cuda/std/__algorithm/max.h>
32
+ #include <cuda/std/__bit/popcount.h>
32
33
  #include <cuda/std/__chrono/duration.h>
33
34
  #include <cuda/std/__chrono/time_point.h>
34
35
  #include <cuda/std/cstdint>
@@ -112,7 +113,7 @@ public:
112
113
  ::__match_any_sync(::__activemask(), reinterpret_cast<uintptr_t>(__shared_state_get_refcount()));
113
114
  const uint32_t __elected_id = ::__ffs(__match_mask) - 1;
114
115
  __elected = (::cuda::ptx::get_sreg_laneid() == __elected_id);
115
- __sub_count = ::__popc(__match_mask);
116
+ __sub_count = ::cuda::std::popcount(__match_mask);
116
117
  , __elected = true;
117
118
  __sub_count = 1;)
118
119
  bool __released = false;
@@ -301,7 +302,7 @@ make_pipeline(const _Group& __group, pipeline_shared_state<_Scope, _Stages_count
301
302
  ::__match_any_sync(::__activemask(), reinterpret_cast<uintptr_t>(&__shared_state->__refcount));
302
303
  const uint32_t __elected_id = ::__ffs(__match_mask) - 1;
303
304
  __elected = (::cuda::ptx::get_sreg_laneid() == __elected_id);
304
- __add_count = ::__popc(__match_mask);
305
+ __add_count = ::cuda::std::popcount(__match_mask);
305
306
  , __elected = true;
306
307
  __add_count = 1;)
307
308
  if (__elected)
@@ -114,6 +114,7 @@
114
114
  #include <cuda/__ptx/instructions/tensormap_cp_fenceproxy.h>
115
115
  #include <cuda/__ptx/instructions/tensormap_replace.h>
116
116
  #include <cuda/__ptx/instructions/trap.h>
117
+ #include <cuda/__ptx/pragmas/enable_smem_spilling.h>
117
118
 
118
119
  #include <cuda/std/__cccl/prologue.h>
119
120
 
@@ -35,14 +35,12 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD
35
35
 
36
36
  namespace __unique_copy_tags
37
37
  {
38
-
39
38
  struct __reread_from_input_tag
40
39
  {};
41
40
  struct __reread_from_output_tag
42
41
  {};
43
42
  struct __read_from_tmp_value_tag
44
43
  {};
45
-
46
44
  } // namespace __unique_copy_tags
47
45
 
48
46
  _CCCL_EXEC_CHECK_DISABLE
@@ -24,7 +24,7 @@
24
24
  #include <cuda/std/__atomic/api/common.h>
25
25
  #include <cuda/std/__atomic/order.h>
26
26
  #include <cuda/std/__atomic/scopes.h>
27
- #include <cuda/std/__atomic/types/reference.h>
27
+ #include <cuda/std/__atomic/types.h>
28
28
  #include <cuda/std/__atomic/wait/notify_wait.h>
29
29
  #include <cuda/std/__atomic/wait/polling.h>
30
30
  #include <cuda/std/__type_traits/conditional.h>
@@ -22,6 +22,7 @@
22
22
  #endif // no system header
23
23
 
24
24
  #include <cuda/std/__atomic/functions/cuda_ptx_generated.h>
25
+ #include <cuda/std/__functional/operations.h>
25
26
  #include <cuda/std/__type_traits/conditional.h>
26
27
  #include <cuda/std/__type_traits/enable_if.h>
27
28
  #include <cuda/std/__type_traits/is_scalar.h>
@@ -147,6 +148,45 @@ _CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Or
147
148
  }
148
149
  }
149
150
 
151
+ template <class _Type, template <class> class _Op>
152
+ struct __cuda_atomic_op_bind
153
+ {
154
+ _Type __val;
155
+
156
+ [[nodiscard]] _CCCL_DEVICE _Type operator()(_Type __old) const
157
+ {
158
+ return _Op<_Type>{}(__val, __old);
159
+ }
160
+ };
161
+
162
+ template <class _Type>
163
+ struct __cuda_atomic_op_store
164
+ {
165
+ // Just return first value
166
+ [[nodiscard]] _CCCL_DEVICE _Type operator()(_Type __val, _Type) const
167
+ {
168
+ return __val;
169
+ }
170
+ };
171
+
172
+ template <class _Type>
173
+ struct __cuda_atomic_op_fetch_min
174
+ {
175
+ [[nodiscard]] _CCCL_DEVICE _Type operator()(_Type __op, _Type __old) const
176
+ {
177
+ return __op < __old ? __op : __old;
178
+ }
179
+ };
180
+
181
+ template <class _Type>
182
+ struct __cuda_atomic_op_fetch_max
183
+ {
184
+ [[nodiscard]] _CCCL_DEVICE _Type operator()(_Type __op, _Type __old) const
185
+ {
186
+ return __old < __op ? __op : __old;
187
+ }
188
+ };
189
+
150
190
  // Optimized fetch_update CAS loop with op determined after first load reducing waste.
151
191
  template <class _Type,
152
192
  class _Fn,
@@ -179,9 +219,7 @@ __cuda_atomic_store(_Type* __ptr, _Type __val, _Order, _Operand, _Sco, __atomic_
179
219
  // Store requires cas on 8/16b types
180
220
  __cuda_atomic_fetch_update(
181
221
  __ptr,
182
- [__val](_Type __old) {
183
- return __val;
184
- },
222
+ __cuda_atomic_op_bind<_Type, ::cuda::std::__cuda_atomic_op_store>{__val},
185
223
  _Order{},
186
224
  __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
187
225
  _Sco{});
@@ -192,9 +230,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __d
192
230
  {
193
231
  __dst = __cuda_atomic_fetch_update(
194
232
  __ptr,
195
- [__op](_Type __old) {
196
- return __old + __op;
197
- },
233
+ __cuda_atomic_op_bind<_Type, ::cuda::std::plus>{__op},
198
234
  _Order{},
199
235
  __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
200
236
  _Sco{});
@@ -205,9 +241,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_and(_Type* __ptr, _Type& __d
205
241
  {
206
242
  __dst = __cuda_atomic_fetch_update(
207
243
  __ptr,
208
- [__op](_Type __old) {
209
- return __old & __op;
210
- },
244
+ __cuda_atomic_op_bind<_Type, ::cuda::std::bit_and>{__op},
211
245
  _Order{},
212
246
  __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
213
247
  _Sco{});
@@ -218,9 +252,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor(_Type* __ptr, _Type& __d
218
252
  {
219
253
  __dst = __cuda_atomic_fetch_update(
220
254
  __ptr,
221
- [__op](_Type __old) {
222
- return __old ^ __op;
223
- },
255
+ __cuda_atomic_op_bind<_Type, ::cuda::std::bit_xor>{__op},
224
256
  _Order{},
225
257
  __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
226
258
  _Sco{});
@@ -231,9 +263,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_or(_Type* __ptr, _Type& __ds
231
263
  {
232
264
  __dst = __cuda_atomic_fetch_update(
233
265
  __ptr,
234
- [__op](_Type __old) {
235
- return __old | __op;
236
- },
266
+ __cuda_atomic_op_bind<_Type, ::cuda::std::bit_or>{__op},
237
267
  _Order{},
238
268
  __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
239
269
  _Sco{});
@@ -244,21 +274,18 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_min(_Type* __ptr, _Type& __d
244
274
  {
245
275
  __dst = __cuda_atomic_fetch_update(
246
276
  __ptr,
247
- [__op](_Type __old) {
248
- return __op < __old ? __op : __old;
249
- },
277
+ __cuda_atomic_op_bind<_Type, __cuda_atomic_op_fetch_min>{__op},
250
278
  _Order{},
251
279
  __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
252
280
  _Sco{});
253
281
  }
282
+
254
283
  template <class _Type, class _Order, class _Operand, class _Sco, __cuda_atomic_enable_non_native_arithmetic<_Operand> = 0>
255
284
  static inline _CCCL_DEVICE void __cuda_atomic_fetch_max(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
256
285
  {
257
286
  __dst = __cuda_atomic_fetch_update(
258
287
  __ptr,
259
- [__op](_Type __old) {
260
- return __old < __op ? __op : __old;
261
- },
288
+ __cuda_atomic_op_bind<_Type, __cuda_atomic_op_fetch_max>{__op},
262
289
  _Order{},
263
290
  __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
264
291
  _Sco{});
@@ -269,16 +296,15 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange(_Type* __ptr, _Type& __ds
269
296
  {
270
297
  __dst = __cuda_atomic_fetch_update(
271
298
  __ptr,
272
- [__op](_Type __old) {
273
- return __op;
274
- },
299
+ __cuda_atomic_op_bind<_Type, ::cuda::std::__cuda_atomic_op_store>{__op},
275
300
  _Order{},
276
301
  __atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
277
302
  _Sco{});
278
303
  }
279
304
 
280
305
  template <typename _Tp, typename _Fn, typename _Sco>
281
- _CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp* __ptr, const _Fn& __op, int __memorder, _Sco)
306
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp
307
+ __atomic_fetch_update_cuda(_Tp* __ptr, const _Fn& __op, int __memorder, _Sco)
282
308
  {
283
309
  _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{});
284
310
  _Tp __desired = __op(__expected);
@@ -289,7 +315,8 @@ _CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp* __ptr, const _Fn& __op, int __m
289
315
  return __expected;
290
316
  }
291
317
  template <typename _Tp, typename _Fn, typename _Sco>
292
- _CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp volatile* __ptr, const _Fn& __op, int __memorder, _Sco)
318
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp
319
+ __atomic_fetch_update_cuda(_Tp volatile* __ptr, const _Fn& __op, int __memorder, _Sco)
293
320
  {
294
321
  _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{});
295
322
  _Tp __desired = __op(__expected);
@@ -301,14 +328,14 @@ _CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp volatile* __ptr, const _Fn& __op
301
328
  }
302
329
 
303
330
  template <typename _Tp, typename _Sco>
304
- _CCCL_DEVICE _Tp __atomic_load_n_cuda(const _Tp* __ptr, int __memorder, _Sco)
331
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_load_n_cuda(const _Tp* __ptr, int __memorder, _Sco)
305
332
  {
306
333
  _Tp __ret;
307
334
  __atomic_load_cuda(__ptr, __ret, __memorder, _Sco{});
308
335
  return __ret;
309
336
  }
310
337
  template <typename _Tp, typename _Sco>
311
- _CCCL_DEVICE _Tp __atomic_load_n_cuda(const _Tp volatile* __ptr, int __memorder, _Sco)
338
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_load_n_cuda(const _Tp volatile* __ptr, int __memorder, _Sco)
312
339
  {
313
340
  _Tp __ret;
314
341
  __atomic_load_cuda(__ptr, __ret, __memorder, _Sco{});
@@ -327,64 +354,94 @@ _CCCL_DEVICE void __atomic_store_n_cuda(_Tp volatile* __ptr, _Tp __val, int __me
327
354
  }
328
355
 
329
356
  template <typename _Tp, typename _Sco>
330
- _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp* __ptr, _Tp __val, int __memorder, _Sco)
357
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp* __ptr, _Tp __val, int __memorder, _Sco)
331
358
  {
332
359
  _Tp __ret;
333
360
  __atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{});
334
361
  return __ret;
335
362
  }
336
363
  template <typename _Tp, typename _Sco>
337
- _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco)
364
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp
365
+ __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco)
338
366
  {
339
367
  _Tp __ret;
340
368
  __atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{});
341
369
  return __ret;
342
370
  }
343
371
 
372
+ template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_arithmetic<_Tp> = 0>
373
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_fetch_add_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
374
+ {
375
+ return __atomic_fetch_update_cuda(__ptr, __cuda_atomic_op_bind<_Tp, ::cuda::std::plus>{__val}, __memorder, _Sco{});
376
+ }
377
+ template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_arithmetic<_Tp> = 0>
378
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp
379
+ __atomic_fetch_add_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
380
+ {
381
+ return __atomic_fetch_update_cuda(__ptr, __cuda_atomic_op_bind<_Tp, ::cuda::std::plus>{__val}, __memorder, _Sco{});
382
+ }
383
+
384
+ template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_bitwise<_Tp> = 0>
385
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_fetch_and_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
386
+ {
387
+ return __atomic_fetch_update_cuda(__ptr, __cuda_atomic_op_bind<_Tp, ::cuda::std::bit_and>{__val}, __memorder, _Sco{});
388
+ }
389
+ template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_bitwise<_Tp> = 0>
390
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp
391
+ __atomic_fetch_and_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
392
+ {
393
+ return __atomic_fetch_update_cuda(__ptr, __cuda_atomic_op_bind<_Tp, ::cuda::std::bit_and>{__val}, __memorder, _Sco{});
394
+ }
395
+
396
+ template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_bitwise<_Tp> = 0>
397
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_fetch_or_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
398
+ {
399
+ return __atomic_fetch_update_cuda(__ptr, __cuda_atomic_op_bind<_Tp, ::cuda::std::bit_or>{__val}, __memorder, _Sco{});
400
+ }
401
+ template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_bitwise<_Tp> = 0>
402
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_fetch_or_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
403
+ {
404
+ return __atomic_fetch_update_cuda(__ptr, __cuda_atomic_op_bind<_Tp, ::cuda::std::bit_or>{__val}, __memorder, _Sco{});
405
+ }
406
+
407
+ template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_bitwise<_Tp> = 0>
408
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_fetch_xor_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
409
+ {
410
+ return __atomic_fetch_update_cuda(__ptr, __cuda_atomic_op_bind<_Tp, ::cuda::std::bit_xor>{__val}, __memorder, _Sco{});
411
+ }
412
+ template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_bitwise<_Tp> = 0>
413
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp
414
+ __atomic_fetch_xor_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
415
+ {
416
+ return __atomic_fetch_update_cuda(__ptr, __cuda_atomic_op_bind<_Tp, ::cuda::std::bit_xor>{__val}, __memorder, _Sco{});
417
+ }
418
+
344
419
  template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_minmax<_Tp> = 0>
345
- _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
420
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
346
421
  {
347
422
  return __atomic_fetch_update_cuda(
348
- __ptr,
349
- [__val](_Tp __old) {
350
- return __val < __old ? __val : __old;
351
- },
352
- __memorder,
353
- _Sco{});
423
+ __ptr, __cuda_atomic_op_bind<_Tp, __cuda_atomic_op_fetch_min>{__val}, __memorder, _Sco{});
354
424
  }
355
425
  template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_minmax<_Tp> = 0>
356
- _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
426
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp
427
+ __atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
357
428
  {
358
429
  return __atomic_fetch_update_cuda(
359
- __ptr,
360
- [__val](_Tp __old) {
361
- return __val < __old ? __val : __old;
362
- },
363
- __memorder,
364
- _Sco{});
430
+ __ptr, __cuda_atomic_op_bind<_Tp, __cuda_atomic_op_fetch_min>{__val}, __memorder, _Sco{});
365
431
  }
366
432
 
367
433
  template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_minmax<_Tp> = 0>
368
- _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
434
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
369
435
  {
370
436
  return __atomic_fetch_update_cuda(
371
- __ptr,
372
- [__val](_Tp __old) {
373
- return __old < __val ? __val : __old;
374
- },
375
- __memorder,
376
- _Sco{});
437
+ __ptr, __cuda_atomic_op_bind<_Tp, __cuda_atomic_op_fetch_max>{__val}, __memorder, _Sco{});
377
438
  }
378
439
  template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_minmax<_Tp> = 0>
379
- _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
440
+ [[nodiscard]] static inline _CCCL_DEVICE _Tp
441
+ __atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
380
442
  {
381
443
  return __atomic_fetch_update_cuda(
382
- __ptr,
383
- [__val](_Tp __old) {
384
- return __old < __val ? __val : __old;
385
- },
386
- __memorder,
387
- _Sco{});
444
+ __ptr, __cuda_atomic_op_bind<_Tp, __cuda_atomic_op_fetch_max>{__val}, __memorder, _Sco{});
388
445
  }
389
446
 
390
447
  _CCCL_DEVICE static inline void __atomic_signal_fence_cuda(int)