cuda-cccl 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.4__cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (911)
  1. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +12 -38
  2. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +16 -40
  3. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -28
  4. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +24 -56
  5. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +12 -38
  6. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +31 -56
  7. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +31 -35
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +47 -48
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +39 -42
  10. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +33 -60
  11. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +18 -44
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +26 -55
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +22 -49
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +15 -41
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +9 -35
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +20 -49
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +14 -40
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +18 -40
  19. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +0 -2
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +20 -46
  21. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +3 -28
  22. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +7 -31
  23. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +10 -34
  24. cuda/cccl/headers/include/cub/block/block_exchange.cuh +120 -154
  25. cuda/cccl/headers/include/cub/block/block_histogram.cuh +28 -52
  26. cuda/cccl/headers/include/cub/block/block_load.cuh +124 -146
  27. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +0 -16
  28. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +58 -87
  29. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +81 -100
  30. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +92 -156
  31. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +8 -32
  32. cuda/cccl/headers/include/cub/block/block_reduce.cuh +21 -46
  33. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +51 -79
  34. cuda/cccl/headers/include/cub/block/block_scan.cuh +94 -401
  35. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +10 -34
  36. cuda/cccl/headers/include/cub/block/block_store.cuh +73 -97
  37. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +2 -29
  38. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +5 -29
  39. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +25 -49
  40. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +12 -34
  41. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +10 -34
  42. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +3 -27
  43. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +12 -36
  44. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +9 -33
  45. cuda/cccl/headers/include/cub/config.cuh +2 -26
  46. cuda/cccl/headers/include/cub/cub.cuh +3 -27
  47. cuda/cccl/headers/include/cub/detail/array_utils.cuh +2 -26
  48. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +2 -28
  49. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +3 -27
  50. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +0 -2
  51. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -3
  52. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +2 -28
  53. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +0 -2
  54. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +0 -2
  55. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +0 -2
  56. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +0 -2
  57. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +7 -12
  58. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +6 -33
  59. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +13 -36
  60. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +9 -38
  61. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +58 -32
  62. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +51 -51
  63. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +7 -31
  64. cuda/cccl/headers/include/cub/detail/rfa.cuh +2 -27
  65. cuda/cccl/headers/include/cub/detail/strong_load.cuh +3 -29
  66. cuda/cccl/headers/include/cub/detail/strong_store.cuh +3 -29
  67. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +2 -9
  68. cuda/cccl/headers/include/cub/detail/type_traits.cuh +0 -2
  69. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +6 -31
  70. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +2 -25
  71. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +2 -26
  72. cuda/cccl/headers/include/cub/device/device_for.cuh +3 -5
  73. cuda/cccl/headers/include/cub/device/device_histogram.cuh +3 -27
  74. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +2 -26
  75. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +2 -26
  76. cuda/cccl/headers/include/cub/device/device_partition.cuh +3 -27
  77. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3 -27
  78. cuda/cccl/headers/include/cub/device/device_reduce.cuh +10 -31
  79. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +3 -27
  80. cuda/cccl/headers/include/cub/device/device_scan.cuh +16 -34
  81. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +3 -27
  82. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +3 -27
  83. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2 -26
  84. cuda/cccl/headers/include/cub/device/device_select.cuh +3 -27
  85. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +2 -28
  86. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +2 -27
  87. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +0 -2
  88. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +3 -29
  89. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +14 -34
  90. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +5 -30
  91. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +4 -29
  92. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +5 -32
  93. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +3 -29
  94. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -29
  95. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +1 -2
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +47 -59
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +21 -30
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +2 -27
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +3 -27
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +3 -27
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +0 -2
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +51 -36
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +3 -28
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +0 -1
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +27 -55
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +4 -28
  107. cuda/cccl/headers/include/cub/device/dispatch/kernels/{for_each.cuh → kernel_for_each.cuh} +0 -2
  108. cuda/cccl/headers/include/cub/device/dispatch/kernels/{histogram.cuh → kernel_histogram.cuh} +149 -157
  109. cuda/cccl/headers/include/cub/device/dispatch/kernels/{merge_sort.cuh → kernel_merge_sort.cuh} +0 -2
  110. cuda/cccl/headers/include/cub/device/dispatch/kernels/{radix_sort.cuh → kernel_radix_sort.cuh} +0 -2
  111. cuda/cccl/headers/include/cub/device/dispatch/kernels/{reduce.cuh → kernel_reduce.cuh} +2 -28
  112. cuda/cccl/headers/include/cub/device/dispatch/kernels/{scan.cuh → kernel_scan.cuh} +2 -28
  113. cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_reduce.cuh → kernel_segmented_reduce.cuh} +3 -29
  114. cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_sort.cuh → kernel_segmented_sort.cuh} +0 -1
  115. cuda/cccl/headers/include/cub/device/dispatch/kernels/{three_way_partition.cuh → kernel_three_way_partition.cuh} +0 -1
  116. cuda/cccl/headers/include/cub/device/dispatch/kernels/{transform.cuh → kernel_transform.cuh} +11 -11
  117. cuda/cccl/headers/include/cub/device/dispatch/kernels/{unique_by_key.cuh → kernel_unique_by_key.cuh} +0 -1
  118. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -26
  119. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -26
  120. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -28
  121. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +6 -26
  122. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -26
  123. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +5 -31
  124. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +31 -33
  125. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +15 -40
  126. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -26
  127. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -28
  128. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +20 -44
  129. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -26
  130. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +20 -45
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +2 -27
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +11 -36
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +0 -1
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +2 -27
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +14 -40
  136. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +3 -27
  137. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +3 -27
  138. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +3 -27
  139. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +3 -27
  140. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +3 -27
  141. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +3 -27
  142. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +3 -27
  143. cuda/cccl/headers/include/cub/thread/thread_load.cuh +3 -28
  144. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +3 -27
  145. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +3 -26
  146. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +3 -29
  147. cuda/cccl/headers/include/cub/thread/thread_search.cuh +3 -27
  148. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +0 -2
  149. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +2 -26
  150. cuda/cccl/headers/include/cub/thread/thread_store.cuh +3 -27
  151. cuda/cccl/headers/include/cub/util_allocator.cuh +3 -27
  152. cuda/cccl/headers/include/cub/util_arch.cuh +3 -29
  153. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +2 -26
  154. cuda/cccl/headers/include/cub/util_debug.cuh +3 -27
  155. cuda/cccl/headers/include/cub/util_device.cuh +18 -59
  156. cuda/cccl/headers/include/cub/util_macro.cuh +4 -28
  157. cuda/cccl/headers/include/cub/util_math.cuh +2 -28
  158. cuda/cccl/headers/include/cub/util_namespace.cuh +3 -28
  159. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +3 -27
  160. cuda/cccl/headers/include/cub/util_ptx.cuh +6 -30
  161. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +3 -29
  162. cuda/cccl/headers/include/cub/util_type.cuh +5 -32
  163. cuda/cccl/headers/include/cub/util_vsmem.cuh +2 -28
  164. cuda/cccl/headers/include/cub/version.cuh +2 -26
  165. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +10 -35
  166. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +5 -30
  167. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +15 -39
  168. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +5 -35
  169. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +22 -46
  170. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +3 -27
  171. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +2 -26
  172. cuda/cccl/headers/include/cub/warp/warp_load.cuh +4 -27
  173. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +2 -26
  174. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +3 -22
  175. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +3 -27
  176. cuda/cccl/headers/include/cub/warp/warp_store.cuh +4 -27
  177. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +0 -2
  178. cuda/cccl/headers/include/cuda/__barrier/barrier.h +1 -1
  179. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +0 -1
  180. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +277 -235
  181. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +0 -1
  182. cuda/cccl/headers/include/cuda/__driver/driver_api.h +13 -0
  183. cuda/cccl/headers/include/cuda/__execution/determinism.h +0 -2
  184. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +0 -2
  185. cuda/cccl/headers/include/cuda/__functional/maximum.h +25 -7
  186. cuda/cccl/headers/include/cuda/__functional/minimum.h +25 -7
  187. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  188. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +0 -2
  189. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +13 -4
  190. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +4 -2
  191. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +0 -1
  192. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +28 -7
  193. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +1 -1
  194. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  195. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +2 -3
  196. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +1 -7
  197. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +0 -1
  198. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +1 -1
  199. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +126 -0
  200. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +898 -0
  201. cuda/cccl/headers/include/cuda/__memory_resource/device_memory_pool.h +149 -0
  202. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +3 -3
  203. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  204. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +139 -0
  205. cuda/cccl/headers/include/cuda/__memory_resource/managed_memory_pool.h +146 -0
  206. cuda/cccl/headers/include/cuda/__memory_resource/memory_resource_base.h +578 -0
  207. cuda/cccl/headers/include/cuda/__memory_resource/pinned_memory_pool.h +188 -0
  208. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +3 -3
  209. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +37 -3
  210. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +13 -3
  211. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  212. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +2 -2
  213. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +344 -0
  214. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +0 -6
  215. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +1 -1
  216. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  217. cuda/cccl/headers/include/cuda/{std/__cuda → __runtime}/api_wrapper.h +3 -3
  218. cuda/cccl/headers/include/cuda/__stream/get_stream.h +0 -1
  219. cuda/cccl/headers/include/cuda/{__fwd/barrier_native_handle.h → __stream/internal_streams.h} +17 -15
  220. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +2 -2
  221. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +1 -0
  222. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +1 -0
  223. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +2 -1
  224. cuda/cccl/headers/include/cuda/barrier +42 -16
  225. cuda/cccl/headers/include/cuda/memory +1 -0
  226. cuda/cccl/headers/include/cuda/memory_resource +6 -1
  227. cuda/cccl/headers/include/cuda/numeric +2 -0
  228. cuda/cccl/headers/include/cuda/pipeline +3 -2
  229. cuda/cccl/headers/include/cuda/ptx +1 -0
  230. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +0 -2
  231. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +1 -1
  232. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +115 -58
  233. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +844 -378
  234. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +12 -5
  235. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +31 -0
  236. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +10 -0
  237. cuda/cccl/headers/include/cuda/std/__atomic/types.h +2 -3
  238. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +37 -13
  239. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +0 -28
  240. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +7 -0
  241. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +10 -0
  242. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +2 -45
  243. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +0 -2
  244. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +8 -0
  245. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +0 -2
  246. cuda/cccl/headers/include/cuda/std/__chrono/day.h +0 -2
  247. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +13 -17
  248. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +0 -2
  249. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +0 -2
  250. cuda/cccl/headers/include/cuda/std/__chrono/month.h +0 -2
  251. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +0 -2
  252. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +0 -2
  253. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +5 -8
  254. cuda/cccl/headers/include/cuda/std/__chrono/year.h +0 -2
  255. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +4 -0
  256. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +2 -3
  257. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +4 -0
  258. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +4 -0
  259. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +2 -3
  260. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +2 -3
  261. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +2 -3
  262. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +2 -3
  263. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +2 -3
  264. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +2 -3
  265. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +2 -3
  266. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +2 -3
  267. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +2 -3
  268. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +2 -3
  269. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +2 -2
  270. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +4 -0
  271. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +2 -3
  272. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +2 -3
  273. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +4 -0
  274. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +2 -3
  275. cuda/cccl/headers/include/cuda/std/__complex/complex.h +0 -6
  276. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +2 -2
  277. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +27 -1
  278. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +2 -4
  279. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +15 -36
  280. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +93 -0
  281. cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/stdexcept → __exception/throw_error.h} +3 -3
  282. cuda/cccl/headers/include/cuda/std/__expected/expected.h +28 -43
  283. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +2 -10
  284. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +2 -2
  285. cuda/cccl/headers/include/cuda/std/__functional/bind.h +6 -6
  286. cuda/cccl/headers/include/cuda/std/__functional/function.h +2 -6
  287. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +5 -5
  288. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +5 -0
  289. cuda/cccl/headers/include/cuda/std/__fwd/array.h +2 -2
  290. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +12 -0
  291. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  292. cuda/cccl/headers/include/cuda/std/__fwd/get.h +21 -22
  293. cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/iosfwd → __fwd/ios.h} +5 -10
  294. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +19 -10
  295. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +2 -2
  296. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +5 -0
  297. cuda/cccl/headers/include/cuda/std/__fwd/span.h +2 -2
  298. cuda/cccl/headers/include/cuda/std/__fwd/string.h +7 -0
  299. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +18 -0
  300. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +3 -0
  301. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  302. cuda/cccl/headers/include/cuda/std/{__type_traits/is_reference_wrapper.h → __fwd/variant.h} +16 -15
  303. cuda/cccl/headers/include/cuda/std/__internal/features.h +14 -0
  304. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +1 -1
  305. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +1 -1
  306. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +58 -40
  307. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +1 -1
  308. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +1 -1
  309. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +0 -5
  310. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +4 -18
  311. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +1 -2
  312. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +0 -2
  313. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +0 -2
  314. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +0 -4
  315. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +0 -5
  316. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +3 -10
  317. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +4 -15
  318. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +4 -4
  319. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +4 -4
  320. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +2 -4
  321. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +3 -3
  322. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +1 -1
  323. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +1 -0
  324. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +6 -12
  325. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -5
  326. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +7 -2
  327. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +1 -0
  328. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +5 -0
  329. cuda/cccl/headers/include/cuda/std/__new/allocate.h +5 -0
  330. cuda/cccl/headers/include/cuda/{__barrier/barrier_native_handle.h → std/__new/device_new.h} +9 -24
  331. cuda/cccl/headers/include/cuda/std/__new_ +1 -0
  332. cuda/cccl/headers/include/cuda/std/__optional/optional.h +5 -4
  333. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +4 -4
  334. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +1 -1
  335. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  336. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  337. cuda/cccl/headers/include/cuda/std/__random_ +2 -0
  338. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +7 -19
  339. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -4
  340. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +5 -4
  341. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +1 -1
  342. cuda/cccl/headers/include/cuda/std/__string/string_view.h +5 -5
  343. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  344. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  345. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +0 -160
  346. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +123 -129
  347. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  348. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  349. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  350. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  351. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +7 -0
  352. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  353. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +1 -2
  354. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  355. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +1 -1
  356. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  357. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +0 -2
  358. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +4 -24
  359. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +0 -2
  360. cuda/cccl/headers/include/cuda/std/__utility/pair.h +20 -20
  361. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +0 -2
  362. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  363. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  364. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  365. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  366. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  367. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  368. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  369. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  370. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  371. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  372. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  373. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  374. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  375. cuda/cccl/headers/include/cuda/std/array +1 -1
  376. cuda/cccl/headers/include/cuda/std/atomic +1 -1
  377. cuda/cccl/headers/include/cuda/std/bitset +2 -10
  378. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +6 -6
  379. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1 -4
  380. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3 -6
  381. cuda/cccl/headers/include/cuda/std/functional +1 -1
  382. cuda/cccl/headers/include/cuda/std/initializer_list +8 -0
  383. cuda/cccl/headers/include/cuda/std/inplace_vector +6 -5
  384. cuda/cccl/headers/include/cuda/std/iterator +1 -1
  385. cuda/cccl/headers/include/cuda/std/numbers +0 -2
  386. cuda/cccl/headers/include/cuda/std/ratio +2 -2
  387. cuda/cccl/headers/include/cuda/std/span +2 -2
  388. cuda/cccl/headers/include/cuda/std/string_view +24 -42
  389. cuda/cccl/headers/include/cuda/std/tuple +18 -1
  390. cuda/cccl/headers/include/cuda/std/type_traits +0 -1
  391. cuda/cccl/headers/include/cuda/std/variant +8 -1
  392. cuda/cccl/headers/include/nv/target +2 -6
  393. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +15 -2
  394. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +0 -2
  395. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +0 -1
  396. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +0 -1
  397. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +0 -2
  398. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +0 -2
  399. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +0 -2
  400. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +0 -2
  401. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +0 -2
  402. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +0 -4
  403. cuda/cccl/headers/include/thrust/detail/binary_search.inl +14 -2
  404. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +2 -7
  405. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +2 -8
  406. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +2 -8
  407. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +2 -8
  408. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +2 -8
  409. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +2 -8
  410. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +2 -7
  411. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +2 -8
  412. cuda/cccl/headers/include/thrust/detail/complex/clog.h +2 -8
  413. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +2 -8
  414. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +2 -7
  415. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +2 -8
  416. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +2 -8
  417. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +2 -8
  418. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +2 -8
  419. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +2 -8
  420. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +2 -8
  421. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +2 -8
  422. cuda/cccl/headers/include/thrust/detail/config/device_system.h +2 -0
  423. cuda/cccl/headers/include/thrust/detail/config/host_system.h +2 -0
  424. cuda/cccl/headers/include/thrust/detail/config/namespace.h +0 -1
  425. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +0 -2
  426. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +0 -2
  427. cuda/cccl/headers/include/thrust/detail/copy.h +0 -2
  428. cuda/cccl/headers/include/thrust/detail/copy.inl +14 -4
  429. cuda/cccl/headers/include/thrust/detail/copy_if.inl +14 -2
  430. cuda/cccl/headers/include/thrust/detail/count.inl +14 -2
  431. cuda/cccl/headers/include/thrust/detail/equal.inl +14 -2
  432. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +4 -5
  433. cuda/cccl/headers/include/thrust/detail/extrema.inl +14 -2
  434. cuda/cccl/headers/include/thrust/detail/fill.inl +14 -2
  435. cuda/cccl/headers/include/thrust/detail/find.inl +14 -2
  436. cuda/cccl/headers/include/thrust/detail/for_each.inl +14 -2
  437. cuda/cccl/headers/include/thrust/detail/functional/actor.h +2 -5
  438. cuda/cccl/headers/include/thrust/detail/functional/operators.h +2 -5
  439. cuda/cccl/headers/include/thrust/detail/gather.inl +14 -2
  440. cuda/cccl/headers/include/thrust/detail/generate.inl +14 -2
  441. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +0 -2
  442. cuda/cccl/headers/include/thrust/detail/inner_product.inl +14 -2
  443. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -0
  444. cuda/cccl/headers/include/thrust/detail/logical.inl +14 -2
  445. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +13 -1
  446. cuda/cccl/headers/include/thrust/detail/merge.inl +14 -2
  447. cuda/cccl/headers/include/thrust/detail/mismatch.inl +14 -2
  448. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +0 -4
  449. cuda/cccl/headers/include/thrust/detail/partition.inl +14 -2
  450. cuda/cccl/headers/include/thrust/detail/random_bijection.h +0 -2
  451. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +0 -2
  452. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +0 -2
  453. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +0 -6
  454. cuda/cccl/headers/include/thrust/detail/reduce.inl +21 -3
  455. cuda/cccl/headers/include/thrust/detail/reference.h +27 -3
  456. cuda/cccl/headers/include/thrust/detail/remove.inl +14 -2
  457. cuda/cccl/headers/include/thrust/detail/replace.inl +14 -2
  458. cuda/cccl/headers/include/thrust/detail/reverse.inl +14 -2
  459. cuda/cccl/headers/include/thrust/detail/scan.inl +21 -3
  460. cuda/cccl/headers/include/thrust/detail/scatter.inl +14 -2
  461. cuda/cccl/headers/include/thrust/detail/sequence.inl +13 -1
  462. cuda/cccl/headers/include/thrust/detail/set_operations.inl +13 -1
  463. cuda/cccl/headers/include/thrust/detail/sort.inl +13 -1
  464. cuda/cccl/headers/include/thrust/detail/static_assert.h +0 -2
  465. cuda/cccl/headers/include/thrust/detail/static_map.h +0 -3
  466. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +13 -1
  467. cuda/cccl/headers/include/thrust/detail/tabulate.inl +14 -2
  468. cuda/cccl/headers/include/thrust/detail/temporary_array.h +0 -4
  469. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +0 -1
  470. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +14 -3
  471. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +13 -1
  472. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +13 -1
  473. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +0 -2
  474. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +0 -2
  475. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +2 -7
  476. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +0 -2
  477. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +0 -4
  478. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +0 -4
  479. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +14 -2
  480. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +14 -2
  481. cuda/cccl/headers/include/thrust/detail/unique.inl +21 -3
  482. cuda/cccl/headers/include/thrust/detail/vector_base.h +0 -2
  483. cuda/cccl/headers/include/thrust/detail/vector_base.inl +0 -2
  484. cuda/cccl/headers/include/thrust/execution_policy.h +10 -9
  485. cuda/cccl/headers/include/thrust/functional.h +0 -2
  486. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +9 -4
  487. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +8 -4
  488. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +0 -1
  489. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +0 -1
  490. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +0 -1
  491. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +0 -1
  492. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +0 -1
  493. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +2 -6
  494. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +0 -1
  495. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +0 -2
  496. cuda/cccl/headers/include/thrust/mr/allocator.h +0 -2
  497. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +9 -4
  498. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +10 -10
  499. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +0 -2
  500. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +0 -2
  501. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +0 -2
  502. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +8 -4
  503. cuda/cccl/headers/include/thrust/mr/memory_resource.h +0 -2
  504. cuda/cccl/headers/include/thrust/mr/new.h +0 -2
  505. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +0 -2
  506. cuda/cccl/headers/include/thrust/mr/pool.h +10 -10
  507. cuda/cccl/headers/include/thrust/mr/pool_options.h +4 -6
  508. cuda/cccl/headers/include/thrust/mr/sync_pool.h +0 -2
  509. cuda/cccl/headers/include/thrust/mr/tls_pool.h +0 -2
  510. cuda/cccl/headers/include/thrust/mr/validator.h +0 -2
  511. cuda/cccl/headers/include/thrust/per_device_resource.h +13 -1
  512. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +0 -2
  513. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +0 -2
  514. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +2 -9
  515. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +0 -2
  516. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +2 -9
  517. cuda/cccl/headers/include/thrust/random/detail/mod.h +2 -9
  518. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +0 -2
  519. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +2 -7
  520. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +2 -9
  521. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +0 -2
  522. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +0 -2
  523. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +0 -2
  524. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +0 -2
  525. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +0 -2
  526. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +0 -2
  527. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +0 -2
  528. cuda/cccl/headers/include/thrust/random/normal_distribution.h +0 -2
  529. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +0 -2
  530. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +0 -2
  531. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +0 -2
  532. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +0 -2
  533. cuda/cccl/headers/include/thrust/random.h +0 -2
  534. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +15 -11
  535. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +2 -7
  536. cuda/cccl/headers/include/thrust/system/cpp/memory.h +0 -1
  537. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +0 -2
  538. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +0 -2
  539. cuda/cccl/headers/include/thrust/system/cpp/vector.h +0 -1
  540. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +0 -4
  541. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +0 -1
  542. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +0 -4
  543. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +2 -9
  544. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +4 -32
  545. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +2 -9
  546. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +0 -2
  547. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +0 -2
  548. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +23 -2
  549. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +0 -2
  550. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +2 -11
  551. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +2 -0
  552. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +0 -4
  553. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +0 -1
  554. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +0 -5
  555. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +0 -1
  556. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +0 -2
  557. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +0 -2
  558. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +0 -1
  559. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +2 -8
  560. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +0 -2
  561. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +0 -2
  562. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +2 -26
  563. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +7 -142
  564. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +0 -2
  565. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +0 -4
  566. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +0 -2
  567. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +0 -5
  568. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +0 -4
  569. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +0 -2
  570. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +0 -1
  571. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +0 -4
  572. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +0 -4
  573. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +0 -3
  574. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +0 -2
  575. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +3 -5
  576. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +8 -10
  577. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +0 -2
  578. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +0 -1
  579. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +0 -4
  580. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +0 -2
  581. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +1 -7
  582. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +2 -7
  583. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +0 -3
  584. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +0 -4
  585. cuda/cccl/headers/include/thrust/system/cuda/error.h +2 -11
  586. cuda/cccl/headers/include/thrust/system/cuda/memory.h +2 -6
  587. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +2 -9
  588. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +2 -7
  589. cuda/cccl/headers/include/thrust/system/cuda/vector.h +2 -6
  590. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +0 -2
  591. cuda/cccl/headers/include/thrust/system/detail/errno.h +0 -2
  592. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +0 -4
  593. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +0 -2
  594. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +0 -2
  595. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +0 -2
  596. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +0 -2
  597. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +0 -2
  598. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +0 -3
  599. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +0 -2
  600. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +0 -2
  601. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +0 -2
  602. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +0 -3
  603. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +0 -2
  604. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +0 -2
  605. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +0 -2
  606. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +0 -2
  607. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +0 -2
  608. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +0 -3
  609. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +0 -2
  610. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +0 -2
  611. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +0 -2
  612. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +0 -2
  613. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +0 -2
  614. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +0 -2
  615. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +0 -2
  616. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +0 -2
  617. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +0 -2
  618. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +0 -2
  619. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +0 -2
  620. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +0 -3
  621. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +0 -2
  622. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +0 -2
  623. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +0 -2
  624. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +0 -2
  625. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +0 -2
  626. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +0 -2
  627. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +0 -2
  628. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +0 -2
  629. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +0 -2
  630. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +0 -2
  631. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +0 -3
  632. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +0 -2
  633. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +0 -2
  634. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +0 -2
  635. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +0 -3
  636. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +0 -2
  637. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +0 -2
  638. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +0 -2
  639. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +0 -2
  640. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +26 -12
  641. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +0 -2
  642. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +0 -3
  643. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +0 -2
  644. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +0 -2
  645. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +0 -1
  646. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +0 -2
  647. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +0 -2
  648. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +0 -2
  649. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +0 -2
  650. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +0 -2
  651. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +0 -2
  652. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +0 -2
  653. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +0 -3
  654. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +0 -2
  655. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +0 -2
  656. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +0 -2
  657. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +0 -2
  658. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +0 -2
  659. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +0 -2
  660. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +0 -2
  661. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +0 -2
  662. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +0 -2
  663. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +0 -2
  664. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +0 -2
  665. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +2 -4
  666. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +0 -2
  667. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +0 -3
  668. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +0 -2
  669. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +0 -2
  670. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +0 -2
  671. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +0 -2
  672. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +0 -2
  673. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +0 -2
  674. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +0 -2
  675. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +0 -2
  676. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +76 -5
  677. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +0 -2
  678. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +0 -2
  679. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +0 -2
  680. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +0 -2
  681. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +0 -2
  682. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +0 -3
  683. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +0 -2
  684. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +0 -2
  685. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +0 -2
  686. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +0 -2
  687. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +78 -6
  688. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +0 -4
  689. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +0 -2
  690. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +0 -2
  691. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +0 -2
  692. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +0 -2
  693. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +0 -2
  694. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +0 -2
  695. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +67 -6
  696. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +310 -11
  697. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +78 -5
  698. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +543 -7
  699. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +0 -2
  700. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +0 -2
  701. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +0 -2
  702. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +0 -2
  703. cuda/cccl/headers/include/thrust/system/error_code.h +0 -4
  704. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +5 -25
  705. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +2 -15
  706. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +5 -25
  707. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +40 -29
  708. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +11 -28
  709. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +2 -15
  710. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +26 -28
  711. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +2 -15
  712. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +18 -13
  713. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +5 -25
  714. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +2 -15
  715. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +5 -25
  716. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +47 -30
  717. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +2 -15
  718. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +2 -15
  719. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +2 -15
  720. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +2 -15
  721. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +2 -15
  722. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +2 -15
  723. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +2 -15
  724. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +2 -15
  725. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +2 -15
  726. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +26 -31
  727. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +2 -15
  728. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +2 -26
  729. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +35 -27
  730. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +13 -28
  731. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +56 -28
  732. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +26 -31
  733. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +2 -15
  734. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +2 -15
  735. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +176 -17
  736. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +8 -15
  737. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +2 -15
  738. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +2 -15
  739. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +2 -15
  740. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +213 -28
  741. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +2 -15
  742. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +2 -15
  743. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +2 -15
  744. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +2 -15
  745. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +2 -15
  746. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +2 -15
  747. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +2 -15
  748. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +2 -15
  749. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +21 -30
  750. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +17 -29
  751. cuda/cccl/headers/include/thrust/system/omp/memory.h +51 -9
  752. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +3 -7
  753. cuda/cccl/headers/include/thrust/system/omp/pointer.h +3 -7
  754. cuda/cccl/headers/include/thrust/system/omp/vector.h +3 -6
  755. cuda/cccl/headers/include/thrust/system/system_error.h +0 -2
  756. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +4 -25
  757. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +2 -15
  758. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +2 -15
  759. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +38 -29
  760. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +91 -24
  761. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +2 -15
  762. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +2 -15
  763. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +17 -13
  764. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +4 -25
  765. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +2 -15
  766. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +4 -25
  767. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +47 -28
  768. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +2 -15
  769. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +2 -15
  770. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +2 -15
  771. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +2 -15
  772. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +2 -15
  773. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +2 -15
  774. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +2 -15
  775. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +254 -29
  776. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +2 -15
  777. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +25 -31
  778. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +2 -15
  779. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +95 -29
  780. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +345 -28
  781. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +4 -26
  782. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +32 -42
  783. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +2 -15
  784. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +2 -15
  785. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +265 -30
  786. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +7 -17
  787. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +2 -15
  788. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +2 -15
  789. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +2 -15
  790. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +244 -32
  791. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +2 -15
  792. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +2 -15
  793. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +2 -15
  794. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +2 -15
  795. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +2 -15
  796. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +2 -15
  797. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +2 -15
  798. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +2 -15
  799. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +23 -33
  800. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +16 -29
  801. cuda/cccl/headers/include/thrust/system/tbb/memory.h +52 -24
  802. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +4 -22
  803. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +4 -22
  804. cuda/cccl/headers/include/thrust/system/tbb/vector.h +4 -21
  805. cuda/cccl/headers/include/thrust/transform.h +14 -3
  806. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +0 -4
  807. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +0 -1
  808. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +0 -4
  809. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +0 -4
  810. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +0 -4
  811. cuda/cccl/headers/include/thrust/universal_allocator.h +8 -0
  812. cuda/cccl/headers/include/thrust/universal_vector.h +9 -0
  813. cuda/cccl/headers/include/thrust/zip_function.h +2 -28
  814. cuda/compute/__init__.py +4 -0
  815. cuda/compute/_bindings.pyi +26 -3
  816. cuda/compute/_bindings_impl.pyx +143 -1
  817. cuda/compute/algorithms/__init__.py +9 -5
  818. cuda/compute/algorithms/_sort/__init__.py +23 -0
  819. cuda/compute/algorithms/{_merge_sort.py → _sort/_merge_sort.py} +10 -10
  820. cuda/compute/algorithms/{_radix_sort.py → _sort/_radix_sort.py} +9 -58
  821. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  822. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  823. cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  824. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  825. cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  826. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  827. cuda_cccl-0.3.4.dist-info/METADATA +78 -0
  828. {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/RECORD +830 -867
  829. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +0 -652
  830. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +0 -1365
  831. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +0 -2144
  832. cuda/cccl/headers/include/thrust/detail/integer_math.h +0 -113
  833. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +0 -51
  834. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +0 -51
  835. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +0 -51
  836. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +0 -51
  837. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +0 -52
  838. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +0 -51
  839. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +0 -51
  840. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +0 -51
  841. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +0 -51
  842. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +0 -51
  843. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +0 -51
  844. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +0 -51
  845. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +0 -51
  846. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +0 -51
  847. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +0 -51
  848. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +0 -51
  849. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +0 -51
  850. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +0 -51
  851. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +0 -51
  852. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +0 -51
  853. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +0 -51
  854. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +0 -51
  855. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +0 -51
  856. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +0 -51
  857. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +0 -51
  858. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +0 -51
  859. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +0 -51
  860. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +0 -51
  861. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +0 -51
  862. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +0 -51
  863. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +0 -51
  864. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +0 -51
  865. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +0 -51
  866. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +0 -51
  867. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +0 -51
  868. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +0 -51
  869. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +0 -51
  870. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +0 -51
  871. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +0 -51
  872. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +0 -51
  873. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +0 -51
  874. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +0 -51
  875. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +0 -51
  876. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +0 -85
  877. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +0 -119
  878. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +0 -145
  879. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +0 -116
  880. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +0 -356
  881. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +0 -124
  882. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +0 -586
  883. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +0 -74
  884. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +0 -59
  885. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +0 -65
  886. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +0 -87
  887. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +0 -93
  888. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +0 -102
  889. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +0 -78
  890. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +0 -65
  891. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +0 -103
  892. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +0 -87
  893. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +0 -265
  894. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +0 -71
  895. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +0 -75
  896. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +0 -73
  897. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +0 -136
  898. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +0 -91
  899. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +0 -94
  900. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +0 -327
  901. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +0 -98
  902. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +0 -137
  903. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +0 -400
  904. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +0 -87
  905. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +0 -312
  906. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +0 -295
  907. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +0 -71
  908. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +0 -75
  909. cuda_cccl-0.3.2.dist-info/METADATA +0 -42
  910. {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/WHEEL +0 -0
  911. {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/licenses/LICENSE +0 -0
@@ -44,6 +44,10 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD
44
44
 
45
45
  #if _CCCL_HAS_CUDA_COMPILER()
46
46
 
47
+ extern "C" _CCCL_DEVICE void __atomic_cas_128b_unsupported_before_SM_90();
48
+ extern "C" _CCCL_DEVICE void __atomic_exchange_128b_unsupported_before_SM_90();
49
+ extern "C" _CCCL_DEVICE void __atomic_ldst_128b_unsupported_before_SM_70();
50
+
47
51
  static inline _CCCL_DEVICE void __cuda_atomic_membar(__thread_scope_block_tag)
48
52
  { asm volatile("membar.cta;" ::: "memory"); }
49
53
  static inline _CCCL_DEVICE void __cuda_atomic_membar(__thread_scope_device_tag)
@@ -695,131 +699,222 @@ static inline _CCCL_DEVICE void __cuda_atomic_load(
695
699
  static inline _CCCL_DEVICE void __cuda_atomic_load(
696
700
  const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable)
697
701
  {
702
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
703
+ NV_DISPATCH_TARGET(
704
+ NV_PROVIDES_SM_70, (),
705
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
706
+ )
698
707
  asm volatile(R"YYY(
699
- .reg .b128 _d;
700
- ld.acquire.cta.b128 [%2],_d;
701
- mov.b128 _d, {%0, %1};
702
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
708
+ {
709
+ .reg .b128 _d;
710
+ ld.acquire.cta.b128 _d,[%2];
711
+ mov.b128 {%0, %1}, _d;
712
+ }
713
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
703
714
  }
704
715
  template <class _Type>
705
716
  static inline _CCCL_DEVICE void __cuda_atomic_load(
706
717
  const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable)
707
718
  {
719
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
720
+ NV_DISPATCH_TARGET(
721
+ NV_PROVIDES_SM_70, (),
722
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
723
+ )
708
724
  asm volatile(R"YYY(
709
- .reg .b128 _d;
710
- ld.acquire.cluster.b128 [%2],_d;
711
- mov.b128 _d, {%0, %1};
712
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
725
+ {
726
+ .reg .b128 _d;
727
+ ld.acquire.cluster.b128 _d,[%2];
728
+ mov.b128 {%0, %1}, _d;
729
+ }
730
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
713
731
  }
714
732
  template <class _Type>
715
733
  static inline _CCCL_DEVICE void __cuda_atomic_load(
716
734
  const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable)
717
735
  {
736
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
737
+ NV_DISPATCH_TARGET(
738
+ NV_PROVIDES_SM_70, (),
739
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
740
+ )
718
741
  asm volatile(R"YYY(
719
- .reg .b128 _d;
720
- ld.acquire.gpu.b128 [%2],_d;
721
- mov.b128 _d, {%0, %1};
722
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
742
+ {
743
+ .reg .b128 _d;
744
+ ld.acquire.gpu.b128 _d,[%2];
745
+ mov.b128 {%0, %1}, _d;
746
+ }
747
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
723
748
  }
724
749
  template <class _Type>
725
750
  static inline _CCCL_DEVICE void __cuda_atomic_load(
726
751
  const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable)
727
752
  {
753
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
754
+ NV_DISPATCH_TARGET(
755
+ NV_PROVIDES_SM_70, (),
756
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
757
+ )
728
758
  asm volatile(R"YYY(
729
- .reg .b128 _d;
730
- ld.acquire.sys.b128 [%2],_d;
731
- mov.b128 _d, {%0, %1};
732
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
759
+ {
760
+ .reg .b128 _d;
761
+ ld.acquire.sys.b128 _d,[%2];
762
+ mov.b128 {%0, %1}, _d;
763
+ }
764
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
733
765
  }
734
766
  template <class _Type>
735
767
  static inline _CCCL_DEVICE void __cuda_atomic_load(
736
768
  const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable)
737
769
  {
770
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
771
+ NV_DISPATCH_TARGET(
772
+ NV_PROVIDES_SM_70, (),
773
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
774
+ )
738
775
  asm volatile(R"YYY(
739
- .reg .b128 _d;
740
- ld.relaxed.cta.b128 [%2],_d;
741
- mov.b128 _d, {%0, %1};
742
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
776
+ {
777
+ .reg .b128 _d;
778
+ ld.relaxed.cta.b128 _d,[%2];
779
+ mov.b128 {%0, %1}, _d;
780
+ }
781
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
743
782
  }
744
783
  template <class _Type>
745
784
  static inline _CCCL_DEVICE void __cuda_atomic_load(
746
785
  const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable)
747
786
  {
787
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
788
+ NV_DISPATCH_TARGET(
789
+ NV_PROVIDES_SM_70, (),
790
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
791
+ )
748
792
  asm volatile(R"YYY(
749
- .reg .b128 _d;
750
- ld.relaxed.cluster.b128 [%2],_d;
751
- mov.b128 _d, {%0, %1};
752
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
793
+ {
794
+ .reg .b128 _d;
795
+ ld.relaxed.cluster.b128 _d,[%2];
796
+ mov.b128 {%0, %1}, _d;
797
+ }
798
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
753
799
  }
754
800
  template <class _Type>
755
801
  static inline _CCCL_DEVICE void __cuda_atomic_load(
756
802
  const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable)
757
803
  {
804
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
805
+ NV_DISPATCH_TARGET(
806
+ NV_PROVIDES_SM_70, (),
807
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
808
+ )
758
809
  asm volatile(R"YYY(
759
- .reg .b128 _d;
760
- ld.relaxed.gpu.b128 [%2],_d;
761
- mov.b128 _d, {%0, %1};
762
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
810
+ {
811
+ .reg .b128 _d;
812
+ ld.relaxed.gpu.b128 _d,[%2];
813
+ mov.b128 {%0, %1}, _d;
814
+ }
815
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
763
816
  }
764
817
  template <class _Type>
765
818
  static inline _CCCL_DEVICE void __cuda_atomic_load(
766
819
  const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable)
767
820
  {
821
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
822
+ NV_DISPATCH_TARGET(
823
+ NV_PROVIDES_SM_70, (),
824
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
825
+ )
768
826
  asm volatile(R"YYY(
769
- .reg .b128 _d;
770
- ld.relaxed.sys.b128 [%2],_d;
771
- mov.b128 _d, {%0, %1};
772
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
827
+ {
828
+ .reg .b128 _d;
829
+ ld.relaxed.sys.b128 _d,[%2];
830
+ mov.b128 {%0, %1}, _d;
831
+ }
832
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
773
833
  }
774
834
  template <class _Type>
775
835
  static inline _CCCL_DEVICE void __cuda_atomic_load(
776
836
  const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_enable)
777
837
  {
838
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
839
+ NV_DISPATCH_TARGET(
840
+ NV_PROVIDES_SM_70, (),
841
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
842
+ )
778
843
  asm volatile(R"YYY(
779
- .reg .b128 _d;
780
- ld.mmio.relaxed.sys.b128 [%2],_d;
781
- mov.b128 _d, {%0, %1};
782
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
844
+ {
845
+ .reg .b128 _d;
846
+ ld.mmio.relaxed.sys.b128 _d,[%2];
847
+ mov.b128 {%0, %1}, _d;
848
+ }
849
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
783
850
  }
784
851
  template <class _Type>
785
852
  static inline _CCCL_DEVICE void __cuda_atomic_load(
786
853
  const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable)
787
854
  {
855
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
856
+ NV_DISPATCH_TARGET(
857
+ NV_PROVIDES_SM_70, (),
858
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
859
+ )
788
860
  asm volatile(R"YYY(
789
- .reg .b128 _d;
790
- ld.volatile.b128 [%2],_d;
791
- mov.b128 _d, {%0, %1};
792
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
861
+ {
862
+ .reg .b128 _d;
863
+ ld.volatile.b128 _d,[%2];
864
+ mov.b128 {%0, %1}, _d;
865
+ }
866
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
793
867
  }
794
868
  template <class _Type>
795
869
  static inline _CCCL_DEVICE void __cuda_atomic_load(
796
870
  const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable)
797
871
  {
872
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
873
+ NV_DISPATCH_TARGET(
874
+ NV_PROVIDES_SM_70, (),
875
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
876
+ )
798
877
  asm volatile(R"YYY(
799
- .reg .b128 _d;
800
- ld.volatile.b128 [%2],_d;
801
- mov.b128 _d, {%0, %1};
802
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
878
+ {
879
+ .reg .b128 _d;
880
+ ld.volatile.b128 _d,[%2];
881
+ mov.b128 {%0, %1}, _d;
882
+ }
883
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
803
884
  }
804
885
  template <class _Type>
805
886
  static inline _CCCL_DEVICE void __cuda_atomic_load(
806
887
  const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable)
807
888
  {
889
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
890
+ NV_DISPATCH_TARGET(
891
+ NV_PROVIDES_SM_70, (),
892
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
893
+ )
808
894
  asm volatile(R"YYY(
809
- .reg .b128 _d;
810
- ld.volatile.b128 [%2],_d;
811
- mov.b128 _d, {%0, %1};
812
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
895
+ {
896
+ .reg .b128 _d;
897
+ ld.volatile.b128 _d,[%2];
898
+ mov.b128 {%0, %1}, _d;
899
+ }
900
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
813
901
  }
814
902
  template <class _Type>
815
903
  static inline _CCCL_DEVICE void __cuda_atomic_load(
816
904
  const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable)
817
905
  {
906
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
907
+ NV_DISPATCH_TARGET(
908
+ NV_PROVIDES_SM_70, (),
909
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
910
+ )
818
911
  asm volatile(R"YYY(
819
- .reg .b128 _d;
820
- ld.volatile.b128 [%2],_d;
821
- mov.b128 _d, {%0, %1};
822
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
912
+ {
913
+ .reg .b128 _d;
914
+ ld.volatile.b128 _d,[%2];
915
+ mov.b128 {%0, %1}, _d;
916
+ }
917
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
823
918
  }
824
919
 
825
920
  template <typename _Type, typename _Tag, typename _Sco, typename _Mmio>
@@ -1037,131 +1132,222 @@ template <class _Type>
1037
1132
  static inline _CCCL_DEVICE void __cuda_atomic_store(
1038
1133
  _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable)
1039
1134
  {
1135
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
1136
+ NV_DISPATCH_TARGET(
1137
+ NV_PROVIDES_SM_70, (),
1138
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
1139
+ )
1040
1140
  asm volatile(R"YYY(
1041
- .reg .b128 _v;
1042
- mov.b128 {%1, %2}, _v;
1043
- st.release.cta.b128 [%0],_v;
1044
- )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1141
+ {
1142
+ .reg .b128 _v;
1143
+ mov.b128 _v, {%1, %2};
1144
+ st.release.cta.b128 [%0],_v;
1145
+ }
1146
+ )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1045
1147
  }
1046
1148
  template <class _Type>
1047
1149
  static inline _CCCL_DEVICE void __cuda_atomic_store(
1048
1150
  _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable)
1049
1151
  {
1152
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
1153
+ NV_DISPATCH_TARGET(
1154
+ NV_PROVIDES_SM_70, (),
1155
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
1156
+ )
1050
1157
  asm volatile(R"YYY(
1051
- .reg .b128 _v;
1052
- mov.b128 {%1, %2}, _v;
1053
- st.release.cluster.b128 [%0],_v;
1054
- )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1158
+ {
1159
+ .reg .b128 _v;
1160
+ mov.b128 _v, {%1, %2};
1161
+ st.release.cluster.b128 [%0],_v;
1162
+ }
1163
+ )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1055
1164
  }
1056
1165
  template <class _Type>
1057
1166
  static inline _CCCL_DEVICE void __cuda_atomic_store(
1058
1167
  _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable)
1059
1168
  {
1169
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
1170
+ NV_DISPATCH_TARGET(
1171
+ NV_PROVIDES_SM_70, (),
1172
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
1173
+ )
1060
1174
  asm volatile(R"YYY(
1061
- .reg .b128 _v;
1062
- mov.b128 {%1, %2}, _v;
1063
- st.release.gpu.b128 [%0],_v;
1064
- )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1175
+ {
1176
+ .reg .b128 _v;
1177
+ mov.b128 _v, {%1, %2};
1178
+ st.release.gpu.b128 [%0],_v;
1179
+ }
1180
+ )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1065
1181
  }
1066
1182
  template <class _Type>
1067
1183
  static inline _CCCL_DEVICE void __cuda_atomic_store(
1068
1184
  _Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable)
1069
1185
  {
1186
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
1187
+ NV_DISPATCH_TARGET(
1188
+ NV_PROVIDES_SM_70, (),
1189
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
1190
+ )
1070
1191
  asm volatile(R"YYY(
1071
- .reg .b128 _v;
1072
- mov.b128 {%1, %2}, _v;
1073
- st.release.sys.b128 [%0],_v;
1074
- )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1192
+ {
1193
+ .reg .b128 _v;
1194
+ mov.b128 _v, {%1, %2};
1195
+ st.release.sys.b128 [%0],_v;
1196
+ }
1197
+ )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1075
1198
  }
1076
1199
  template <class _Type>
1077
1200
  static inline _CCCL_DEVICE void __cuda_atomic_store(
1078
1201
  _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable)
1079
1202
  {
1203
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
1204
+ NV_DISPATCH_TARGET(
1205
+ NV_PROVIDES_SM_70, (),
1206
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
1207
+ )
1080
1208
  asm volatile(R"YYY(
1081
- .reg .b128 _v;
1082
- mov.b128 {%1, %2}, _v;
1083
- st.relaxed.cta.b128 [%0],_v;
1084
- )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1209
+ {
1210
+ .reg .b128 _v;
1211
+ mov.b128 _v, {%1, %2};
1212
+ st.relaxed.cta.b128 [%0],_v;
1213
+ }
1214
+ )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1085
1215
  }
1086
1216
  template <class _Type>
1087
1217
  static inline _CCCL_DEVICE void __cuda_atomic_store(
1088
1218
  _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable)
1089
1219
  {
1220
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
1221
+ NV_DISPATCH_TARGET(
1222
+ NV_PROVIDES_SM_70, (),
1223
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
1224
+ )
1090
1225
  asm volatile(R"YYY(
1091
- .reg .b128 _v;
1092
- mov.b128 {%1, %2}, _v;
1093
- st.relaxed.cluster.b128 [%0],_v;
1094
- )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1226
+ {
1227
+ .reg .b128 _v;
1228
+ mov.b128 _v, {%1, %2};
1229
+ st.relaxed.cluster.b128 [%0],_v;
1230
+ }
1231
+ )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1095
1232
  }
1096
1233
  template <class _Type>
1097
1234
  static inline _CCCL_DEVICE void __cuda_atomic_store(
1098
1235
  _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable)
1099
1236
  {
1237
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
1238
+ NV_DISPATCH_TARGET(
1239
+ NV_PROVIDES_SM_70, (),
1240
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
1241
+ )
1100
1242
  asm volatile(R"YYY(
1101
- .reg .b128 _v;
1102
- mov.b128 {%1, %2}, _v;
1103
- st.relaxed.gpu.b128 [%0],_v;
1104
- )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1243
+ {
1244
+ .reg .b128 _v;
1245
+ mov.b128 _v, {%1, %2};
1246
+ st.relaxed.gpu.b128 [%0],_v;
1247
+ }
1248
+ )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1105
1249
  }
1106
1250
  template <class _Type>
1107
1251
  static inline _CCCL_DEVICE void __cuda_atomic_store(
1108
1252
  _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable)
1109
1253
  {
1254
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
1255
+ NV_DISPATCH_TARGET(
1256
+ NV_PROVIDES_SM_70, (),
1257
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
1258
+ )
1110
1259
  asm volatile(R"YYY(
1111
- .reg .b128 _v;
1112
- mov.b128 {%1, %2}, _v;
1113
- st.relaxed.sys.b128 [%0],_v;
1114
- )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1260
+ {
1261
+ .reg .b128 _v;
1262
+ mov.b128 _v, {%1, %2};
1263
+ st.relaxed.sys.b128 [%0],_v;
1264
+ }
1265
+ )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1115
1266
  }
1116
1267
  template <class _Type>
1117
1268
  static inline _CCCL_DEVICE void __cuda_atomic_store(
1118
1269
  _Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_enable)
1119
1270
  {
1271
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
1272
+ NV_DISPATCH_TARGET(
1273
+ NV_PROVIDES_SM_70, (),
1274
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
1275
+ )
1120
1276
  asm volatile(R"YYY(
1121
- .reg .b128 _v;
1122
- mov.b128 {%1, %2}, _v;
1123
- st.mmio.relaxed.sys.b128 [%0],_v;
1124
- )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1277
+ {
1278
+ .reg .b128 _v;
1279
+ mov.b128 _v, {%1, %2};
1280
+ st.mmio.relaxed.sys.b128 [%0],_v;
1281
+ }
1282
+ )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1125
1283
  }
1126
1284
  template <class _Type>
1127
1285
  static inline _CCCL_DEVICE void __cuda_atomic_store(
1128
1286
  _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable)
1129
1287
  {
1288
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
1289
+ NV_DISPATCH_TARGET(
1290
+ NV_PROVIDES_SM_70, (),
1291
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
1292
+ )
1130
1293
  asm volatile(R"YYY(
1131
- .reg .b128 _v;
1132
- mov.b128 {%1, %2}, _v;
1133
- st.volatile.b128 [%0],_v;
1134
- )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1294
+ {
1295
+ .reg .b128 _v;
1296
+ mov.b128 _v, {%1, %2};
1297
+ st.volatile.b128 [%0],_v;
1298
+ }
1299
+ )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1135
1300
  }
1136
1301
  template <class _Type>
1137
1302
  static inline _CCCL_DEVICE void __cuda_atomic_store(
1138
1303
  _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable)
1139
1304
  {
1305
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
1306
+ NV_DISPATCH_TARGET(
1307
+ NV_PROVIDES_SM_70, (),
1308
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
1309
+ )
1140
1310
  asm volatile(R"YYY(
1141
- .reg .b128 _v;
1142
- mov.b128 {%1, %2}, _v;
1143
- st.volatile.b128 [%0],_v;
1144
- )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1311
+ {
1312
+ .reg .b128 _v;
1313
+ mov.b128 _v, {%1, %2};
1314
+ st.volatile.b128 [%0],_v;
1315
+ }
1316
+ )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1145
1317
  }
1146
1318
  template <class _Type>
1147
1319
  static inline _CCCL_DEVICE void __cuda_atomic_store(
1148
1320
  _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable)
1149
1321
  {
1322
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
1323
+ NV_DISPATCH_TARGET(
1324
+ NV_PROVIDES_SM_70, (),
1325
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
1326
+ )
1150
1327
  asm volatile(R"YYY(
1151
- .reg .b128 _v;
1152
- mov.b128 {%1, %2}, _v;
1153
- st.volatile.b128 [%0],_v;
1154
- )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1328
+ {
1329
+ .reg .b128 _v;
1330
+ mov.b128 _v, {%1, %2};
1331
+ st.volatile.b128 [%0],_v;
1332
+ }
1333
+ )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1155
1334
  }
1156
1335
  template <class _Type>
1157
1336
  static inline _CCCL_DEVICE void __cuda_atomic_store(
1158
1337
  _Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable)
1159
1338
  {
1339
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
1340
+ NV_DISPATCH_TARGET(
1341
+ NV_PROVIDES_SM_70, (),
1342
+ NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
1343
+ )
1160
1344
  asm volatile(R"YYY(
1161
- .reg .b128 _v;
1162
- mov.b128 {%1, %2}, _v;
1163
- st.volatile.b128 [%0],_v;
1164
- )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1345
+ {
1346
+ .reg .b128 _v;
1347
+ mov.b128 _v, {%1, %2};
1348
+ st.volatile.b128 [%0],_v;
1349
+ }
1350
+ )YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
1165
1351
  }
1166
1352
 
1167
1353
  template <typename _Type, typename _Tag, typename _Sco, typename _Mmio>
@@ -1391,242 +1577,382 @@ template <class _Type>
1391
1577
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1392
1578
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_block_tag)
1393
1579
  {
1580
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1581
+ NV_DISPATCH_TARGET(
1582
+ NV_PROVIDES_SM_90, (),
1583
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1584
+ )
1394
1585
  asm volatile(R"YYY(
1395
- .reg .b128 _d;
1396
- .reg .b128 _v;
1397
- mov.b128 {%0, %1}, _d;
1398
- mov.b128 {%4, %5}, _v;
1399
- atom.cas.acquire.cta.b128 _d,[%2],_d,_v;
1400
- mov.b128 _d, {%0, %1};
1401
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1586
+ {
1587
+ .reg .b128 _d;
1588
+ .reg .b128 _v;
1589
+ mov.b128 _d, {%0, %1};
1590
+ mov.b128 _v, {%4, %5};
1591
+ atom.cas.acquire.cta.b128 _d,[%2],_d,_v;
1592
+ mov.b128 {%0, %1}, _d;
1593
+ }
1594
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1402
1595
  template <class _Type>
1403
1596
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1404
1597
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
1405
1598
  {
1599
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1600
+ NV_DISPATCH_TARGET(
1601
+ NV_PROVIDES_SM_90, (),
1602
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1603
+ )
1406
1604
  asm volatile(R"YYY(
1407
- .reg .b128 _d;
1408
- .reg .b128 _v;
1409
- mov.b128 {%0, %1}, _d;
1410
- mov.b128 {%4, %5}, _v;
1411
- atom.cas.acquire.cluster.b128 _d,[%2],_d,_v;
1412
- mov.b128 _d, {%0, %1};
1413
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1605
+ {
1606
+ .reg .b128 _d;
1607
+ .reg .b128 _v;
1608
+ mov.b128 _d, {%0, %1};
1609
+ mov.b128 _v, {%4, %5};
1610
+ atom.cas.acquire.cluster.b128 _d,[%2],_d,_v;
1611
+ mov.b128 {%0, %1}, _d;
1612
+ }
1613
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1414
1614
  template <class _Type>
1415
1615
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1416
1616
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_device_tag)
1417
1617
  {
1618
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1619
+ NV_DISPATCH_TARGET(
1620
+ NV_PROVIDES_SM_90, (),
1621
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1622
+ )
1418
1623
  asm volatile(R"YYY(
1419
- .reg .b128 _d;
1420
- .reg .b128 _v;
1421
- mov.b128 {%0, %1}, _d;
1422
- mov.b128 {%4, %5}, _v;
1423
- atom.cas.acquire.gpu.b128 _d,[%2],_d,_v;
1424
- mov.b128 _d, {%0, %1};
1425
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1624
+ {
1625
+ .reg .b128 _d;
1626
+ .reg .b128 _v;
1627
+ mov.b128 _d, {%0, %1};
1628
+ mov.b128 _v, {%4, %5};
1629
+ atom.cas.acquire.gpu.b128 _d,[%2],_d,_v;
1630
+ mov.b128 {%0, %1}, _d;
1631
+ }
1632
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1426
1633
  template <class _Type>
1427
1634
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1428
1635
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_system_tag)
1429
1636
  {
1637
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1638
+ NV_DISPATCH_TARGET(
1639
+ NV_PROVIDES_SM_90, (),
1640
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1641
+ )
1430
1642
  asm volatile(R"YYY(
1431
- .reg .b128 _d;
1432
- .reg .b128 _v;
1433
- mov.b128 {%0, %1}, _d;
1434
- mov.b128 {%4, %5}, _v;
1435
- atom.cas.acquire.sys.b128 _d,[%2],_d,_v;
1436
- mov.b128 _d, {%0, %1};
1437
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1643
+ {
1644
+ .reg .b128 _d;
1645
+ .reg .b128 _v;
1646
+ mov.b128 _d, {%0, %1};
1647
+ mov.b128 _v, {%4, %5};
1648
+ atom.cas.acquire.sys.b128 _d,[%2],_d,_v;
1649
+ mov.b128 {%0, %1}, _d;
1650
+ }
1651
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1438
1652
  template <class _Type>
1439
1653
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1440
1654
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_block_tag)
1441
1655
  {
1656
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1657
+ NV_DISPATCH_TARGET(
1658
+ NV_PROVIDES_SM_90, (),
1659
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1660
+ )
1442
1661
  asm volatile(R"YYY(
1443
- .reg .b128 _d;
1444
- .reg .b128 _v;
1445
- mov.b128 {%0, %1}, _d;
1446
- mov.b128 {%4, %5}, _v;
1447
- atom.cas.relaxed.cta.b128 _d,[%2],_d,_v;
1448
- mov.b128 _d, {%0, %1};
1449
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1662
+ {
1663
+ .reg .b128 _d;
1664
+ .reg .b128 _v;
1665
+ mov.b128 _d, {%0, %1};
1666
+ mov.b128 _v, {%4, %5};
1667
+ atom.cas.relaxed.cta.b128 _d,[%2],_d,_v;
1668
+ mov.b128 {%0, %1}, _d;
1669
+ }
1670
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1450
1671
  template <class _Type>
1451
1672
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1452
1673
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
1453
1674
  {
1675
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1676
+ NV_DISPATCH_TARGET(
1677
+ NV_PROVIDES_SM_90, (),
1678
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1679
+ )
1454
1680
  asm volatile(R"YYY(
1455
- .reg .b128 _d;
1456
- .reg .b128 _v;
1457
- mov.b128 {%0, %1}, _d;
1458
- mov.b128 {%4, %5}, _v;
1459
- atom.cas.relaxed.cluster.b128 _d,[%2],_d,_v;
1460
- mov.b128 _d, {%0, %1};
1461
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1681
+ {
1682
+ .reg .b128 _d;
1683
+ .reg .b128 _v;
1684
+ mov.b128 _d, {%0, %1};
1685
+ mov.b128 _v, {%4, %5};
1686
+ atom.cas.relaxed.cluster.b128 _d,[%2],_d,_v;
1687
+ mov.b128 {%0, %1}, _d;
1688
+ }
1689
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1462
1690
  template <class _Type>
1463
1691
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1464
1692
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_device_tag)
1465
1693
  {
1694
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1695
+ NV_DISPATCH_TARGET(
1696
+ NV_PROVIDES_SM_90, (),
1697
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1698
+ )
1466
1699
  asm volatile(R"YYY(
1467
- .reg .b128 _d;
1468
- .reg .b128 _v;
1469
- mov.b128 {%0, %1}, _d;
1470
- mov.b128 {%4, %5}, _v;
1471
- atom.cas.relaxed.gpu.b128 _d,[%2],_d,_v;
1472
- mov.b128 _d, {%0, %1};
1473
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1700
+ {
1701
+ .reg .b128 _d;
1702
+ .reg .b128 _v;
1703
+ mov.b128 _d, {%0, %1};
1704
+ mov.b128 _v, {%4, %5};
1705
+ atom.cas.relaxed.gpu.b128 _d,[%2],_d,_v;
1706
+ mov.b128 {%0, %1}, _d;
1707
+ }
1708
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1474
1709
  template <class _Type>
1475
1710
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1476
1711
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag)
1477
1712
  {
1713
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1714
+ NV_DISPATCH_TARGET(
1715
+ NV_PROVIDES_SM_90, (),
1716
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1717
+ )
1478
1718
  asm volatile(R"YYY(
1479
- .reg .b128 _d;
1480
- .reg .b128 _v;
1481
- mov.b128 {%0, %1}, _d;
1482
- mov.b128 {%4, %5}, _v;
1483
- atom.cas.relaxed.sys.b128 _d,[%2],_d,_v;
1484
- mov.b128 _d, {%0, %1};
1485
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1719
+ {
1720
+ .reg .b128 _d;
1721
+ .reg .b128 _v;
1722
+ mov.b128 _d, {%0, %1};
1723
+ mov.b128 _v, {%4, %5};
1724
+ atom.cas.relaxed.sys.b128 _d,[%2],_d,_v;
1725
+ mov.b128 {%0, %1}, _d;
1726
+ }
1727
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1486
1728
  template <class _Type>
1487
1729
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1488
1730
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_block_tag)
1489
1731
  {
1732
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1733
+ NV_DISPATCH_TARGET(
1734
+ NV_PROVIDES_SM_90, (),
1735
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1736
+ )
1490
1737
  asm volatile(R"YYY(
1491
- .reg .b128 _d;
1492
- .reg .b128 _v;
1493
- mov.b128 {%0, %1}, _d;
1494
- mov.b128 {%4, %5}, _v;
1495
- atom.cas.release.cta.b128 _d,[%2],_d,_v;
1496
- mov.b128 _d, {%0, %1};
1497
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1738
+ {
1739
+ .reg .b128 _d;
1740
+ .reg .b128 _v;
1741
+ mov.b128 _d, {%0, %1};
1742
+ mov.b128 _v, {%4, %5};
1743
+ atom.cas.release.cta.b128 _d,[%2],_d,_v;
1744
+ mov.b128 {%0, %1}, _d;
1745
+ }
1746
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1498
1747
  template <class _Type>
1499
1748
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1500
1749
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
1501
1750
  {
1751
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1752
+ NV_DISPATCH_TARGET(
1753
+ NV_PROVIDES_SM_90, (),
1754
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1755
+ )
1502
1756
  asm volatile(R"YYY(
1503
- .reg .b128 _d;
1504
- .reg .b128 _v;
1505
- mov.b128 {%0, %1}, _d;
1506
- mov.b128 {%4, %5}, _v;
1507
- atom.cas.release.cluster.b128 _d,[%2],_d,_v;
1508
- mov.b128 _d, {%0, %1};
1509
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1757
+ {
1758
+ .reg .b128 _d;
1759
+ .reg .b128 _v;
1760
+ mov.b128 _d, {%0, %1};
1761
+ mov.b128 _v, {%4, %5};
1762
+ atom.cas.release.cluster.b128 _d,[%2],_d,_v;
1763
+ mov.b128 {%0, %1}, _d;
1764
+ }
1765
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1510
1766
  template <class _Type>
1511
1767
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1512
1768
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_device_tag)
1513
1769
  {
1770
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1771
+ NV_DISPATCH_TARGET(
1772
+ NV_PROVIDES_SM_90, (),
1773
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1774
+ )
1514
1775
  asm volatile(R"YYY(
1515
- .reg .b128 _d;
1516
- .reg .b128 _v;
1517
- mov.b128 {%0, %1}, _d;
1518
- mov.b128 {%4, %5}, _v;
1519
- atom.cas.release.gpu.b128 _d,[%2],_d,_v;
1520
- mov.b128 _d, {%0, %1};
1521
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1776
+ {
1777
+ .reg .b128 _d;
1778
+ .reg .b128 _v;
1779
+ mov.b128 _d, {%0, %1};
1780
+ mov.b128 _v, {%4, %5};
1781
+ atom.cas.release.gpu.b128 _d,[%2],_d,_v;
1782
+ mov.b128 {%0, %1}, _d;
1783
+ }
1784
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1522
1785
  template <class _Type>
1523
1786
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1524
1787
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_system_tag)
1525
1788
  {
1789
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1790
+ NV_DISPATCH_TARGET(
1791
+ NV_PROVIDES_SM_90, (),
1792
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1793
+ )
1526
1794
  asm volatile(R"YYY(
1527
- .reg .b128 _d;
1528
- .reg .b128 _v;
1529
- mov.b128 {%0, %1}, _d;
1530
- mov.b128 {%4, %5}, _v;
1531
- atom.cas.release.sys.b128 _d,[%2],_d,_v;
1532
- mov.b128 _d, {%0, %1};
1533
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1795
+ {
1796
+ .reg .b128 _d;
1797
+ .reg .b128 _v;
1798
+ mov.b128 _d, {%0, %1};
1799
+ mov.b128 _v, {%4, %5};
1800
+ atom.cas.release.sys.b128 _d,[%2],_d,_v;
1801
+ mov.b128 {%0, %1}, _d;
1802
+ }
1803
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1534
1804
  template <class _Type>
1535
1805
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1536
1806
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_block_tag)
1537
1807
  {
1808
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1809
+ NV_DISPATCH_TARGET(
1810
+ NV_PROVIDES_SM_90, (),
1811
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1812
+ )
1538
1813
  asm volatile(R"YYY(
1539
- .reg .b128 _d;
1540
- .reg .b128 _v;
1541
- mov.b128 {%0, %1}, _d;
1542
- mov.b128 {%4, %5}, _v;
1543
- atom.cas.acq_rel.cta.b128 _d,[%2],_d,_v;
1544
- mov.b128 _d, {%0, %1};
1545
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1814
+ {
1815
+ .reg .b128 _d;
1816
+ .reg .b128 _v;
1817
+ mov.b128 _d, {%0, %1};
1818
+ mov.b128 _v, {%4, %5};
1819
+ atom.cas.acq_rel.cta.b128 _d,[%2],_d,_v;
1820
+ mov.b128 {%0, %1}, _d;
1821
+ }
1822
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1546
1823
  template <class _Type>
1547
1824
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1548
1825
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
1549
1826
  {
1827
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1828
+ NV_DISPATCH_TARGET(
1829
+ NV_PROVIDES_SM_90, (),
1830
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1831
+ )
1550
1832
  asm volatile(R"YYY(
1551
- .reg .b128 _d;
1552
- .reg .b128 _v;
1553
- mov.b128 {%0, %1}, _d;
1554
- mov.b128 {%4, %5}, _v;
1555
- atom.cas.acq_rel.cluster.b128 _d,[%2],_d,_v;
1556
- mov.b128 _d, {%0, %1};
1557
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1833
+ {
1834
+ .reg .b128 _d;
1835
+ .reg .b128 _v;
1836
+ mov.b128 _d, {%0, %1};
1837
+ mov.b128 _v, {%4, %5};
1838
+ atom.cas.acq_rel.cluster.b128 _d,[%2],_d,_v;
1839
+ mov.b128 {%0, %1}, _d;
1840
+ }
1841
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1558
1842
  template <class _Type>
1559
1843
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1560
1844
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_device_tag)
1561
1845
  {
1846
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1847
+ NV_DISPATCH_TARGET(
1848
+ NV_PROVIDES_SM_90, (),
1849
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1850
+ )
1562
1851
  asm volatile(R"YYY(
1563
- .reg .b128 _d;
1564
- .reg .b128 _v;
1565
- mov.b128 {%0, %1}, _d;
1566
- mov.b128 {%4, %5}, _v;
1567
- atom.cas.acq_rel.gpu.b128 _d,[%2],_d,_v;
1568
- mov.b128 _d, {%0, %1};
1569
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1852
+ {
1853
+ .reg .b128 _d;
1854
+ .reg .b128 _v;
1855
+ mov.b128 _d, {%0, %1};
1856
+ mov.b128 _v, {%4, %5};
1857
+ atom.cas.acq_rel.gpu.b128 _d,[%2],_d,_v;
1858
+ mov.b128 {%0, %1}, _d;
1859
+ }
1860
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1570
1861
  template <class _Type>
1571
1862
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1572
1863
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_system_tag)
1573
1864
  {
1865
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1866
+ NV_DISPATCH_TARGET(
1867
+ NV_PROVIDES_SM_90, (),
1868
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1869
+ )
1574
1870
  asm volatile(R"YYY(
1575
- .reg .b128 _d;
1576
- .reg .b128 _v;
1577
- mov.b128 {%0, %1}, _d;
1578
- mov.b128 {%4, %5}, _v;
1579
- atom.cas.acq_rel.sys.b128 _d,[%2],_d,_v;
1580
- mov.b128 _d, {%0, %1};
1581
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1871
+ {
1872
+ .reg .b128 _d;
1873
+ .reg .b128 _v;
1874
+ mov.b128 _d, {%0, %1};
1875
+ mov.b128 _v, {%4, %5};
1876
+ atom.cas.acq_rel.sys.b128 _d,[%2],_d,_v;
1877
+ mov.b128 {%0, %1}, _d;
1878
+ }
1879
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1582
1880
  template <class _Type>
1583
1881
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1584
1882
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_block_tag)
1585
1883
  {
1884
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1885
+ NV_DISPATCH_TARGET(
1886
+ NV_PROVIDES_SM_90, (),
1887
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1888
+ )
1586
1889
  asm volatile(R"YYY(
1587
- .reg .b128 _d;
1588
- .reg .b128 _v;
1589
- mov.b128 {%0, %1}, _d;
1590
- mov.b128 {%4, %5}, _v;
1591
- atom.cas.cta.b128 _d,[%2],_d,_v;
1592
- mov.b128 _d, {%0, %1};
1593
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1890
+ {
1891
+ .reg .b128 _d;
1892
+ .reg .b128 _v;
1893
+ mov.b128 _d, {%0, %1};
1894
+ mov.b128 _v, {%4, %5};
1895
+ atom.cas.cta.b128 _d,[%2],_d,_v;
1896
+ mov.b128 {%0, %1}, _d;
1897
+ }
1898
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1594
1899
  template <class _Type>
1595
1900
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1596
1901
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
1597
1902
  {
1903
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1904
+ NV_DISPATCH_TARGET(
1905
+ NV_PROVIDES_SM_90, (),
1906
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1907
+ )
1598
1908
  asm volatile(R"YYY(
1599
- .reg .b128 _d;
1600
- .reg .b128 _v;
1601
- mov.b128 {%0, %1}, _d;
1602
- mov.b128 {%4, %5}, _v;
1603
- atom.cas.cluster.b128 _d,[%2],_d,_v;
1604
- mov.b128 _d, {%0, %1};
1605
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1909
+ {
1910
+ .reg .b128 _d;
1911
+ .reg .b128 _v;
1912
+ mov.b128 _d, {%0, %1};
1913
+ mov.b128 _v, {%4, %5};
1914
+ atom.cas.cluster.b128 _d,[%2],_d,_v;
1915
+ mov.b128 {%0, %1}, _d;
1916
+ }
1917
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1606
1918
  template <class _Type>
1607
1919
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1608
1920
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_device_tag)
1609
1921
  {
1922
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1923
+ NV_DISPATCH_TARGET(
1924
+ NV_PROVIDES_SM_90, (),
1925
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1926
+ )
1610
1927
  asm volatile(R"YYY(
1611
- .reg .b128 _d;
1612
- .reg .b128 _v;
1613
- mov.b128 {%0, %1}, _d;
1614
- mov.b128 {%4, %5}, _v;
1615
- atom.cas.gpu.b128 _d,[%2],_d,_v;
1616
- mov.b128 _d, {%0, %1};
1617
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1928
+ {
1929
+ .reg .b128 _d;
1930
+ .reg .b128 _v;
1931
+ mov.b128 _d, {%0, %1};
1932
+ mov.b128 _v, {%4, %5};
1933
+ atom.cas.gpu.b128 _d,[%2],_d,_v;
1934
+ mov.b128 {%0, %1}, _d;
1935
+ }
1936
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1618
1937
  template <class _Type>
1619
1938
  static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
1620
1939
  _Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_system_tag)
1621
1940
  {
1941
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
1942
+ NV_DISPATCH_TARGET(
1943
+ NV_PROVIDES_SM_90, (),
1944
+ NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
1945
+ )
1622
1946
  asm volatile(R"YYY(
1623
- .reg .b128 _d;
1624
- .reg .b128 _v;
1625
- mov.b128 {%0, %1}, _d;
1626
- mov.b128 {%4, %5}, _v;
1627
- atom.cas.sys.b128 _d,[%2],_d,_v;
1628
- mov.b128 _d, {%0, %1};
1629
- )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.x == __cmp.x && __dst.y == __cmp.y; }
1947
+ {
1948
+ .reg .b128 _d;
1949
+ .reg .b128 _v;
1950
+ mov.b128 _d, {%0, %1};
1951
+ mov.b128 _v, {%4, %5};
1952
+ atom.cas.sys.b128 _d,[%2],_d,_v;
1953
+ mov.b128 {%0, %1}, _d;
1954
+ }
1955
+ )YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
1630
1956
 
1631
1957
  template <typename _Type, typename _Tag, typename _Sco>
1632
1958
  struct __cuda_atomic_bind_compare_exchange {
@@ -1858,241 +2184,381 @@ template <class _Type>
1858
2184
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
1859
2185
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_block_tag)
1860
2186
  {
2187
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2188
+ NV_DISPATCH_TARGET(
2189
+ NV_PROVIDES_SM_90, (),
2190
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2191
+ )
1861
2192
  asm volatile(R"YYY(
1862
- .reg .b128 _d;
1863
- .reg .b128 _v;
1864
- mov.b128 {%3, %4}, _v;
1865
- atom.exch.acquire.cta.b128 _d,[%2],_v;
1866
- mov.b128 _d, {%0, %1};
1867
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2193
+ {
2194
+ .reg .b128 _d;
2195
+ .reg .b128 _v;
2196
+ mov.b128 _v, {%3, %4};
2197
+ atom.exch.acquire.cta.b128 _d,[%2],_v;
2198
+ mov.b128 {%0, %1}, _d;
2199
+ }
2200
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
1868
2201
  }
1869
2202
  template <class _Type>
1870
2203
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
1871
2204
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
1872
2205
  {
2206
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2207
+ NV_DISPATCH_TARGET(
2208
+ NV_PROVIDES_SM_90, (),
2209
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2210
+ )
1873
2211
  asm volatile(R"YYY(
1874
- .reg .b128 _d;
1875
- .reg .b128 _v;
1876
- mov.b128 {%3, %4}, _v;
1877
- atom.exch.acquire.cluster.b128 _d,[%2],_v;
1878
- mov.b128 _d, {%0, %1};
1879
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2212
+ {
2213
+ .reg .b128 _d;
2214
+ .reg .b128 _v;
2215
+ mov.b128 _v, {%3, %4};
2216
+ atom.exch.acquire.cluster.b128 _d,[%2],_v;
2217
+ mov.b128 {%0, %1}, _d;
2218
+ }
2219
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
1880
2220
  }
1881
2221
  template <class _Type>
1882
2222
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
1883
2223
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_device_tag)
1884
2224
  {
2225
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2226
+ NV_DISPATCH_TARGET(
2227
+ NV_PROVIDES_SM_90, (),
2228
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2229
+ )
1885
2230
  asm volatile(R"YYY(
1886
- .reg .b128 _d;
1887
- .reg .b128 _v;
1888
- mov.b128 {%3, %4}, _v;
1889
- atom.exch.acquire.gpu.b128 _d,[%2],_v;
1890
- mov.b128 _d, {%0, %1};
1891
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2231
+ {
2232
+ .reg .b128 _d;
2233
+ .reg .b128 _v;
2234
+ mov.b128 _v, {%3, %4};
2235
+ atom.exch.acquire.gpu.b128 _d,[%2],_v;
2236
+ mov.b128 {%0, %1}, _d;
2237
+ }
2238
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
1892
2239
  }
1893
2240
  template <class _Type>
1894
2241
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
1895
2242
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_system_tag)
1896
2243
  {
2244
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2245
+ NV_DISPATCH_TARGET(
2246
+ NV_PROVIDES_SM_90, (),
2247
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2248
+ )
1897
2249
  asm volatile(R"YYY(
1898
- .reg .b128 _d;
1899
- .reg .b128 _v;
1900
- mov.b128 {%3, %4}, _v;
1901
- atom.exch.acquire.sys.b128 _d,[%2],_v;
1902
- mov.b128 _d, {%0, %1};
1903
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2250
+ {
2251
+ .reg .b128 _d;
2252
+ .reg .b128 _v;
2253
+ mov.b128 _v, {%3, %4};
2254
+ atom.exch.acquire.sys.b128 _d,[%2],_v;
2255
+ mov.b128 {%0, %1}, _d;
2256
+ }
2257
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
1904
2258
  }
1905
2259
  template <class _Type>
1906
2260
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
1907
2261
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_block_tag)
1908
2262
  {
2263
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2264
+ NV_DISPATCH_TARGET(
2265
+ NV_PROVIDES_SM_90, (),
2266
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2267
+ )
1909
2268
  asm volatile(R"YYY(
1910
- .reg .b128 _d;
1911
- .reg .b128 _v;
1912
- mov.b128 {%3, %4}, _v;
1913
- atom.exch.relaxed.cta.b128 _d,[%2],_v;
1914
- mov.b128 _d, {%0, %1};
1915
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2269
+ {
2270
+ .reg .b128 _d;
2271
+ .reg .b128 _v;
2272
+ mov.b128 _v, {%3, %4};
2273
+ atom.exch.relaxed.cta.b128 _d,[%2],_v;
2274
+ mov.b128 {%0, %1}, _d;
2275
+ }
2276
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
1916
2277
  }
1917
2278
  template <class _Type>
1918
2279
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
1919
2280
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
1920
2281
  {
2282
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2283
+ NV_DISPATCH_TARGET(
2284
+ NV_PROVIDES_SM_90, (),
2285
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2286
+ )
1921
2287
  asm volatile(R"YYY(
1922
- .reg .b128 _d;
1923
- .reg .b128 _v;
1924
- mov.b128 {%3, %4}, _v;
1925
- atom.exch.relaxed.cluster.b128 _d,[%2],_v;
1926
- mov.b128 _d, {%0, %1};
1927
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2288
+ {
2289
+ .reg .b128 _d;
2290
+ .reg .b128 _v;
2291
+ mov.b128 _v, {%3, %4};
2292
+ atom.exch.relaxed.cluster.b128 _d,[%2],_v;
2293
+ mov.b128 {%0, %1}, _d;
2294
+ }
2295
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
1928
2296
  }
1929
2297
  template <class _Type>
1930
2298
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
1931
2299
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_device_tag)
1932
2300
  {
2301
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2302
+ NV_DISPATCH_TARGET(
2303
+ NV_PROVIDES_SM_90, (),
2304
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2305
+ )
1933
2306
  asm volatile(R"YYY(
1934
- .reg .b128 _d;
1935
- .reg .b128 _v;
1936
- mov.b128 {%3, %4}, _v;
1937
- atom.exch.relaxed.gpu.b128 _d,[%2],_v;
1938
- mov.b128 _d, {%0, %1};
1939
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2307
+ {
2308
+ .reg .b128 _d;
2309
+ .reg .b128 _v;
2310
+ mov.b128 _v, {%3, %4};
2311
+ atom.exch.relaxed.gpu.b128 _d,[%2],_v;
2312
+ mov.b128 {%0, %1}, _d;
2313
+ }
2314
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
1940
2315
  }
1941
2316
  template <class _Type>
1942
2317
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
1943
2318
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag)
1944
2319
  {
2320
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2321
+ NV_DISPATCH_TARGET(
2322
+ NV_PROVIDES_SM_90, (),
2323
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2324
+ )
1945
2325
  asm volatile(R"YYY(
1946
- .reg .b128 _d;
1947
- .reg .b128 _v;
1948
- mov.b128 {%3, %4}, _v;
1949
- atom.exch.relaxed.sys.b128 _d,[%2],_v;
1950
- mov.b128 _d, {%0, %1};
1951
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2326
+ {
2327
+ .reg .b128 _d;
2328
+ .reg .b128 _v;
2329
+ mov.b128 _v, {%3, %4};
2330
+ atom.exch.relaxed.sys.b128 _d,[%2],_v;
2331
+ mov.b128 {%0, %1}, _d;
2332
+ }
2333
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
1952
2334
  }
1953
2335
  template <class _Type>
1954
2336
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
1955
2337
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_block_tag)
1956
2338
  {
2339
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2340
+ NV_DISPATCH_TARGET(
2341
+ NV_PROVIDES_SM_90, (),
2342
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2343
+ )
1957
2344
  asm volatile(R"YYY(
1958
- .reg .b128 _d;
1959
- .reg .b128 _v;
1960
- mov.b128 {%3, %4}, _v;
1961
- atom.exch.release.cta.b128 _d,[%2],_v;
1962
- mov.b128 _d, {%0, %1};
1963
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2345
+ {
2346
+ .reg .b128 _d;
2347
+ .reg .b128 _v;
2348
+ mov.b128 _v, {%3, %4};
2349
+ atom.exch.release.cta.b128 _d,[%2],_v;
2350
+ mov.b128 {%0, %1}, _d;
2351
+ }
2352
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
1964
2353
  }
1965
2354
  template <class _Type>
1966
2355
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
1967
2356
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
1968
2357
  {
2358
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2359
+ NV_DISPATCH_TARGET(
2360
+ NV_PROVIDES_SM_90, (),
2361
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2362
+ )
1969
2363
  asm volatile(R"YYY(
1970
- .reg .b128 _d;
1971
- .reg .b128 _v;
1972
- mov.b128 {%3, %4}, _v;
1973
- atom.exch.release.cluster.b128 _d,[%2],_v;
1974
- mov.b128 _d, {%0, %1};
1975
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2364
+ {
2365
+ .reg .b128 _d;
2366
+ .reg .b128 _v;
2367
+ mov.b128 _v, {%3, %4};
2368
+ atom.exch.release.cluster.b128 _d,[%2],_v;
2369
+ mov.b128 {%0, %1}, _d;
2370
+ }
2371
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
1976
2372
  }
1977
2373
  template <class _Type>
1978
2374
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
1979
2375
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_device_tag)
1980
2376
  {
2377
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2378
+ NV_DISPATCH_TARGET(
2379
+ NV_PROVIDES_SM_90, (),
2380
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2381
+ )
1981
2382
  asm volatile(R"YYY(
1982
- .reg .b128 _d;
1983
- .reg .b128 _v;
1984
- mov.b128 {%3, %4}, _v;
1985
- atom.exch.release.gpu.b128 _d,[%2],_v;
1986
- mov.b128 _d, {%0, %1};
1987
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2383
+ {
2384
+ .reg .b128 _d;
2385
+ .reg .b128 _v;
2386
+ mov.b128 _v, {%3, %4};
2387
+ atom.exch.release.gpu.b128 _d,[%2],_v;
2388
+ mov.b128 {%0, %1}, _d;
2389
+ }
2390
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
1988
2391
  }
1989
2392
  template <class _Type>
1990
2393
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
1991
2394
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_system_tag)
1992
2395
  {
2396
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2397
+ NV_DISPATCH_TARGET(
2398
+ NV_PROVIDES_SM_90, (),
2399
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2400
+ )
1993
2401
  asm volatile(R"YYY(
1994
- .reg .b128 _d;
1995
- .reg .b128 _v;
1996
- mov.b128 {%3, %4}, _v;
1997
- atom.exch.release.sys.b128 _d,[%2],_v;
1998
- mov.b128 _d, {%0, %1};
1999
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2402
+ {
2403
+ .reg .b128 _d;
2404
+ .reg .b128 _v;
2405
+ mov.b128 _v, {%3, %4};
2406
+ atom.exch.release.sys.b128 _d,[%2],_v;
2407
+ mov.b128 {%0, %1}, _d;
2408
+ }
2409
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2000
2410
  }
2001
2411
  template <class _Type>
2002
2412
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
2003
2413
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_block_tag)
2004
2414
  {
2415
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2416
+ NV_DISPATCH_TARGET(
2417
+ NV_PROVIDES_SM_90, (),
2418
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2419
+ )
2005
2420
  asm volatile(R"YYY(
2006
- .reg .b128 _d;
2007
- .reg .b128 _v;
2008
- mov.b128 {%3, %4}, _v;
2009
- atom.exch.acq_rel.cta.b128 _d,[%2],_v;
2010
- mov.b128 _d, {%0, %1};
2011
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2421
+ {
2422
+ .reg .b128 _d;
2423
+ .reg .b128 _v;
2424
+ mov.b128 _v, {%3, %4};
2425
+ atom.exch.acq_rel.cta.b128 _d,[%2],_v;
2426
+ mov.b128 {%0, %1}, _d;
2427
+ }
2428
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2012
2429
  }
2013
2430
  template <class _Type>
2014
2431
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
2015
2432
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
2016
2433
  {
2434
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2435
+ NV_DISPATCH_TARGET(
2436
+ NV_PROVIDES_SM_90, (),
2437
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2438
+ )
2017
2439
  asm volatile(R"YYY(
2018
- .reg .b128 _d;
2019
- .reg .b128 _v;
2020
- mov.b128 {%3, %4}, _v;
2021
- atom.exch.acq_rel.cluster.b128 _d,[%2],_v;
2022
- mov.b128 _d, {%0, %1};
2023
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2440
+ {
2441
+ .reg .b128 _d;
2442
+ .reg .b128 _v;
2443
+ mov.b128 _v, {%3, %4};
2444
+ atom.exch.acq_rel.cluster.b128 _d,[%2],_v;
2445
+ mov.b128 {%0, %1}, _d;
2446
+ }
2447
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2024
2448
  }
2025
2449
  template <class _Type>
2026
2450
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
2027
2451
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_device_tag)
2028
2452
  {
2453
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2454
+ NV_DISPATCH_TARGET(
2455
+ NV_PROVIDES_SM_90, (),
2456
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2457
+ )
2029
2458
  asm volatile(R"YYY(
2030
- .reg .b128 _d;
2031
- .reg .b128 _v;
2032
- mov.b128 {%3, %4}, _v;
2033
- atom.exch.acq_rel.gpu.b128 _d,[%2],_v;
2034
- mov.b128 _d, {%0, %1};
2035
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2459
+ {
2460
+ .reg .b128 _d;
2461
+ .reg .b128 _v;
2462
+ mov.b128 _v, {%3, %4};
2463
+ atom.exch.acq_rel.gpu.b128 _d,[%2],_v;
2464
+ mov.b128 {%0, %1}, _d;
2465
+ }
2466
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2036
2467
  }
2037
2468
  template <class _Type>
2038
2469
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
2039
2470
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_system_tag)
2040
2471
  {
2472
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2473
+ NV_DISPATCH_TARGET(
2474
+ NV_PROVIDES_SM_90, (),
2475
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2476
+ )
2041
2477
  asm volatile(R"YYY(
2042
- .reg .b128 _d;
2043
- .reg .b128 _v;
2044
- mov.b128 {%3, %4}, _v;
2045
- atom.exch.acq_rel.sys.b128 _d,[%2],_v;
2046
- mov.b128 _d, {%0, %1};
2047
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2478
+ {
2479
+ .reg .b128 _d;
2480
+ .reg .b128 _v;
2481
+ mov.b128 _v, {%3, %4};
2482
+ atom.exch.acq_rel.sys.b128 _d,[%2],_v;
2483
+ mov.b128 {%0, %1}, _d;
2484
+ }
2485
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2048
2486
  }
2049
2487
  template <class _Type>
2050
2488
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
2051
2489
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_block_tag)
2052
2490
  {
2491
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2492
+ NV_DISPATCH_TARGET(
2493
+ NV_PROVIDES_SM_90, (),
2494
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2495
+ )
2053
2496
  asm volatile(R"YYY(
2054
- .reg .b128 _d;
2055
- .reg .b128 _v;
2056
- mov.b128 {%3, %4}, _v;
2057
- atom.exch.cta.b128 _d,[%2],_v;
2058
- mov.b128 _d, {%0, %1};
2059
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2497
+ {
2498
+ .reg .b128 _d;
2499
+ .reg .b128 _v;
2500
+ mov.b128 _v, {%3, %4};
2501
+ atom.exch.cta.b128 _d,[%2],_v;
2502
+ mov.b128 {%0, %1}, _d;
2503
+ }
2504
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2060
2505
  }
2061
2506
  template <class _Type>
2062
2507
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
2063
2508
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
2064
2509
  {
2510
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2511
+ NV_DISPATCH_TARGET(
2512
+ NV_PROVIDES_SM_90, (),
2513
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2514
+ )
2065
2515
  asm volatile(R"YYY(
2066
- .reg .b128 _d;
2067
- .reg .b128 _v;
2068
- mov.b128 {%3, %4}, _v;
2069
- atom.exch.cluster.b128 _d,[%2],_v;
2070
- mov.b128 _d, {%0, %1};
2071
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2516
+ {
2517
+ .reg .b128 _d;
2518
+ .reg .b128 _v;
2519
+ mov.b128 _v, {%3, %4};
2520
+ atom.exch.cluster.b128 _d,[%2],_v;
2521
+ mov.b128 {%0, %1}, _d;
2522
+ }
2523
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2072
2524
  }
2073
2525
  template <class _Type>
2074
2526
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
2075
2527
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_device_tag)
2076
2528
  {
2529
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2530
+ NV_DISPATCH_TARGET(
2531
+ NV_PROVIDES_SM_90, (),
2532
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2533
+ )
2077
2534
  asm volatile(R"YYY(
2078
- .reg .b128 _d;
2079
- .reg .b128 _v;
2080
- mov.b128 {%3, %4}, _v;
2081
- atom.exch.gpu.b128 _d,[%2],_v;
2082
- mov.b128 _d, {%0, %1};
2083
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2535
+ {
2536
+ .reg .b128 _d;
2537
+ .reg .b128 _v;
2538
+ mov.b128 _v, {%3, %4};
2539
+ atom.exch.gpu.b128 _d,[%2],_v;
2540
+ mov.b128 {%0, %1}, _d;
2541
+ }
2542
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2084
2543
  }
2085
2544
  template <class _Type>
2086
2545
  static inline _CCCL_DEVICE void __cuda_atomic_exchange(
2087
2546
  _Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_system_tag)
2088
2547
  {
2548
+ static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
2549
+ NV_DISPATCH_TARGET(
2550
+ NV_PROVIDES_SM_90, (),
2551
+ NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
2552
+ )
2089
2553
  asm volatile(R"YYY(
2090
- .reg .b128 _d;
2091
- .reg .b128 _v;
2092
- mov.b128 {%3, %4}, _v;
2093
- atom.exch.sys.b128 _d,[%2],_v;
2094
- mov.b128 _d, {%0, %1};
2095
- )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2554
+ {
2555
+ .reg .b128 _d;
2556
+ .reg .b128 _v;
2557
+ mov.b128 _v, {%3, %4};
2558
+ atom.exch.sys.b128 _d,[%2],_v;
2559
+ mov.b128 {%0, %1}, _d;
2560
+ }
2561
+ )YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
2096
2562
  }
2097
2563
 
2098
2564
  template <typename _Type, typename _Tag, typename _Sco>
@@ -2572,7 +3038,7 @@ struct __cuda_atomic_bind_fetch_add {
2572
3038
  }
2573
3039
  };
2574
3040
  template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_arithmetic<_Type> = 0>
2575
- static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
3041
+ [[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
2576
3042
  {
2577
3043
  constexpr auto __skip_v = __atomic_ptr_skip_t<_Type>::__skip;
2578
3044
  __op = __op * __skip_v;
@@ -2588,7 +3054,7 @@ static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type* __ptr, _Up __op,
2588
3054
  return __dst;
2589
3055
  }
2590
3056
  template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_arithmetic<_Type> = 0>
2591
- static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
3057
+ [[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
2592
3058
  {
2593
3059
  constexpr auto __skip_v = __atomic_ptr_skip_t<_Type>::__skip;
2594
3060
  __op = __op * __skip_v;
@@ -2777,7 +3243,7 @@ struct __cuda_atomic_bind_fetch_and {
2777
3243
  }
2778
3244
  };
2779
3245
  template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_bitwise<_Type> = 0>
2780
- static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
3246
+ [[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
2781
3247
  {
2782
3248
  constexpr auto __skip_v = 1;
2783
3249
  __op = __op * __skip_v;
@@ -2793,7 +3259,7 @@ static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type* __ptr, _Up __op,
2793
3259
  return __dst;
2794
3260
  }
2795
3261
  template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_bitwise<_Type> = 0>
2796
- static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
3262
+ [[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
2797
3263
  {
2798
3264
  constexpr auto __skip_v = 1;
2799
3265
  __op = __op * __skip_v;
@@ -3142,7 +3608,7 @@ struct __cuda_atomic_bind_fetch_max {
3142
3608
  }
3143
3609
  };
3144
3610
  template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_minmax<_Type> = 0>
3145
- static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
3611
+ [[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
3146
3612
  {
3147
3613
  constexpr auto __skip_v = 1;
3148
3614
  __op = __op * __skip_v;
@@ -3158,7 +3624,7 @@ static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type* __ptr, _Up __op,
3158
3624
  return __dst;
3159
3625
  }
3160
3626
  template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_minmax<_Type> = 0>
3161
- static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
3627
+ [[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
3162
3628
  {
3163
3629
  constexpr auto __skip_v = 1;
3164
3630
  __op = __op * __skip_v;
@@ -3507,7 +3973,7 @@ struct __cuda_atomic_bind_fetch_min {
3507
3973
  }
3508
3974
  };
3509
3975
  template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_minmax<_Type> = 0>
3510
- static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
3976
+ [[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
3511
3977
  {
3512
3978
  constexpr auto __skip_v = 1;
3513
3979
  __op = __op * __skip_v;
@@ -3523,7 +3989,7 @@ static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type* __ptr, _Up __op,
3523
3989
  return __dst;
3524
3990
  }
3525
3991
  template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_minmax<_Type> = 0>
3526
- static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
3992
+ [[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
3527
3993
  {
3528
3994
  constexpr auto __skip_v = 1;
3529
3995
  __op = __op * __skip_v;
@@ -3712,7 +4178,7 @@ struct __cuda_atomic_bind_fetch_or {
3712
4178
  }
3713
4179
  };
3714
4180
  template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_bitwise<_Type> = 0>
3715
- static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
4181
+ [[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
3716
4182
  {
3717
4183
  constexpr auto __skip_v = 1;
3718
4184
  __op = __op * __skip_v;
@@ -3728,7 +4194,7 @@ static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type* __ptr, _Up __op,
3728
4194
  return __dst;
3729
4195
  }
3730
4196
  template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_bitwise<_Type> = 0>
3731
- static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
4197
+ [[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
3732
4198
  {
3733
4199
  constexpr auto __skip_v = 1;
3734
4200
  __op = __op * __skip_v;
@@ -3917,7 +4383,7 @@ struct __cuda_atomic_bind_fetch_xor {
3917
4383
  }
3918
4384
  };
3919
4385
  template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_bitwise<_Type> = 0>
3920
- static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
4386
+ [[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
3921
4387
  {
3922
4388
  constexpr auto __skip_v = 1;
3923
4389
  __op = __op * __skip_v;
@@ -3933,7 +4399,7 @@ static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type* __ptr, _Up __op,
3933
4399
  return __dst;
3934
4400
  }
3935
4401
  template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_bitwise<_Type> = 0>
3936
- static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
4402
+ [[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
3937
4403
  {
3938
4404
  constexpr auto __skip_v = 1;
3939
4405
  __op = __op * __skip_v;
@@ -3950,12 +4416,12 @@ static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type volatile* __ptr,
3950
4416
  }
3951
4417
 
3952
4418
  template <class _Type, class _Up, class _Sco>
3953
- static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
4419
+ [[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
3954
4420
  {
3955
4421
  return __atomic_fetch_add_cuda(__ptr, -__op, __memorder, _Sco{});
3956
4422
  }
3957
4423
  template <class _Type, class _Up, class _Sco>
3958
- static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
4424
+ [[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
3959
4425
  {
3960
4426
  return __atomic_fetch_add_cuda(__ptr, -__op, __memorder, _Sco{});
3961
4427
  }