cuda-cccl 0.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1968):
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +259 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1182 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +81 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +709 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +748 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +786 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +703 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +555 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +589 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +474 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +289 -0
  23. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1117 -0
  24. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
  25. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +606 -0
  26. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
  27. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +631 -0
  28. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
  29. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +963 -0
  30. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1227 -0
  31. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1313 -0
  32. cuda/cccl/headers/include/cub/block/block_histogram.cuh +424 -0
  33. cuda/cccl/headers/include/cub/block/block_load.cuh +1264 -0
  34. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  35. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
  36. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1225 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2196 -0
  38. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  39. cuda/cccl/headers/include/cub/block/block_reduce.cuh +667 -0
  40. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
  41. cuda/cccl/headers/include/cub/block/block_scan.cuh +2315 -0
  42. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  43. cuda/cccl/headers/include/cub/block/block_store.cuh +1247 -0
  44. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
  45. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  52. cuda/cccl/headers/include/cub/config.cuh +53 -0
  53. cuda/cccl/headers/include/cub/cub.cuh +120 -0
  54. cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
  55. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
  56. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
  57. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  58. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
  59. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
  60. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
  61. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  62. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  63. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +114 -0
  64. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  65. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  71. cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
  72. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  73. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  74. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  75. cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
  76. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
  77. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  78. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  79. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  80. cuda/cccl/headers/include/cub/device/device_for.cuh +1063 -0
  81. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
  82. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  83. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  84. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  85. cuda/cccl/headers/include/cub/device/device_partition.cuh +668 -0
  86. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
  87. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2518 -0
  88. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  89. cuda/cccl/headers/include/cub/device/device_scan.cuh +2212 -0
  90. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1430 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  93. cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
  94. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  95. cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1744 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1310 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +531 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +517 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +975 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +440 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +627 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +569 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +261 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +583 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +189 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +321 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +522 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1028 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +67 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +118 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +60 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +275 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +76 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +126 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1065 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +942 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +673 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +618 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1010 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +398 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +440 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +481 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +884 -0
  149. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
  150. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  151. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  152. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
  153. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
  154. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
  155. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
  156. cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
  157. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
  158. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +548 -0
  159. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  160. cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
  161. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
  162. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
  163. cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
  164. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  165. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  166. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  167. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  168. cuda/cccl/headers/include/cub/util_device.cuh +800 -0
  169. cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
  170. cuda/cccl/headers/include/cub/util_math.cuh +118 -0
  171. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  172. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  173. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  174. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  175. cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
  176. cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
  177. cuda/cccl/headers/include/cub/version.cuh +89 -0
  178. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  179. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  180. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +737 -0
  181. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +408 -0
  182. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
  184. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  185. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  186. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  187. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +829 -0
  188. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1890 -0
  189. cuda/cccl/headers/include/cub/warp/warp_store.cuh +521 -0
  190. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  191. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  192. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  193. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  194. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  201. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  202. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  203. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  204. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +487 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  209. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  210. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  211. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  212. cuda/cccl/headers/include/cuda/__cccl_config +37 -0
  213. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  214. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  217. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  218. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  225. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  226. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  227. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  228. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  229. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  230. cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
  231. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  232. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  233. cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
  234. cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
  235. cuda/cccl/headers/include/cuda/__driver/driver_api.h +835 -0
  236. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  237. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  238. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  239. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  240. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
  241. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  242. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  243. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  244. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  245. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  246. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  247. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  248. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  249. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  250. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  251. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  252. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  253. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  254. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  255. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  256. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
  257. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  258. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  259. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  260. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  261. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  262. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  264. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  265. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
  268. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  269. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +533 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  273. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  286. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  289. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  290. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  291. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  292. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  293. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  294. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  295. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  296. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
  299. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  300. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +652 -0
  301. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  302. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  303. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  304. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  305. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  306. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2983 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  414. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  415. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  416. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  417. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  418. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  419. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  420. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  421. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  422. cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
  423. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
  424. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  425. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  426. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  427. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  428. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  429. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  430. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  431. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  432. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  433. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  447. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  448. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  449. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  450. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  451. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  452. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  453. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  454. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  455. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  456. cuda/cccl/headers/include/cuda/access_property +26 -0
  457. cuda/cccl/headers/include/cuda/algorithm +27 -0
  458. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  459. cuda/cccl/headers/include/cuda/atomic +27 -0
  460. cuda/cccl/headers/include/cuda/barrier +267 -0
  461. cuda/cccl/headers/include/cuda/bit +29 -0
  462. cuda/cccl/headers/include/cuda/cmath +37 -0
  463. cuda/cccl/headers/include/cuda/devices +33 -0
  464. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  465. cuda/cccl/headers/include/cuda/functional +32 -0
  466. cuda/cccl/headers/include/cuda/iterator +39 -0
  467. cuda/cccl/headers/include/cuda/latch +27 -0
  468. cuda/cccl/headers/include/cuda/mdspan +28 -0
  469. cuda/cccl/headers/include/cuda/memory +35 -0
  470. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  471. cuda/cccl/headers/include/cuda/numeric +29 -0
  472. cuda/cccl/headers/include/cuda/pipeline +579 -0
  473. cuda/cccl/headers/include/cuda/ptx +129 -0
  474. cuda/cccl/headers/include/cuda/semaphore +31 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  571. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  572. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  573. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  574. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  575. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  576. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  577. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  593. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  594. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  595. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  596. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  597. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  598. cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
  599. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  600. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  601. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  602. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  603. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  604. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  605. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  606. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  607. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  608. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  609. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  610. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +673 -0
  611. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  612. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +91 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  635. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  636. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  637. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  638. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  639. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  640. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  641. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
  642. cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
  643. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
  644. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
  645. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
  646. cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
  647. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
  648. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
  649. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +259 -0
  650. cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
  651. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  652. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  653. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
  656. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  657. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
  658. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  659. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  660. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  661. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +186 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  678. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  679. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  680. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  681. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  682. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  683. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  684. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  685. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  686. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  687. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  688. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  693. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  694. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  695. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  696. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  697. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  698. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  699. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
  700. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  701. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  702. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  703. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  716. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  717. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  718. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  719. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  720. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  721. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  722. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  723. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  724. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  725. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +139 -0
  726. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  727. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  728. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  729. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  730. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  731. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  732. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  733. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +165 -0
  734. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  735. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  736. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  737. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  738. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  739. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  740. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  741. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  742. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  743. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  744. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  745. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  750. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  751. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  752. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  753. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  754. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  755. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  756. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  757. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  758. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  759. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  760. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  761. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  762. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  763. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  764. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  765. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  766. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  767. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  768. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  769. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  770. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  771. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  772. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  773. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  775. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  776. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  777. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  778. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  779. cuda/cccl/headers/include/cuda/std/__functional/function.h +1275 -0
  780. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  781. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  797. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  798. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  799. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  800. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  801. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  802. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  803. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  804. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  805. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  806. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  807. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  808. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  818. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  819. cuda/cccl/headers/include/cuda/std/__internal/features.h +72 -0
  820. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
  821. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  822. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  834. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  858. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  859. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  860. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  861. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  862. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  863. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  864. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  865. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  866. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  867. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  868. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  869. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  870. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +759 -0
  871. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  872. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  873. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  874. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  875. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  876. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  877. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  878. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  879. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  880. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  881. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  882. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  883. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  884. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  885. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +525 -0
  886. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  887. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  888. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  889. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +246 -0
  890. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  891. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  899. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  900. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  901. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  902. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  903. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  904. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  905. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  906. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  907. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  908. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  909. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  910. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  911. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  912. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  913. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  914. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  915. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  916. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  917. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  918. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  919. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  920. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  921. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  922. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
  923. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  924. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  925. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  926. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  927. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  928. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  929. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  930. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  931. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  932. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  935. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  936. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  937. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  938. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  939. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  940. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  941. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  942. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  943. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  944. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  945. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
  946. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  947. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  960. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  961. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  962. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  963. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  964. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  965. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  966. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  967. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  968. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  969. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  970. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  971. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  972. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  973. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  974. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  975. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  976. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
  977. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  978. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  979. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  980. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +84 -0
  981. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  982. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  983. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  984. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1125. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1126. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1127. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1128. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1129. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1130. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1131. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1132. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1133. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1134. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1135. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1136. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1137. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1138. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1139. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1140. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1141. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1142. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1143. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1144. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1145. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1146. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1147. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1148. cuda/cccl/headers/include/cuda/std/array +518 -0
  1149. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1150. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1151. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1152. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1153. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1154. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1155. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1156. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1157. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1158. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1159. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1160. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1161. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1162. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1163. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1164. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1165. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1166. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1167. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
  1168. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
  1169. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
  1170. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1171. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1172. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1173. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2144 -0
  1174. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1175. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1176. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1177. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1178. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1179. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1180. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1181. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1182. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1183. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1184. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1185. cuda/cccl/headers/include/cuda/std/numbers +346 -0
  1186. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1187. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1188. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1189. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1190. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1191. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1192. cuda/cccl/headers/include/cuda/std/span +628 -0
  1193. cuda/cccl/headers/include/cuda/std/string_view +925 -0
  1194. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1195. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1196. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1197. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1198. cuda/cccl/headers/include/cuda/std/version +240 -0
  1199. cuda/cccl/headers/include/cuda/stream +31 -0
  1200. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1201. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1202. cuda/cccl/headers/include/cuda/utility +28 -0
  1203. cuda/cccl/headers/include/cuda/version +16 -0
  1204. cuda/cccl/headers/include/cuda/warp +28 -0
  1205. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1206. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1207. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1208. cuda/cccl/headers/include/nv/target +240 -0
  1209. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1210. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1211. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1212. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1213. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1214. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1215. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1216. cuda/cccl/headers/include/thrust/count.h +245 -0
  1217. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1218. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1219. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +626 -0
  1220. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +192 -0
  1221. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +96 -0
  1222. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +81 -0
  1223. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +78 -0
  1224. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1225. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +115 -0
  1226. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +116 -0
  1227. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1228. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1229. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1230. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1231. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1232. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1233. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1234. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1235. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1236. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1237. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1238. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1239. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1240. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1241. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1242. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1243. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1244. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1245. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1246. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1247. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1248. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1249. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1250. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1251. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1252. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1253. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1254. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1255. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1256. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1257. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1258. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1259. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1260. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1261. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1262. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1263. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1264. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1265. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1266. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1267. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1268. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1269. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1270. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1271. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1272. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +81 -0
  1273. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1274. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1275. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1276. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1277. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1278. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1279. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1280. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1281. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1282. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1283. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1284. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1285. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1286. cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
  1287. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1288. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1289. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1290. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1291. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1292. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1293. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1294. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1295. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1296. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1297. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1298. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1299. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1300. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1301. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1302. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1303. cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
  1304. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1305. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1306. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1307. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1308. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1309. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1310. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1311. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1312. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1313. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1314. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1315. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1316. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1317. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1318. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1319. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1320. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1321. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1322. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1323. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1324. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1325. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1326. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1327. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1328. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1329. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1330. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1331. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1332. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1333. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1334. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1335. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1336. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1337. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1338. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1339. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1340. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1341. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1342. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
  1343. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1344. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1345. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1346. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1347. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1348. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1349. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1350. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1351. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1352. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1353. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1354. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1355. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1356. cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
  1357. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1358. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1359. cuda/cccl/headers/include/thrust/find.h +382 -0
  1360. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1361. cuda/cccl/headers/include/thrust/functional.h +395 -0
  1362. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1363. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1364. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1365. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1366. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1367. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1368. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1369. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1370. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1371. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1372. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1373. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1374. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1375. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1376. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
  1377. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
  1378. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1379. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1380. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1381. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1382. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1383. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1384. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1385. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1386. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1387. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1388. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1389. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1390. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1391. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1392. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1393. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1394. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1395. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1396. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1397. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1398. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1399. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1400. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1401. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1402. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1403. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1404. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1405. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +528 -0
  1406. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1407. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1408. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1409. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1410. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1411. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1412. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1413. cuda/cccl/headers/include/thrust/mr/pool.h +528 -0
  1414. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1415. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1416. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1417. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1418. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1419. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1420. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1421. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1422. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1423. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1424. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1425. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1426. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1427. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1428. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1429. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1430. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1431. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1432. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1433. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1434. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1435. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1436. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1437. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1438. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1439. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1440. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1441. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1442. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1443. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1444. cuda/cccl/headers/include/thrust/random.h +120 -0
  1445. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1446. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1447. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1448. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1449. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1450. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1451. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1452. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1453. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1454. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1455. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1456. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1457. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1458. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1459. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1460. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1461. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1462. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1463. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
  1464. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1465. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1466. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1467. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1468. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1469. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1470. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1471. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
  1506. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1507. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1508. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1509. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1510. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1511. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +273 -0
  1512. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1513. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1514. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1515. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
  1516. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1517. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1518. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +233 -0
  1519. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1520. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1521. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +170 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +223 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +341 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +469 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1756. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +73 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/scan.inl +172 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +36 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1823. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +33 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1887. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1888. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1889. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1890. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1891. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1892. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1893. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1894. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1895. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1896. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1897. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1898. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1899. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1900. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1901. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1902. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1903. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1904. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1905. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1906. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1907. cuda/cccl/headers/include/thrust/version.h +93 -0
  1908. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1909. cuda/cccl/headers/include_paths.py +51 -0
  1910. cuda/cccl/parallel/__init__.py +9 -0
  1911. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1912. cuda/cccl/py.typed +0 -0
  1913. cuda/compute/__init__.py +79 -0
  1914. cuda/compute/_bindings.py +79 -0
  1915. cuda/compute/_bindings.pyi +475 -0
  1916. cuda/compute/_bindings_impl.pyx +2273 -0
  1917. cuda/compute/_caching.py +71 -0
  1918. cuda/compute/_cccl_interop.py +422 -0
  1919. cuda/compute/_utils/__init__.py +0 -0
  1920. cuda/compute/_utils/protocols.py +132 -0
  1921. cuda/compute/_utils/temp_storage_buffer.py +86 -0
  1922. cuda/compute/algorithms/__init__.py +54 -0
  1923. cuda/compute/algorithms/_histogram.py +243 -0
  1924. cuda/compute/algorithms/_merge_sort.py +225 -0
  1925. cuda/compute/algorithms/_radix_sort.py +312 -0
  1926. cuda/compute/algorithms/_reduce.py +182 -0
  1927. cuda/compute/algorithms/_scan.py +331 -0
  1928. cuda/compute/algorithms/_segmented_reduce.py +257 -0
  1929. cuda/compute/algorithms/_three_way_partition.py +261 -0
  1930. cuda/compute/algorithms/_transform.py +329 -0
  1931. cuda/compute/algorithms/_unique_by_key.py +252 -0
  1932. cuda/compute/cccl/.gitkeep +0 -0
  1933. cuda/compute/cu12/_bindings_impl.cp313-win_amd64.pyd +0 -0
  1934. cuda/compute/cu12/cccl/cccl.c.parallel.dll +0 -0
  1935. cuda/compute/cu12/cccl/cccl.c.parallel.lib +0 -0
  1936. cuda/compute/cu13/_bindings_impl.cp313-win_amd64.pyd +0 -0
  1937. cuda/compute/cu13/cccl/cccl.c.parallel.dll +0 -0
  1938. cuda/compute/cu13/cccl/cccl.c.parallel.lib +0 -0
  1939. cuda/compute/iterators/__init__.py +21 -0
  1940. cuda/compute/iterators/_factories.py +219 -0
  1941. cuda/compute/iterators/_iterators.py +817 -0
  1942. cuda/compute/iterators/_zip_iterator.py +199 -0
  1943. cuda/compute/numba_utils.py +53 -0
  1944. cuda/compute/op.py +3 -0
  1945. cuda/compute/struct.py +272 -0
  1946. cuda/compute/typing.py +37 -0
  1947. cuda/coop/__init__.py +8 -0
  1948. cuda/coop/_caching.py +48 -0
  1949. cuda/coop/_common.py +275 -0
  1950. cuda/coop/_nvrtc.py +92 -0
  1951. cuda/coop/_scan_op.py +181 -0
  1952. cuda/coop/_types.py +937 -0
  1953. cuda/coop/_typing.py +107 -0
  1954. cuda/coop/block/__init__.py +39 -0
  1955. cuda/coop/block/_block_exchange.py +251 -0
  1956. cuda/coop/block/_block_load_store.py +215 -0
  1957. cuda/coop/block/_block_merge_sort.py +125 -0
  1958. cuda/coop/block/_block_radix_sort.py +214 -0
  1959. cuda/coop/block/_block_reduce.py +294 -0
  1960. cuda/coop/block/_block_scan.py +983 -0
  1961. cuda/coop/warp/__init__.py +9 -0
  1962. cuda/coop/warp/_warp_merge_sort.py +92 -0
  1963. cuda/coop/warp/_warp_reduce.py +153 -0
  1964. cuda/coop/warp/_warp_scan.py +78 -0
  1965. cuda_cccl-0.3.3.dist-info/METADATA +41 -0
  1966. cuda_cccl-0.3.3.dist-info/RECORD +1968 -0
  1967. cuda_cccl-0.3.3.dist-info/WHEEL +5 -0
  1968. cuda_cccl-0.3.3.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,2202 @@
1
+ // This file was automatically generated. Do not edit.
2
+
3
+ #ifndef _CUDA_PTX_GENERATED_MULTIMEM_LD_REDUCE_H_
4
+ #define _CUDA_PTX_GENERATED_MULTIMEM_LD_REDUCE_H_
5
+
6
+ /*
7
+ // multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90
8
+ // .sem = { .weak }
9
+ // .op = { .min }
10
+ template <typename = void>
11
+ __device__ static inline uint32_t multimem_ld_reduce(
12
+ cuda::ptx::sem_weak_t,
13
+ cuda::ptx::op_min_t,
14
+ const uint32_t* addr);
+ //
+ // Notes on the implementation that follows:
+ // - It emits the single PTX instruction `multimem.ld_reduce.weak.global.min.u32` through
+ //   inline asm and returns the value written to the 32-bit ("=r") destination operand.
+ // - The two tag parameters are unnamed; they only select this overload by type.
+ // - `addr` is forwarded through `__as_ptr_gmem` (defined outside this chunk; presumably it
+ //   yields a global-memory address -- TODO confirm) and bound as an "l" input operand.
+ // - The overload is compiled only when `__cccl_ptx_isa >= 810`. The asm path is used when
+ //   `_CCCL_CUDA_COMPILER(NVHPC)` is true or `__CUDA_ARCH__ >= 900`; otherwise the body calls
+ //   an extern "C" function that is only declared here, then returns 0.
+ // - What the `.min` reduction means across memory locations is defined by the PTX ISA,
+ //   not by this wrapper.
15
+ */
16
+ #if __cccl_ptx_isa >= 810
17
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; no definition is visible in this chunk
18
+ template <typename = void>
19
+ _CCCL_DEVICE static inline ::cuda::std::uint32_t
20
+ multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_min_t, const ::cuda::std::uint32_t* __addr)
21
+ {
22
+ // __sem == sem_weak (due to parameter type constraint)
23
+ // __op == op_min (due to parameter type constraint)
24
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
25
+ ::cuda::std::uint32_t __dest; // written by the asm statement below before it is returned
26
+ asm("multimem.ld_reduce.weak.global.min.u32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); // "memory" clobber: the compiler must not cache memory values across this statement
27
+ return __dest;
28
+ # else
29
+ // Unsupported architectures will have a linker error with a semi-decent error message
30
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
31
+ return 0; // placeholder value so this branch still returns the declared type
32
+ # endif
33
+ }
34
+ #endif // __cccl_ptx_isa >= 810
35
+
36
+ /*
37
+ // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90
38
+ // .sem = { .relaxed, .acquire }
39
+ // .scope = { .cta, .cluster, .gpu, .sys }
40
+ // .op = { .min }
41
+ template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
42
+ __device__ static inline uint32_t multimem_ld_reduce(
43
+ cuda::ptx::sem_t<Sem> sem,
44
+ cuda::ptx::scope_t<Scope> scope,
45
+ cuda::ptx::op_min_t,
46
+ const uint32_t* addr);
+ //
+ // Notes on the implementation that follows:
+ // - `sem` and `scope` are tag objects; two static_asserts restrict them to
+ //   {relaxed, acquire} and {cta, cluster, gpu, sys} respectively.
+ // - An `if constexpr` chain has one branch per (sem, scope) pair, eight in total. Each
+ //   branch emits `multimem.ld_reduce.<sem>.<scope>.global.min.u32` with the qualifiers
+ //   spelled out in the asm string literal.
+ // - The value written to the 32-bit ("=r") destination operand is returned.
+ // - `addr` is forwarded through `__as_ptr_gmem` (defined outside this chunk; presumably it
+ //   yields a global-memory address -- TODO confirm) and bound as an "l" input operand.
+ // - The overload is compiled only when `__cccl_ptx_isa >= 810`. The asm path is used when
+ //   `_CCCL_CUDA_COMPILER(NVHPC)` is true or `__CUDA_ARCH__ >= 900`; otherwise the body calls
+ //   an extern "C" function that is only declared here, then returns 0.
+ // - The memory-ordering meaning of each sem/scope qualifier is defined by the PTX ISA,
+ //   not by this wrapper.
47
+ */
48
+ #if __cccl_ptx_isa >= 810
49
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; no definition is visible in this chunk
50
+ template <::cuda::ptx::dot_sem _Sem, ::cuda::ptx::dot_scope _Scope>
51
+ _CCCL_DEVICE static inline ::cuda::std::uint32_t multimem_ld_reduce(
52
+ ::cuda::ptx::sem_t<_Sem> __sem,
53
+ ::cuda::ptx::scope_t<_Scope> __scope,
54
+ ::cuda::ptx::op_min_t,
55
+ const ::cuda::std::uint32_t* __addr)
56
+ {
57
+ static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); // reject any other .sem at compile time
58
+ static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); // reject any other .scope at compile time
59
+ // __op == op_min (due to parameter type constraint)
60
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
61
+ ::cuda::std::uint32_t __dest; // written by whichever branch below is selected
62
+ if constexpr (__sem == sem_relaxed && __scope == scope_cta) // first of eight (sem, scope) branches; exactly one is compiled
63
+ {
64
+ asm("multimem.ld_reduce.relaxed.cta.global.min.u32 %0, [%1];"
65
+ : "=r"(__dest)
66
+ : "l"(__as_ptr_gmem(__addr))
67
+ : "memory");
68
+ }
69
+ else if constexpr (__sem == sem_relaxed && __scope == scope_cluster)
70
+ {
71
+ asm("multimem.ld_reduce.relaxed.cluster.global.min.u32 %0, [%1];"
72
+ : "=r"(__dest)
73
+ : "l"(__as_ptr_gmem(__addr))
74
+ : "memory");
75
+ }
76
+ else if constexpr (__sem == sem_relaxed && __scope == scope_gpu)
77
+ {
78
+ asm("multimem.ld_reduce.relaxed.gpu.global.min.u32 %0, [%1];"
79
+ : "=r"(__dest)
80
+ : "l"(__as_ptr_gmem(__addr))
81
+ : "memory");
82
+ }
83
+ else if constexpr (__sem == sem_relaxed && __scope == scope_sys)
84
+ {
85
+ asm("multimem.ld_reduce.relaxed.sys.global.min.u32 %0, [%1];"
86
+ : "=r"(__dest)
87
+ : "l"(__as_ptr_gmem(__addr))
88
+ : "memory");
89
+ }
90
+ else if constexpr (__sem == sem_acquire && __scope == scope_cta)
91
+ {
92
+ asm("multimem.ld_reduce.acquire.cta.global.min.u32 %0, [%1];"
93
+ : "=r"(__dest)
94
+ : "l"(__as_ptr_gmem(__addr))
95
+ : "memory");
96
+ }
97
+ else if constexpr (__sem == sem_acquire && __scope == scope_cluster)
98
+ {
99
+ asm("multimem.ld_reduce.acquire.cluster.global.min.u32 %0, [%1];"
100
+ : "=r"(__dest)
101
+ : "l"(__as_ptr_gmem(__addr))
102
+ : "memory");
103
+ }
104
+ else if constexpr (__sem == sem_acquire && __scope == scope_gpu)
105
+ {
106
+ asm("multimem.ld_reduce.acquire.gpu.global.min.u32 %0, [%1];"
107
+ : "=r"(__dest)
108
+ : "l"(__as_ptr_gmem(__addr))
109
+ : "memory");
110
+ }
111
+ else if constexpr (__sem == sem_acquire && __scope == scope_sys)
112
+ {
113
+ asm("multimem.ld_reduce.acquire.sys.global.min.u32 %0, [%1];"
114
+ : "=r"(__dest)
115
+ : "l"(__as_ptr_gmem(__addr))
116
+ : "memory");
117
+ }
118
+ return __dest; // the static_asserts above leave only the eight combinations handled by the chain
119
+ # else
120
+ // Unsupported architectures will have a linker error with a semi-decent error message
121
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
122
+ return 0; // placeholder value so this branch still returns the declared type
123
+ # endif
124
+ }
125
+ #endif // __cccl_ptx_isa >= 810
126
+
127
+ /*
128
+ // multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90
129
+ // .sem = { .weak }
130
+ // .op = { .min }
131
+ template <typename = void>
132
+ __device__ static inline uint64_t multimem_ld_reduce(
133
+ cuda::ptx::sem_weak_t,
134
+ cuda::ptx::op_min_t,
135
+ const uint64_t* addr);
+ //
+ // Notes on the implementation that follows:
+ // - It emits the single PTX instruction `multimem.ld_reduce.weak.global.min.u64` through
+ //   inline asm and returns the value written to the 64-bit ("=l") destination operand.
+ // - The two tag parameters are unnamed; they only select this overload by type.
+ // - `addr` is forwarded through `__as_ptr_gmem` (defined outside this chunk; presumably it
+ //   yields a global-memory address -- TODO confirm) and bound as an "l" input operand.
+ // - The overload is compiled only when `__cccl_ptx_isa >= 810`. The asm path is used when
+ //   `_CCCL_CUDA_COMPILER(NVHPC)` is true or `__CUDA_ARCH__ >= 900`; otherwise the body calls
+ //   an extern "C" function that is only declared here, then returns 0.
+ // - What the `.min` reduction means across memory locations is defined by the PTX ISA,
+ //   not by this wrapper.
136
+ */
137
+ #if __cccl_ptx_isa >= 810
138
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; no definition is visible in this chunk
139
+ template <typename = void>
140
+ _CCCL_DEVICE static inline ::cuda::std::uint64_t
141
+ multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_min_t, const ::cuda::std::uint64_t* __addr)
142
+ {
143
+ // __sem == sem_weak (due to parameter type constraint)
144
+ // __op == op_min (due to parameter type constraint)
145
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
146
+ ::cuda::std::uint64_t __dest; // written by the asm statement below before it is returned
147
+ asm("multimem.ld_reduce.weak.global.min.u64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); // "memory" clobber: the compiler must not cache memory values across this statement
148
+ return __dest;
149
+ # else
150
+ // Unsupported architectures will have a linker error with a semi-decent error message
151
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
152
+ return 0; // placeholder value so this branch still returns the declared type
153
+ # endif
154
+ }
155
+ #endif // __cccl_ptx_isa >= 810
156
+
157
+ /*
158
+ // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90
159
+ // .sem = { .relaxed, .acquire }
160
+ // .scope = { .cta, .cluster, .gpu, .sys }
161
+ // .op = { .min }
162
+ template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
163
+ __device__ static inline uint64_t multimem_ld_reduce(
164
+ cuda::ptx::sem_t<Sem> sem,
165
+ cuda::ptx::scope_t<Scope> scope,
166
+ cuda::ptx::op_min_t,
167
+ const uint64_t* addr);
+ //
+ // Notes on the implementation that follows:
+ // - `sem` and `scope` are tag objects; two static_asserts restrict them to
+ //   {relaxed, acquire} and {cta, cluster, gpu, sys} respectively.
+ // - An `if constexpr` chain has one branch per (sem, scope) pair, eight in total. Each
+ //   branch emits `multimem.ld_reduce.<sem>.<scope>.global.min.u64` with the qualifiers
+ //   spelled out in the asm string literal.
+ // - The value written to the 64-bit ("=l") destination operand is returned.
+ // - `addr` is forwarded through `__as_ptr_gmem` (defined outside this chunk; presumably it
+ //   yields a global-memory address -- TODO confirm) and bound as an "l" input operand.
+ // - The overload is compiled only when `__cccl_ptx_isa >= 810`. The asm path is used when
+ //   `_CCCL_CUDA_COMPILER(NVHPC)` is true or `__CUDA_ARCH__ >= 900`; otherwise the body calls
+ //   an extern "C" function that is only declared here, then returns 0.
+ // - The memory-ordering meaning of each sem/scope qualifier is defined by the PTX ISA,
+ //   not by this wrapper.
168
+ */
169
+ #if __cccl_ptx_isa >= 810
170
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; no definition is visible in this chunk
171
+ template <::cuda::ptx::dot_sem _Sem, ::cuda::ptx::dot_scope _Scope>
172
+ _CCCL_DEVICE static inline ::cuda::std::uint64_t multimem_ld_reduce(
173
+ ::cuda::ptx::sem_t<_Sem> __sem,
174
+ ::cuda::ptx::scope_t<_Scope> __scope,
175
+ ::cuda::ptx::op_min_t,
176
+ const ::cuda::std::uint64_t* __addr)
177
+ {
178
+ static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); // reject any other .sem at compile time
179
+ static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); // reject any other .scope at compile time
180
+ // __op == op_min (due to parameter type constraint)
181
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
182
+ ::cuda::std::uint64_t __dest; // written by whichever branch below is selected
183
+ if constexpr (__sem == sem_relaxed && __scope == scope_cta) // first of eight (sem, scope) branches; exactly one is compiled
184
+ {
185
+ asm("multimem.ld_reduce.relaxed.cta.global.min.u64 %0, [%1];"
186
+ : "=l"(__dest)
187
+ : "l"(__as_ptr_gmem(__addr))
188
+ : "memory");
189
+ }
190
+ else if constexpr (__sem == sem_relaxed && __scope == scope_cluster)
191
+ {
192
+ asm("multimem.ld_reduce.relaxed.cluster.global.min.u64 %0, [%1];"
193
+ : "=l"(__dest)
194
+ : "l"(__as_ptr_gmem(__addr))
195
+ : "memory");
196
+ }
197
+ else if constexpr (__sem == sem_relaxed && __scope == scope_gpu)
198
+ {
199
+ asm("multimem.ld_reduce.relaxed.gpu.global.min.u64 %0, [%1];"
200
+ : "=l"(__dest)
201
+ : "l"(__as_ptr_gmem(__addr))
202
+ : "memory");
203
+ }
204
+ else if constexpr (__sem == sem_relaxed && __scope == scope_sys)
205
+ {
206
+ asm("multimem.ld_reduce.relaxed.sys.global.min.u64 %0, [%1];"
207
+ : "=l"(__dest)
208
+ : "l"(__as_ptr_gmem(__addr))
209
+ : "memory");
210
+ }
211
+ else if constexpr (__sem == sem_acquire && __scope == scope_cta)
212
+ {
213
+ asm("multimem.ld_reduce.acquire.cta.global.min.u64 %0, [%1];"
214
+ : "=l"(__dest)
215
+ : "l"(__as_ptr_gmem(__addr))
216
+ : "memory");
217
+ }
218
+ else if constexpr (__sem == sem_acquire && __scope == scope_cluster)
219
+ {
220
+ asm("multimem.ld_reduce.acquire.cluster.global.min.u64 %0, [%1];"
221
+ : "=l"(__dest)
222
+ : "l"(__as_ptr_gmem(__addr))
223
+ : "memory");
224
+ }
225
+ else if constexpr (__sem == sem_acquire && __scope == scope_gpu)
226
+ {
227
+ asm("multimem.ld_reduce.acquire.gpu.global.min.u64 %0, [%1];"
228
+ : "=l"(__dest)
229
+ : "l"(__as_ptr_gmem(__addr))
230
+ : "memory");
231
+ }
232
+ else if constexpr (__sem == sem_acquire && __scope == scope_sys)
233
+ {
234
+ asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];"
235
+ : "=l"(__dest)
236
+ : "l"(__as_ptr_gmem(__addr))
237
+ : "memory");
238
+ }
239
+ return __dest; // the static_asserts above leave only the eight combinations handled by the chain
240
+ # else
241
+ // Unsupported architectures will have a linker error with a semi-decent error message
242
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
243
+ return 0; // placeholder value so this branch still returns the declared type
244
+ # endif
245
+ }
246
+ #endif // __cccl_ptx_isa >= 810
247
+
248
+ /*
249
+ // multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90
250
+ // .sem = { .weak }
251
+ // .op = { .min }
252
+ template <typename = void>
253
+ __device__ static inline int32_t multimem_ld_reduce(
254
+ cuda::ptx::sem_weak_t,
255
+ cuda::ptx::op_min_t,
256
+ const int32_t* addr);
+ //
+ // Notes on the implementation that follows:
+ // - It emits the single PTX instruction `multimem.ld_reduce.weak.global.min.s32` through
+ //   inline asm and returns the value written to the 32-bit ("=r") destination operand as
+ //   a signed `int32_t`.
+ // - The two tag parameters are unnamed; they only select this overload by type.
+ // - `addr` is forwarded through `__as_ptr_gmem` (defined outside this chunk; presumably it
+ //   yields a global-memory address -- TODO confirm) and bound as an "l" input operand.
+ // - The overload is compiled only when `__cccl_ptx_isa >= 810`. The asm path is used when
+ //   `_CCCL_CUDA_COMPILER(NVHPC)` is true or `__CUDA_ARCH__ >= 900`; otherwise the body calls
+ //   an extern "C" function that is only declared here, then returns 0.
+ // - What the `.min` reduction means across memory locations is defined by the PTX ISA,
+ //   not by this wrapper.
257
+ */
258
+ #if __cccl_ptx_isa >= 810
259
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; no definition is visible in this chunk
260
+ template <typename = void>
261
+ _CCCL_DEVICE static inline ::cuda::std::int32_t
262
+ multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_min_t, const ::cuda::std::int32_t* __addr)
263
+ {
264
+ // __sem == sem_weak (due to parameter type constraint)
265
+ // __op == op_min (due to parameter type constraint)
266
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
267
+ ::cuda::std::int32_t __dest; // written by the asm statement below before it is returned
268
+ asm("multimem.ld_reduce.weak.global.min.s32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); // "memory" clobber: the compiler must not cache memory values across this statement
269
+ return __dest;
270
+ # else
271
+ // Unsupported architectures will have a linker error with a semi-decent error message
272
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
273
+ return 0; // placeholder value so this branch still returns the declared type
274
+ # endif
275
+ }
276
+ #endif // __cccl_ptx_isa >= 810
277
+
278
+ /*
279
+ // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90
280
+ // .sem = { .relaxed, .acquire }
281
+ // .scope = { .cta, .cluster, .gpu, .sys }
282
+ // .op = { .min }
283
+ template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
284
+ __device__ static inline int32_t multimem_ld_reduce(
285
+ cuda::ptx::sem_t<Sem> sem,
286
+ cuda::ptx::scope_t<Scope> scope,
287
+ cuda::ptx::op_min_t,
288
+ const int32_t* addr);
289
+ */
290
+ #if __cccl_ptx_isa >= 810
291
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; called on the fallback path below
292
+ template <::cuda::ptx::dot_sem _Sem, ::cuda::ptx::dot_scope _Scope> // Emits multimem.ld_reduce.<sem>.<scope>.global.min.s32 for the given tags and returns the value it yields.
293
+ _CCCL_DEVICE static inline ::cuda::std::int32_t multimem_ld_reduce(
294
+ ::cuda::ptx::sem_t<_Sem> __sem,
295
+ ::cuda::ptx::scope_t<_Scope> __scope,
296
+ ::cuda::ptx::op_min_t,
297
+ const ::cuda::std::int32_t* __addr)
298
+ {
299
+ static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); // only these two semantics have a branch below
300
+ static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); // only these four scopes have a branch below
301
+ // The op_min_t tag type fixes the reduction operator to .min, so no check on the operator is needed.
302
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
303
+ ::cuda::std::int32_t __dest; // written by whichever branch below matches; the static_asserts above reject every combination without a branch
304
+ if constexpr (__sem == sem_relaxed && __scope == scope_cta)
305
+ {
306
+ asm("multimem.ld_reduce.relaxed.cta.global.min.s32 %0, [%1];"
307
+ : "=r"(__dest)
308
+ : "l"(__as_ptr_gmem(__addr))
309
+ : "memory");
310
+ }
311
+ else if constexpr (__sem == sem_relaxed && __scope == scope_cluster)
312
+ {
313
+ asm("multimem.ld_reduce.relaxed.cluster.global.min.s32 %0, [%1];"
314
+ : "=r"(__dest)
315
+ : "l"(__as_ptr_gmem(__addr))
316
+ : "memory");
317
+ }
318
+ else if constexpr (__sem == sem_relaxed && __scope == scope_gpu)
319
+ {
320
+ asm("multimem.ld_reduce.relaxed.gpu.global.min.s32 %0, [%1];"
321
+ : "=r"(__dest)
322
+ : "l"(__as_ptr_gmem(__addr))
323
+ : "memory");
324
+ }
325
+ else if constexpr (__sem == sem_relaxed && __scope == scope_sys)
326
+ {
327
+ asm("multimem.ld_reduce.relaxed.sys.global.min.s32 %0, [%1];"
328
+ : "=r"(__dest)
329
+ : "l"(__as_ptr_gmem(__addr))
330
+ : "memory");
331
+ }
332
+ else if constexpr (__sem == sem_acquire && __scope == scope_cta)
333
+ {
334
+ asm("multimem.ld_reduce.acquire.cta.global.min.s32 %0, [%1];"
335
+ : "=r"(__dest)
336
+ : "l"(__as_ptr_gmem(__addr))
337
+ : "memory");
338
+ }
339
+ else if constexpr (__sem == sem_acquire && __scope == scope_cluster)
340
+ {
341
+ asm("multimem.ld_reduce.acquire.cluster.global.min.s32 %0, [%1];"
342
+ : "=r"(__dest)
343
+ : "l"(__as_ptr_gmem(__addr))
344
+ : "memory");
345
+ }
346
+ else if constexpr (__sem == sem_acquire && __scope == scope_gpu)
347
+ {
348
+ asm("multimem.ld_reduce.acquire.gpu.global.min.s32 %0, [%1];"
349
+ : "=r"(__dest)
350
+ : "l"(__as_ptr_gmem(__addr))
351
+ : "memory");
352
+ }
353
+ else if constexpr (__sem == sem_acquire && __scope == scope_sys)
354
+ {
355
+ asm("multimem.ld_reduce.acquire.sys.global.min.s32 %0, [%1];"
356
+ : "=r"(__dest)
357
+ : "l"(__as_ptr_gmem(__addr))
358
+ : "memory");
359
+ }
360
+ return __dest;
361
+ # else
362
+ // Fallback for other targets: call the function declared above. Per the original note, this is meant to surface as a linker error whose symbol name states the SM_90 requirement.
363
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
364
+ return 0; // supplies a return value on the fallback path
365
+ # endif // _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
366
+ }
367
+ #endif // __cccl_ptx_isa >= 810
368
+
369
+ /*
370
+ // multimem.ld_reduce.sem.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90
371
+ // .sem = { .weak }
372
+ // .op = { .min }
373
+ template <typename = void>
374
+ __device__ static inline int64_t multimem_ld_reduce(
375
+ cuda::ptx::sem_weak_t,
376
+ cuda::ptx::op_min_t,
377
+ const int64_t* addr);
378
+ */
379
+ #if __cccl_ptx_isa >= 810
380
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; called on the fallback path below
381
+ template <typename = void> // Wraps the PTX instruction multimem.ld_reduce.weak.global.min.s64 and returns the value it yields.
382
+ _CCCL_DEVICE static inline ::cuda::std::int64_t
383
+ multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_min_t, const ::cuda::std::int64_t* __addr)
384
+ {
385
+ // The sem_weak_t tag type fixes the semantics to .weak, so no check on the semantics is needed.
386
+ // The op_min_t tag type fixes the reduction operator to .min, so no check on the operator is needed.
387
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
388
+ ::cuda::std::int64_t __dest; // written by the asm statement below (its "=l" output operand)
389
+ asm("multimem.ld_reduce.weak.global.min.s64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory");
390
+ return __dest;
391
+ # else
392
+ // Fallback for other targets: call the function declared above. Per the original note, this is meant to surface as a linker error whose symbol name states the SM_90 requirement.
393
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
394
+ return 0; // supplies a return value on the fallback path
395
+ # endif // _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
396
+ }
397
+ #endif // __cccl_ptx_isa >= 810
398
+
399
+ /*
400
+ // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90
401
+ // .sem = { .relaxed, .acquire }
402
+ // .scope = { .cta, .cluster, .gpu, .sys }
403
+ // .op = { .min }
404
+ template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
405
+ __device__ static inline int64_t multimem_ld_reduce(
406
+ cuda::ptx::sem_t<Sem> sem,
407
+ cuda::ptx::scope_t<Scope> scope,
408
+ cuda::ptx::op_min_t,
409
+ const int64_t* addr);
410
+ */
411
+ #if __cccl_ptx_isa >= 810
412
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; called on the fallback path below
413
+ template <::cuda::ptx::dot_sem _Sem, ::cuda::ptx::dot_scope _Scope> // Emits multimem.ld_reduce.<sem>.<scope>.global.min.s64 for the given tags and returns the value it yields.
414
+ _CCCL_DEVICE static inline ::cuda::std::int64_t multimem_ld_reduce(
415
+ ::cuda::ptx::sem_t<_Sem> __sem,
416
+ ::cuda::ptx::scope_t<_Scope> __scope,
417
+ ::cuda::ptx::op_min_t,
418
+ const ::cuda::std::int64_t* __addr)
419
+ {
420
+ static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); // only these two semantics have a branch below
421
+ static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); // only these four scopes have a branch below
422
+ // The op_min_t tag type fixes the reduction operator to .min, so no check on the operator is needed.
423
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
424
+ ::cuda::std::int64_t __dest; // written by whichever branch below matches; the static_asserts above reject every combination without a branch
425
+ if constexpr (__sem == sem_relaxed && __scope == scope_cta)
426
+ {
427
+ asm("multimem.ld_reduce.relaxed.cta.global.min.s64 %0, [%1];"
428
+ : "=l"(__dest)
429
+ : "l"(__as_ptr_gmem(__addr))
430
+ : "memory");
431
+ }
432
+ else if constexpr (__sem == sem_relaxed && __scope == scope_cluster)
433
+ {
434
+ asm("multimem.ld_reduce.relaxed.cluster.global.min.s64 %0, [%1];"
435
+ : "=l"(__dest)
436
+ : "l"(__as_ptr_gmem(__addr))
437
+ : "memory");
438
+ }
439
+ else if constexpr (__sem == sem_relaxed && __scope == scope_gpu)
440
+ {
441
+ asm("multimem.ld_reduce.relaxed.gpu.global.min.s64 %0, [%1];"
442
+ : "=l"(__dest)
443
+ : "l"(__as_ptr_gmem(__addr))
444
+ : "memory");
445
+ }
446
+ else if constexpr (__sem == sem_relaxed && __scope == scope_sys)
447
+ {
448
+ asm("multimem.ld_reduce.relaxed.sys.global.min.s64 %0, [%1];"
449
+ : "=l"(__dest)
450
+ : "l"(__as_ptr_gmem(__addr))
451
+ : "memory");
452
+ }
453
+ else if constexpr (__sem == sem_acquire && __scope == scope_cta)
454
+ {
455
+ asm("multimem.ld_reduce.acquire.cta.global.min.s64 %0, [%1];"
456
+ : "=l"(__dest)
457
+ : "l"(__as_ptr_gmem(__addr))
458
+ : "memory");
459
+ }
460
+ else if constexpr (__sem == sem_acquire && __scope == scope_cluster)
461
+ {
462
+ asm("multimem.ld_reduce.acquire.cluster.global.min.s64 %0, [%1];"
463
+ : "=l"(__dest)
464
+ : "l"(__as_ptr_gmem(__addr))
465
+ : "memory");
466
+ }
467
+ else if constexpr (__sem == sem_acquire && __scope == scope_gpu)
468
+ {
469
+ asm("multimem.ld_reduce.acquire.gpu.global.min.s64 %0, [%1];"
470
+ : "=l"(__dest)
471
+ : "l"(__as_ptr_gmem(__addr))
472
+ : "memory");
473
+ }
474
+ else if constexpr (__sem == sem_acquire && __scope == scope_sys)
475
+ {
476
+ asm("multimem.ld_reduce.acquire.sys.global.min.s64 %0, [%1];"
477
+ : "=l"(__dest)
478
+ : "l"(__as_ptr_gmem(__addr))
479
+ : "memory");
480
+ }
481
+ return __dest;
482
+ # else
483
+ // Fallback for other targets: call the function declared above. Per the original note, this is meant to surface as a linker error whose symbol name states the SM_90 requirement.
484
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
485
+ return 0; // supplies a return value on the fallback path
486
+ # endif // _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
487
+ }
488
+ #endif // __cccl_ptx_isa >= 810
489
+
490
+ /*
491
+ // multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90
492
+ // .sem = { .weak }
493
+ // .op = { .max }
494
+ template <typename = void>
495
+ __device__ static inline uint32_t multimem_ld_reduce(
496
+ cuda::ptx::sem_weak_t,
497
+ cuda::ptx::op_max_t,
498
+ const uint32_t* addr);
499
+ */
500
+ #if __cccl_ptx_isa >= 810
501
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; called on the fallback path below
502
+ template <typename = void> // Wraps the PTX instruction multimem.ld_reduce.weak.global.max.u32 and returns the value it yields.
503
+ _CCCL_DEVICE static inline ::cuda::std::uint32_t
504
+ multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_max_t, const ::cuda::std::uint32_t* __addr)
505
+ {
506
+ // The sem_weak_t tag type fixes the semantics to .weak, so no check on the semantics is needed.
507
+ // The op_max_t tag type fixes the reduction operator to .max, so no check on the operator is needed.
508
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
509
+ ::cuda::std::uint32_t __dest; // written by the asm statement below (its "=r" output operand)
510
+ asm("multimem.ld_reduce.weak.global.max.u32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory");
511
+ return __dest;
512
+ # else
513
+ // Fallback for other targets: call the function declared above. Per the original note, this is meant to surface as a linker error whose symbol name states the SM_90 requirement.
514
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
515
+ return 0; // supplies a return value on the fallback path
516
+ # endif // _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
517
+ }
518
+ #endif // __cccl_ptx_isa >= 810
519
+
520
+ /*
521
+ // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90
522
+ // .sem = { .relaxed, .acquire }
523
+ // .scope = { .cta, .cluster, .gpu, .sys }
524
+ // .op = { .max }
525
+ template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
526
+ __device__ static inline uint32_t multimem_ld_reduce(
527
+ cuda::ptx::sem_t<Sem> sem,
528
+ cuda::ptx::scope_t<Scope> scope,
529
+ cuda::ptx::op_max_t,
530
+ const uint32_t* addr);
531
+ */
532
+ #if __cccl_ptx_isa >= 810
533
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; called on the fallback path below
534
+ template <::cuda::ptx::dot_sem _Sem, ::cuda::ptx::dot_scope _Scope> // Emits multimem.ld_reduce.<sem>.<scope>.global.max.u32 for the given tags and returns the value it yields.
535
+ _CCCL_DEVICE static inline ::cuda::std::uint32_t multimem_ld_reduce(
536
+ ::cuda::ptx::sem_t<_Sem> __sem,
537
+ ::cuda::ptx::scope_t<_Scope> __scope,
538
+ ::cuda::ptx::op_max_t,
539
+ const ::cuda::std::uint32_t* __addr)
540
+ {
541
+ static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); // only these two semantics have a branch below
542
+ static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); // only these four scopes have a branch below
543
+ // The op_max_t tag type fixes the reduction operator to .max, so no check on the operator is needed.
544
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
545
+ ::cuda::std::uint32_t __dest; // written by whichever branch below matches; the static_asserts above reject every combination without a branch
546
+ if constexpr (__sem == sem_relaxed && __scope == scope_cta)
547
+ {
548
+ asm("multimem.ld_reduce.relaxed.cta.global.max.u32 %0, [%1];"
549
+ : "=r"(__dest)
550
+ : "l"(__as_ptr_gmem(__addr))
551
+ : "memory");
552
+ }
553
+ else if constexpr (__sem == sem_relaxed && __scope == scope_cluster)
554
+ {
555
+ asm("multimem.ld_reduce.relaxed.cluster.global.max.u32 %0, [%1];"
556
+ : "=r"(__dest)
557
+ : "l"(__as_ptr_gmem(__addr))
558
+ : "memory");
559
+ }
560
+ else if constexpr (__sem == sem_relaxed && __scope == scope_gpu)
561
+ {
562
+ asm("multimem.ld_reduce.relaxed.gpu.global.max.u32 %0, [%1];"
563
+ : "=r"(__dest)
564
+ : "l"(__as_ptr_gmem(__addr))
565
+ : "memory");
566
+ }
567
+ else if constexpr (__sem == sem_relaxed && __scope == scope_sys)
568
+ {
569
+ asm("multimem.ld_reduce.relaxed.sys.global.max.u32 %0, [%1];"
570
+ : "=r"(__dest)
571
+ : "l"(__as_ptr_gmem(__addr))
572
+ : "memory");
573
+ }
574
+ else if constexpr (__sem == sem_acquire && __scope == scope_cta)
575
+ {
576
+ asm("multimem.ld_reduce.acquire.cta.global.max.u32 %0, [%1];"
577
+ : "=r"(__dest)
578
+ : "l"(__as_ptr_gmem(__addr))
579
+ : "memory");
580
+ }
581
+ else if constexpr (__sem == sem_acquire && __scope == scope_cluster)
582
+ {
583
+ asm("multimem.ld_reduce.acquire.cluster.global.max.u32 %0, [%1];"
584
+ : "=r"(__dest)
585
+ : "l"(__as_ptr_gmem(__addr))
586
+ : "memory");
587
+ }
588
+ else if constexpr (__sem == sem_acquire && __scope == scope_gpu)
589
+ {
590
+ asm("multimem.ld_reduce.acquire.gpu.global.max.u32 %0, [%1];"
591
+ : "=r"(__dest)
592
+ : "l"(__as_ptr_gmem(__addr))
593
+ : "memory");
594
+ }
595
+ else if constexpr (__sem == sem_acquire && __scope == scope_sys)
596
+ {
597
+ asm("multimem.ld_reduce.acquire.sys.global.max.u32 %0, [%1];"
598
+ : "=r"(__dest)
599
+ : "l"(__as_ptr_gmem(__addr))
600
+ : "memory");
601
+ }
602
+ return __dest;
603
+ # else
604
+ // Fallback for other targets: call the function declared above. Per the original note, this is meant to surface as a linker error whose symbol name states the SM_90 requirement.
605
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
606
+ return 0; // supplies a return value on the fallback path
607
+ # endif // _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
608
+ }
609
+ #endif // __cccl_ptx_isa >= 810
610
+
611
+ /*
612
+ // multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90
613
+ // .sem = { .weak }
614
+ // .op = { .max }
615
+ template <typename = void>
616
+ __device__ static inline uint64_t multimem_ld_reduce(
617
+ cuda::ptx::sem_weak_t,
618
+ cuda::ptx::op_max_t,
619
+ const uint64_t* addr);
620
+ */
621
+ #if __cccl_ptx_isa >= 810
622
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; called on the fallback path below
623
+ template <typename = void> // Wraps the PTX instruction multimem.ld_reduce.weak.global.max.u64 and returns the value it yields.
624
+ _CCCL_DEVICE static inline ::cuda::std::uint64_t
625
+ multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_max_t, const ::cuda::std::uint64_t* __addr)
626
+ {
627
+ // The sem_weak_t tag type fixes the semantics to .weak, so no check on the semantics is needed.
628
+ // The op_max_t tag type fixes the reduction operator to .max, so no check on the operator is needed.
629
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
630
+ ::cuda::std::uint64_t __dest; // written by the asm statement below (its "=l" output operand)
631
+ asm("multimem.ld_reduce.weak.global.max.u64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory");
632
+ return __dest;
633
+ # else
634
+ // Fallback for other targets: call the function declared above. Per the original note, this is meant to surface as a linker error whose symbol name states the SM_90 requirement.
635
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
636
+ return 0; // supplies a return value on the fallback path
637
+ # endif // _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
638
+ }
639
+ #endif // __cccl_ptx_isa >= 810
640
+
641
+ /*
642
+ // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90
643
+ // .sem = { .relaxed, .acquire }
644
+ // .scope = { .cta, .cluster, .gpu, .sys }
645
+ // .op = { .max }
646
+ template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
647
+ __device__ static inline uint64_t multimem_ld_reduce(
648
+ cuda::ptx::sem_t<Sem> sem,
649
+ cuda::ptx::scope_t<Scope> scope,
650
+ cuda::ptx::op_max_t,
651
+ const uint64_t* addr);
652
+ */
653
+ #if __cccl_ptx_isa >= 810
654
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; called on the fallback path below
655
+ template <::cuda::ptx::dot_sem _Sem, ::cuda::ptx::dot_scope _Scope> // Emits multimem.ld_reduce.<sem>.<scope>.global.max.u64 for the given tags and returns the value it yields.
656
+ _CCCL_DEVICE static inline ::cuda::std::uint64_t multimem_ld_reduce(
657
+ ::cuda::ptx::sem_t<_Sem> __sem,
658
+ ::cuda::ptx::scope_t<_Scope> __scope,
659
+ ::cuda::ptx::op_max_t,
660
+ const ::cuda::std::uint64_t* __addr)
661
+ {
662
+ static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); // only these two semantics have a branch below
663
+ static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); // only these four scopes have a branch below
664
+ // The op_max_t tag type fixes the reduction operator to .max, so no check on the operator is needed.
665
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
666
+ ::cuda::std::uint64_t __dest; // written by whichever branch below matches; the static_asserts above reject every combination without a branch
667
+ if constexpr (__sem == sem_relaxed && __scope == scope_cta)
668
+ {
669
+ asm("multimem.ld_reduce.relaxed.cta.global.max.u64 %0, [%1];"
670
+ : "=l"(__dest)
671
+ : "l"(__as_ptr_gmem(__addr))
672
+ : "memory");
673
+ }
674
+ else if constexpr (__sem == sem_relaxed && __scope == scope_cluster)
675
+ {
676
+ asm("multimem.ld_reduce.relaxed.cluster.global.max.u64 %0, [%1];"
677
+ : "=l"(__dest)
678
+ : "l"(__as_ptr_gmem(__addr))
679
+ : "memory");
680
+ }
681
+ else if constexpr (__sem == sem_relaxed && __scope == scope_gpu)
682
+ {
683
+ asm("multimem.ld_reduce.relaxed.gpu.global.max.u64 %0, [%1];"
684
+ : "=l"(__dest)
685
+ : "l"(__as_ptr_gmem(__addr))
686
+ : "memory");
687
+ }
688
+ else if constexpr (__sem == sem_relaxed && __scope == scope_sys)
689
+ {
690
+ asm("multimem.ld_reduce.relaxed.sys.global.max.u64 %0, [%1];"
691
+ : "=l"(__dest)
692
+ : "l"(__as_ptr_gmem(__addr))
693
+ : "memory");
694
+ }
695
+ else if constexpr (__sem == sem_acquire && __scope == scope_cta)
696
+ {
697
+ asm("multimem.ld_reduce.acquire.cta.global.max.u64 %0, [%1];"
698
+ : "=l"(__dest)
699
+ : "l"(__as_ptr_gmem(__addr))
700
+ : "memory");
701
+ }
702
+ else if constexpr (__sem == sem_acquire && __scope == scope_cluster)
703
+ {
704
+ asm("multimem.ld_reduce.acquire.cluster.global.max.u64 %0, [%1];"
705
+ : "=l"(__dest)
706
+ : "l"(__as_ptr_gmem(__addr))
707
+ : "memory");
708
+ }
709
+ else if constexpr (__sem == sem_acquire && __scope == scope_gpu)
710
+ {
711
+ asm("multimem.ld_reduce.acquire.gpu.global.max.u64 %0, [%1];"
712
+ : "=l"(__dest)
713
+ : "l"(__as_ptr_gmem(__addr))
714
+ : "memory");
715
+ }
716
+ else if constexpr (__sem == sem_acquire && __scope == scope_sys)
717
+ {
718
+ asm("multimem.ld_reduce.acquire.sys.global.max.u64 %0, [%1];"
719
+ : "=l"(__dest)
720
+ : "l"(__as_ptr_gmem(__addr))
721
+ : "memory");
722
+ }
723
+ return __dest;
724
+ # else
725
+ // Fallback for other targets: call the function declared above. Per the original note, this is meant to surface as a linker error whose symbol name states the SM_90 requirement.
726
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
727
+ return 0; // supplies a return value on the fallback path
728
+ # endif // _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
729
+ }
730
+ #endif // __cccl_ptx_isa >= 810
731
+
732
+ /*
733
+ // multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90
734
+ // .sem = { .weak }
735
+ // .op = { .max }
736
+ template <typename = void>
737
+ __device__ static inline int32_t multimem_ld_reduce(
738
+ cuda::ptx::sem_weak_t,
739
+ cuda::ptx::op_max_t,
740
+ const int32_t* addr);
741
+ */
742
+ #if __cccl_ptx_isa >= 810
743
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; called on the fallback path below
744
+ template <typename = void> // Wraps the PTX instruction multimem.ld_reduce.weak.global.max.s32 and returns the value it yields.
745
+ _CCCL_DEVICE static inline ::cuda::std::int32_t
746
+ multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_max_t, const ::cuda::std::int32_t* __addr)
747
+ {
748
+ // The sem_weak_t tag type fixes the semantics to .weak, so no check on the semantics is needed.
749
+ // The op_max_t tag type fixes the reduction operator to .max, so no check on the operator is needed.
750
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
751
+ ::cuda::std::int32_t __dest; // written by the asm statement below (its "=r" output operand)
752
+ asm("multimem.ld_reduce.weak.global.max.s32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory");
753
+ return __dest;
754
+ # else
755
+ // Fallback for other targets: call the function declared above. Per the original note, this is meant to surface as a linker error whose symbol name states the SM_90 requirement.
756
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
757
+ return 0; // supplies a return value on the fallback path
758
+ # endif // _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
759
+ }
760
+ #endif // __cccl_ptx_isa >= 810
761
+
762
+ /*
763
+ // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90
764
+ // .sem = { .relaxed, .acquire }
765
+ // .scope = { .cta, .cluster, .gpu, .sys }
766
+ // .op = { .max }
767
+ template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
768
+ __device__ static inline int32_t multimem_ld_reduce(
769
+ cuda::ptx::sem_t<Sem> sem,
770
+ cuda::ptx::scope_t<Scope> scope,
771
+ cuda::ptx::op_max_t,
772
+ const int32_t* addr);
773
+ */
774
+ #if __cccl_ptx_isa >= 810
775
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; called on the fallback path below
776
+ template <::cuda::ptx::dot_sem _Sem, ::cuda::ptx::dot_scope _Scope> // Emits multimem.ld_reduce.<sem>.<scope>.global.max.s32 for the given tags and returns the value it yields.
777
+ _CCCL_DEVICE static inline ::cuda::std::int32_t multimem_ld_reduce(
778
+ ::cuda::ptx::sem_t<_Sem> __sem,
779
+ ::cuda::ptx::scope_t<_Scope> __scope,
780
+ ::cuda::ptx::op_max_t,
781
+ const ::cuda::std::int32_t* __addr)
782
+ {
783
+ static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); // only these two semantics have a branch below
784
+ static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); // only these four scopes have a branch below
785
+ // The op_max_t tag type fixes the reduction operator to .max, so no check on the operator is needed.
786
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
787
+ ::cuda::std::int32_t __dest; // written by whichever branch below matches; the static_asserts above reject every combination without a branch
788
+ if constexpr (__sem == sem_relaxed && __scope == scope_cta)
789
+ {
790
+ asm("multimem.ld_reduce.relaxed.cta.global.max.s32 %0, [%1];"
791
+ : "=r"(__dest)
792
+ : "l"(__as_ptr_gmem(__addr))
793
+ : "memory");
794
+ }
795
+ else if constexpr (__sem == sem_relaxed && __scope == scope_cluster)
796
+ {
797
+ asm("multimem.ld_reduce.relaxed.cluster.global.max.s32 %0, [%1];"
798
+ : "=r"(__dest)
799
+ : "l"(__as_ptr_gmem(__addr))
800
+ : "memory");
801
+ }
802
+ else if constexpr (__sem == sem_relaxed && __scope == scope_gpu)
803
+ {
804
+ asm("multimem.ld_reduce.relaxed.gpu.global.max.s32 %0, [%1];"
805
+ : "=r"(__dest)
806
+ : "l"(__as_ptr_gmem(__addr))
807
+ : "memory");
808
+ }
809
+ else if constexpr (__sem == sem_relaxed && __scope == scope_sys)
810
+ {
811
+ asm("multimem.ld_reduce.relaxed.sys.global.max.s32 %0, [%1];"
812
+ : "=r"(__dest)
813
+ : "l"(__as_ptr_gmem(__addr))
814
+ : "memory");
815
+ }
816
+ else if constexpr (__sem == sem_acquire && __scope == scope_cta)
817
+ {
818
+ asm("multimem.ld_reduce.acquire.cta.global.max.s32 %0, [%1];"
819
+ : "=r"(__dest)
820
+ : "l"(__as_ptr_gmem(__addr))
821
+ : "memory");
822
+ }
823
+ else if constexpr (__sem == sem_acquire && __scope == scope_cluster)
824
+ {
825
+ asm("multimem.ld_reduce.acquire.cluster.global.max.s32 %0, [%1];"
826
+ : "=r"(__dest)
827
+ : "l"(__as_ptr_gmem(__addr))
828
+ : "memory");
829
+ }
830
+ else if constexpr (__sem == sem_acquire && __scope == scope_gpu)
831
+ {
832
+ asm("multimem.ld_reduce.acquire.gpu.global.max.s32 %0, [%1];"
833
+ : "=r"(__dest)
834
+ : "l"(__as_ptr_gmem(__addr))
835
+ : "memory");
836
+ }
837
+ else if constexpr (__sem == sem_acquire && __scope == scope_sys)
838
+ {
839
+ asm("multimem.ld_reduce.acquire.sys.global.max.s32 %0, [%1];"
840
+ : "=r"(__dest)
841
+ : "l"(__as_ptr_gmem(__addr))
842
+ : "memory");
843
+ }
844
+ return __dest;
845
+ # else
846
+ // Fallback for other targets: call the function declared above. Per the original note, this is meant to surface as a linker error whose symbol name states the SM_90 requirement.
847
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
848
+ return 0; // supplies a return value on the fallback path
849
+ # endif // _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
850
+ }
851
+ #endif // __cccl_ptx_isa >= 810
852
+
853
+ /*
854
+ // multimem.ld_reduce.sem.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90
855
+ // .sem = { .weak }
856
+ // .op = { .max }
857
+ template <typename = void>
858
+ __device__ static inline int64_t multimem_ld_reduce(
859
+ cuda::ptx::sem_weak_t,
860
+ cuda::ptx::op_max_t,
861
+ const int64_t* addr);
862
+ */
863
+ #if __cccl_ptx_isa >= 810
864
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; called on the fallback path below
865
+ template <typename = void> // Wraps the PTX instruction multimem.ld_reduce.weak.global.max.s64 and returns the value it yields.
866
+ _CCCL_DEVICE static inline ::cuda::std::int64_t
867
+ multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_max_t, const ::cuda::std::int64_t* __addr)
868
+ {
869
+ // The sem_weak_t tag type fixes the semantics to .weak, so no check on the semantics is needed.
870
+ // The op_max_t tag type fixes the reduction operator to .max, so no check on the operator is needed.
871
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
872
+ ::cuda::std::int64_t __dest; // written by the asm statement below (its "=l" output operand)
873
+ asm("multimem.ld_reduce.weak.global.max.s64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory");
874
+ return __dest;
875
+ # else
876
+ // Fallback for other targets: call the function declared above. Per the original note, this is meant to surface as a linker error whose symbol name states the SM_90 requirement.
877
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
878
+ return 0; // supplies a return value on the fallback path
879
+ # endif // _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
880
+ }
881
+ #endif // __cccl_ptx_isa >= 810
882
+
883
+ /*
884
+ // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90
885
+ // .sem = { .relaxed, .acquire }
886
+ // .scope = { .cta, .cluster, .gpu, .sys }
887
+ // .op = { .max }
888
+ template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
889
+ __device__ static inline int64_t multimem_ld_reduce(
890
+ cuda::ptx::sem_t<Sem> sem,
891
+ cuda::ptx::scope_t<Scope> scope,
892
+ cuda::ptx::op_max_t,
893
+ const int64_t* addr);
894
+ */
895
+ #if __cccl_ptx_isa >= 810
896
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; called on the fallback path below
897
+ template <::cuda::ptx::dot_sem _Sem, ::cuda::ptx::dot_scope _Scope> // Emits multimem.ld_reduce.<sem>.<scope>.global.max.s64 for the given tags and returns the value it yields.
898
+ _CCCL_DEVICE static inline ::cuda::std::int64_t multimem_ld_reduce(
899
+ ::cuda::ptx::sem_t<_Sem> __sem,
900
+ ::cuda::ptx::scope_t<_Scope> __scope,
901
+ ::cuda::ptx::op_max_t,
902
+ const ::cuda::std::int64_t* __addr)
903
+ {
904
+ static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); // only these two semantics have a branch below
905
+ static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); // only these four scopes have a branch below
906
+ // The op_max_t tag type fixes the reduction operator to .max, so no check on the operator is needed.
907
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
908
+ ::cuda::std::int64_t __dest; // written by whichever branch below matches; the static_asserts above reject every combination without a branch
909
+ if constexpr (__sem == sem_relaxed && __scope == scope_cta)
910
+ {
911
+ asm("multimem.ld_reduce.relaxed.cta.global.max.s64 %0, [%1];"
912
+ : "=l"(__dest)
913
+ : "l"(__as_ptr_gmem(__addr))
914
+ : "memory");
915
+ }
916
+ else if constexpr (__sem == sem_relaxed && __scope == scope_cluster)
917
+ {
918
+ asm("multimem.ld_reduce.relaxed.cluster.global.max.s64 %0, [%1];"
919
+ : "=l"(__dest)
920
+ : "l"(__as_ptr_gmem(__addr))
921
+ : "memory");
922
+ }
923
+ else if constexpr (__sem == sem_relaxed && __scope == scope_gpu)
924
+ {
925
+ asm("multimem.ld_reduce.relaxed.gpu.global.max.s64 %0, [%1];"
926
+ : "=l"(__dest)
927
+ : "l"(__as_ptr_gmem(__addr))
928
+ : "memory");
929
+ }
930
+ else if constexpr (__sem == sem_relaxed && __scope == scope_sys)
931
+ {
932
+ asm("multimem.ld_reduce.relaxed.sys.global.max.s64 %0, [%1];"
933
+ : "=l"(__dest)
934
+ : "l"(__as_ptr_gmem(__addr))
935
+ : "memory");
936
+ }
937
+ else if constexpr (__sem == sem_acquire && __scope == scope_cta)
938
+ {
939
+ asm("multimem.ld_reduce.acquire.cta.global.max.s64 %0, [%1];"
940
+ : "=l"(__dest)
941
+ : "l"(__as_ptr_gmem(__addr))
942
+ : "memory");
943
+ }
944
+ else if constexpr (__sem == sem_acquire && __scope == scope_cluster)
945
+ {
946
+ asm("multimem.ld_reduce.acquire.cluster.global.max.s64 %0, [%1];"
947
+ : "=l"(__dest)
948
+ : "l"(__as_ptr_gmem(__addr))
949
+ : "memory");
950
+ }
951
+ else if constexpr (__sem == sem_acquire && __scope == scope_gpu)
952
+ {
953
+ asm("multimem.ld_reduce.acquire.gpu.global.max.s64 %0, [%1];"
954
+ : "=l"(__dest)
955
+ : "l"(__as_ptr_gmem(__addr))
956
+ : "memory");
957
+ }
958
+ else if constexpr (__sem == sem_acquire && __scope == scope_sys)
959
+ {
960
+ asm("multimem.ld_reduce.acquire.sys.global.max.s64 %0, [%1];"
961
+ : "=l"(__dest)
962
+ : "l"(__as_ptr_gmem(__addr))
963
+ : "memory");
964
+ }
965
+ return __dest;
966
+ # else
967
+ // Fallback for other targets: call the function declared above. Per the original note, this is meant to surface as a linker error whose symbol name states the SM_90 requirement.
968
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
969
+ return 0; // supplies a return value on the fallback path
970
+ # endif // _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
971
+ }
972
+ #endif // __cccl_ptx_isa >= 810
973
+
974
+ /*
975
+ // multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90
976
+ // .sem = { .weak }
977
+ // .op = { .add }
978
+ template <typename = void>
979
+ __device__ static inline uint32_t multimem_ld_reduce(
980
+ cuda::ptx::sem_weak_t,
981
+ cuda::ptx::op_add_t,
982
+ const uint32_t* addr);
983
+ */
984
+ #if __cccl_ptx_isa >= 810
985
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; called on the fallback path below
986
+ template <typename = void> // Wraps the PTX instruction multimem.ld_reduce.weak.global.add.u32 and returns the value it yields.
987
+ _CCCL_DEVICE static inline ::cuda::std::uint32_t
988
+ multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_add_t, const ::cuda::std::uint32_t* __addr)
989
+ {
990
+ // The sem_weak_t tag type fixes the semantics to .weak, so no check on the semantics is needed.
991
+ // The op_add_t tag type fixes the reduction operator to .add, so no check on the operator is needed.
992
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
993
+ ::cuda::std::uint32_t __dest; // written by the asm statement below (its "=r" output operand)
994
+ asm("multimem.ld_reduce.weak.global.add.u32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory");
995
+ return __dest;
996
+ # else
997
+ // Fallback for other targets: call the function declared above. Per the original note, this is meant to surface as a linker error whose symbol name states the SM_90 requirement.
998
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
999
+ return 0; // supplies a return value on the fallback path
1000
+ # endif // _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
1001
+ }
1002
+ #endif // __cccl_ptx_isa >= 810
1003
+
1004
+ /*
1005
+ // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90
1006
+ // .sem = { .relaxed, .acquire }
1007
+ // .scope = { .cta, .cluster, .gpu, .sys }
1008
+ // .op = { .add }
1009
+ template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
1010
+ __device__ static inline uint32_t multimem_ld_reduce(
1011
+ cuda::ptx::sem_t<Sem> sem,
1012
+ cuda::ptx::scope_t<Scope> scope,
1013
+ cuda::ptx::op_add_t,
1014
+ const uint32_t* addr);
1015
+ */
1016
+ #if __cccl_ptx_isa >= 810
1017
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; called on the fallback path below
1018
+ template <::cuda::ptx::dot_sem _Sem, ::cuda::ptx::dot_scope _Scope> // Emits multimem.ld_reduce.<sem>.<scope>.global.add.u32 for the given tags and returns the value it yields.
1019
+ _CCCL_DEVICE static inline ::cuda::std::uint32_t multimem_ld_reduce(
1020
+ ::cuda::ptx::sem_t<_Sem> __sem,
1021
+ ::cuda::ptx::scope_t<_Scope> __scope,
1022
+ ::cuda::ptx::op_add_t,
1023
+ const ::cuda::std::uint32_t* __addr)
1024
+ {
1025
+ static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); // only these two semantics have a branch below
1026
+ static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); // only these four scopes have a branch below
1027
+ // The op_add_t tag type fixes the reduction operator to .add, so no check on the operator is needed.
1028
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
1029
+ ::cuda::std::uint32_t __dest; // written by whichever branch below matches; the static_asserts above reject every combination without a branch
1030
+ if constexpr (__sem == sem_relaxed && __scope == scope_cta)
1031
+ {
1032
+ asm("multimem.ld_reduce.relaxed.cta.global.add.u32 %0, [%1];"
1033
+ : "=r"(__dest)
1034
+ : "l"(__as_ptr_gmem(__addr))
1035
+ : "memory");
1036
+ }
1037
+ else if constexpr (__sem == sem_relaxed && __scope == scope_cluster)
1038
+ {
1039
+ asm("multimem.ld_reduce.relaxed.cluster.global.add.u32 %0, [%1];"
1040
+ : "=r"(__dest)
1041
+ : "l"(__as_ptr_gmem(__addr))
1042
+ : "memory");
1043
+ }
1044
+ else if constexpr (__sem == sem_relaxed && __scope == scope_gpu)
1045
+ {
1046
+ asm("multimem.ld_reduce.relaxed.gpu.global.add.u32 %0, [%1];"
1047
+ : "=r"(__dest)
1048
+ : "l"(__as_ptr_gmem(__addr))
1049
+ : "memory");
1050
+ }
1051
+ else if constexpr (__sem == sem_relaxed && __scope == scope_sys)
1052
+ {
1053
+ asm("multimem.ld_reduce.relaxed.sys.global.add.u32 %0, [%1];"
1054
+ : "=r"(__dest)
1055
+ : "l"(__as_ptr_gmem(__addr))
1056
+ : "memory");
1057
+ }
1058
+ else if constexpr (__sem == sem_acquire && __scope == scope_cta)
1059
+ {
1060
+ asm("multimem.ld_reduce.acquire.cta.global.add.u32 %0, [%1];"
1061
+ : "=r"(__dest)
1062
+ : "l"(__as_ptr_gmem(__addr))
1063
+ : "memory");
1064
+ }
1065
+ else if constexpr (__sem == sem_acquire && __scope == scope_cluster)
1066
+ {
1067
+ asm("multimem.ld_reduce.acquire.cluster.global.add.u32 %0, [%1];"
1068
+ : "=r"(__dest)
1069
+ : "l"(__as_ptr_gmem(__addr))
1070
+ : "memory");
1071
+ }
1072
+ else if constexpr (__sem == sem_acquire && __scope == scope_gpu)
1073
+ {
1074
+ asm("multimem.ld_reduce.acquire.gpu.global.add.u32 %0, [%1];"
1075
+ : "=r"(__dest)
1076
+ : "l"(__as_ptr_gmem(__addr))
1077
+ : "memory");
1078
+ }
1079
+ else if constexpr (__sem == sem_acquire && __scope == scope_sys)
1080
+ {
1081
+ asm("multimem.ld_reduce.acquire.sys.global.add.u32 %0, [%1];"
1082
+ : "=r"(__dest)
1083
+ : "l"(__as_ptr_gmem(__addr))
1084
+ : "memory");
1085
+ }
1086
+ return __dest;
1087
+ # else
1088
+ // Fallback for other targets: call the function declared above. Per the original note, this is meant to surface as a linker error whose symbol name states the SM_90 requirement.
1089
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
1090
+ return 0; // supplies a return value on the fallback path
1091
+ # endif // _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
1092
+ }
1093
+ #endif // __cccl_ptx_isa >= 810
1094
+
1095
+ /*
1096
+ // multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90
1097
+ // .sem = { .weak }
1098
+ // .op = { .add }
1099
+ template <typename = void>
1100
+ __device__ static inline uint64_t multimem_ld_reduce(
1101
+ cuda::ptx::sem_weak_t,
1102
+ cuda::ptx::op_add_t,
1103
+ const uint64_t* addr);
1104
+ */
1105
+ #if __cccl_ptx_isa >= 810
1106
+ extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); // declaration only; called on the fallback path below
1107
+ template <typename = void> // Wraps the PTX instruction multimem.ld_reduce.weak.global.add.u64 and returns the value it yields.
1108
+ _CCCL_DEVICE static inline ::cuda::std::uint64_t
1109
+ multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_add_t, const ::cuda::std::uint64_t* __addr)
1110
+ {
1111
+ // The sem_weak_t tag type fixes the semantics to .weak, so no check on the semantics is needed.
1112
+ // The op_add_t tag type fixes the reduction operator to .add, so no check on the operator is needed.
1113
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
1114
+ ::cuda::std::uint64_t __dest; // written by the asm statement below (its "=l" output operand)
1115
+ asm("multimem.ld_reduce.weak.global.add.u64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory");
1116
+ return __dest;
1117
+ # else
1118
+ // Fallback for other targets: call the function declared above. Per the original note, this is meant to surface as a linker error whose symbol name states the SM_90 requirement.
1119
+ __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
1120
+ return 0; // supplies a return value on the fallback path
1121
+ # endif // _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
1122
+ }
1123
+ #endif // __cccl_ptx_isa >= 810
1124
+
1125
// multimem_ld_reduce(sem, scope, op_add, const uint64_t*)
//
// PTX: multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90
//   .sem   = { .relaxed, .acquire }
//   .scope = { .cta, .cluster, .gpu, .sys }
//   .op    = { .add }
//
// Declared interface:
//   template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
//   __device__ static inline uint64_t multimem_ld_reduce(
//     cuda::ptx::sem_t<Sem> sem,
//     cuda::ptx::scope_t<Scope> scope,
//     cuda::ptx::op_add_t,
//     const uint64_t* addr);
//
// The (sem, scope) tag pair picks, at compile time, which of the eight instruction
// spellings is emitted. Returns the value the instruction writes to `dest`.
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <::cuda::ptx::dot_sem _Sem, ::cuda::ptx::dot_scope _Scope>
_CCCL_DEVICE static inline ::cuda::std::uint64_t multimem_ld_reduce(
  ::cuda::ptx::sem_t<_Sem> __sem,
  ::cuda::ptx::scope_t<_Scope> __scope,
  ::cuda::ptx::op_add_t,
  const ::cuda::std::uint64_t* __addr)
{
  // Reject tag values outside the sets listed above; op_add is enforced by the parameter type.
  static_assert(__sem == sem_relaxed || __sem == sem_acquire, "");
  static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::uint64_t __reduced;
  // Every instruction form takes the same address operand, so compute it once.
  const auto __gaddr = __as_ptr_gmem(__addr);
  if constexpr (__sem == sem_relaxed)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.relaxed.cta.global.add.u64 %0, [%1];" : "=l"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.relaxed.cluster.global.add.u64 %0, [%1];" : "=l"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.relaxed.gpu.global.add.u64 %0, [%1];" : "=l"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.relaxed.sys.global.add.u64 %0, [%1];" : "=l"(__reduced) : "l"(__gaddr) : "memory");
    }
  }
  else if constexpr (__sem == sem_acquire)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.acquire.cta.global.add.u64 %0, [%1];" : "=l"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.acquire.cluster.global.add.u64 %0, [%1];" : "=l"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.acquire.gpu.global.add.u64 %0, [%1];" : "=l"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.acquire.sys.global.add.u64 %0, [%1];" : "=l"(__reduced) : "l"(__gaddr) : "memory");
    }
  }
  return __reduced;
#  else
  // Only a declaration of this function is visible here; calling it is meant to surface
  // as a link error whose symbol name tells the user the instruction needs SM_90.
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  return 0;
#  endif
}
#endif // __cccl_ptx_isa >= 810
1215
+
1216
// multimem_ld_reduce(sem_weak, op_add, const int32_t*)
//
// PTX: multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90
//   .sem = { .weak }
//   .op  = { .add }
//
// Declared interface:
//   template <typename = void>
//   __device__ static inline int32_t multimem_ld_reduce(
//     cuda::ptx::sem_weak_t,
//     cuda::ptx::op_add_t,
//     const int32_t* addr);
//
// Returns the value the instruction writes to `dest`.
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <typename = void>
_CCCL_DEVICE static inline ::cuda::std::int32_t
multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_add_t, const ::cuda::std::int32_t* __addr)
{
  // .sem and .op are both pinned by the tag parameter types, so only one instruction form exists.
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::int32_t __reduced;
  asm("multimem.ld_reduce.weak.global.add.s32 %0, [%1];"
      : "=r"(__reduced)
      : "l"(__as_ptr_gmem(__addr))
      : "memory");
  return __reduced;
#  else
  // Only a declaration of this function is visible here; calling it is meant to surface
  // as a link error whose symbol name tells the user the instruction needs SM_90.
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  return 0;
#  endif
}
#endif // __cccl_ptx_isa >= 810
1245
+
1246
// multimem_ld_reduce(sem, scope, op_add, const int32_t*)
//
// PTX: multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90
//   .sem   = { .relaxed, .acquire }
//   .scope = { .cta, .cluster, .gpu, .sys }
//   .op    = { .add }
//
// Declared interface:
//   template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
//   __device__ static inline int32_t multimem_ld_reduce(
//     cuda::ptx::sem_t<Sem> sem,
//     cuda::ptx::scope_t<Scope> scope,
//     cuda::ptx::op_add_t,
//     const int32_t* addr);
//
// The (sem, scope) tag pair picks, at compile time, which of the eight instruction
// spellings is emitted. Returns the value the instruction writes to `dest`.
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <::cuda::ptx::dot_sem _Sem, ::cuda::ptx::dot_scope _Scope>
_CCCL_DEVICE static inline ::cuda::std::int32_t multimem_ld_reduce(
  ::cuda::ptx::sem_t<_Sem> __sem,
  ::cuda::ptx::scope_t<_Scope> __scope,
  ::cuda::ptx::op_add_t,
  const ::cuda::std::int32_t* __addr)
{
  // Reject tag values outside the sets listed above; op_add is enforced by the parameter type.
  static_assert(__sem == sem_relaxed || __sem == sem_acquire, "");
  static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::int32_t __reduced;
  // Every instruction form takes the same address operand, so compute it once.
  const auto __gaddr = __as_ptr_gmem(__addr);
  if constexpr (__sem == sem_relaxed)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.relaxed.cta.global.add.s32 %0, [%1];" : "=r"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.relaxed.cluster.global.add.s32 %0, [%1];" : "=r"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.relaxed.gpu.global.add.s32 %0, [%1];" : "=r"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.relaxed.sys.global.add.s32 %0, [%1];" : "=r"(__reduced) : "l"(__gaddr) : "memory");
    }
  }
  else if constexpr (__sem == sem_acquire)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.acquire.cta.global.add.s32 %0, [%1];" : "=r"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.acquire.cluster.global.add.s32 %0, [%1];" : "=r"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.acquire.gpu.global.add.s32 %0, [%1];" : "=r"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.acquire.sys.global.add.s32 %0, [%1];" : "=r"(__reduced) : "l"(__gaddr) : "memory");
    }
  }
  return __reduced;
#  else
  // Only a declaration of this function is visible here; calling it is meant to surface
  // as a link error whose symbol name tells the user the instruction needs SM_90.
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  return 0;
#  endif
}
#endif // __cccl_ptx_isa >= 810
1336
+
1337
// multimem_ld_reduce(sem_weak, op_add, const int64_t*)
//
// PTX: multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90
//   .sem = { .weak }
//   .op  = { .add }
//
// Declared interface:
//   template <typename = void>
//   __device__ static inline int64_t multimem_ld_reduce(
//     cuda::ptx::sem_weak_t,
//     cuda::ptx::op_add_t,
//     const int64_t* addr);
//
// Note that the C++ element type is int64_t while the emitted instruction uses the .u64
// type suffix. NOTE(review): presumably chosen because the add is sign-agnostic in
// two's complement -- confirm against the PTX ISA type table for multimem.ld_reduce.
// Returns the value the instruction writes to `dest`.
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <typename = void>
_CCCL_DEVICE static inline ::cuda::std::int64_t
multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_add_t, const ::cuda::std::int64_t* __addr)
{
  // .sem and .op are both pinned by the tag parameter types, so only one instruction form exists.
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::int64_t __reduced;
  asm("multimem.ld_reduce.weak.global.add.u64 %0, [%1];"
      : "=l"(__reduced)
      : "l"(__as_ptr_gmem(__addr))
      : "memory");
  return __reduced;
#  else
  // Only a declaration of this function is visible here; calling it is meant to surface
  // as a link error whose symbol name tells the user the instruction needs SM_90.
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  return 0;
#  endif
}
#endif // __cccl_ptx_isa >= 810
1366
+
1367
// multimem_ld_reduce(sem, scope, op_add, const int64_t*)
//
// PTX: multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90
//   .sem   = { .relaxed, .acquire }
//   .scope = { .cta, .cluster, .gpu, .sys }
//   .op    = { .add }
//
// Declared interface:
//   template <cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
//   __device__ static inline int64_t multimem_ld_reduce(
//     cuda::ptx::sem_t<Sem> sem,
//     cuda::ptx::scope_t<Scope> scope,
//     cuda::ptx::op_add_t,
//     const int64_t* addr);
//
// Note that the C++ element type is int64_t while every emitted instruction uses the
// .u64 type suffix. NOTE(review): presumably chosen because the add is sign-agnostic in
// two's complement -- confirm against the PTX ISA type table for multimem.ld_reduce.
// The (sem, scope) tag pair picks, at compile time, which of the eight instruction
// spellings is emitted. Returns the value the instruction writes to `dest`.
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <::cuda::ptx::dot_sem _Sem, ::cuda::ptx::dot_scope _Scope>
_CCCL_DEVICE static inline ::cuda::std::int64_t multimem_ld_reduce(
  ::cuda::ptx::sem_t<_Sem> __sem,
  ::cuda::ptx::scope_t<_Scope> __scope,
  ::cuda::ptx::op_add_t,
  const ::cuda::std::int64_t* __addr)
{
  // Reject tag values outside the sets listed above; op_add is enforced by the parameter type.
  static_assert(__sem == sem_relaxed || __sem == sem_acquire, "");
  static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::int64_t __reduced;
  // Every instruction form takes the same address operand, so compute it once.
  const auto __gaddr = __as_ptr_gmem(__addr);
  if constexpr (__sem == sem_relaxed)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.relaxed.cta.global.add.u64 %0, [%1];" : "=l"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.relaxed.cluster.global.add.u64 %0, [%1];" : "=l"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.relaxed.gpu.global.add.u64 %0, [%1];" : "=l"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.relaxed.sys.global.add.u64 %0, [%1];" : "=l"(__reduced) : "l"(__gaddr) : "memory");
    }
  }
  else if constexpr (__sem == sem_acquire)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.acquire.cta.global.add.u64 %0, [%1];" : "=l"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.acquire.cluster.global.add.u64 %0, [%1];" : "=l"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.acquire.gpu.global.add.u64 %0, [%1];" : "=l"(__reduced) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.acquire.sys.global.add.u64 %0, [%1];" : "=l"(__reduced) : "l"(__gaddr) : "memory");
    }
  }
  return __reduced;
#  else
  // Only a declaration of this function is visible here; calling it is meant to surface
  // as a link error whose symbol name tells the user the instruction needs SM_90.
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  return 0;
#  endif
}
#endif // __cccl_ptx_isa >= 810
1457
+
1458
// multimem_ld_reduce(sem_weak, op_and_op, const B32*)
//
// PTX: multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90
//   .sem = { .weak }
//   .op  = { .and }
//
// Declared interface:
//   template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
//   __device__ static inline B32 multimem_ld_reduce(
//     cuda::ptx::sem_weak_t,
//     cuda::ptx::op_and_op_t,
//     const B32* addr);
//
// B32 may be any 4-byte type. The instruction result is captured in a uint32_t and
// reinterpreted as B32 for the return value.
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <typename _B32, ::cuda::std::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline _B32 multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_and_op_t, const _B32* __addr)
{
  // .sem and .op are both pinned by the tag parameter types; only the operand width is checked.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::uint32_t __bits;
  asm("multimem.ld_reduce.weak.global.and.b32 %0, [%1];"
      : "=r"(__bits)
      : "l"(__as_ptr_gmem(__addr))
      : "memory");
  return *reinterpret_cast<_B32*>(&__bits);
#  else
  // Only a declaration of this function is visible here; calling it is meant to surface
  // as a link error whose symbol name tells the user the instruction needs SM_90.
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  // Zero-filled placeholder so the function still returns a _B32.
  ::cuda::std::uint32_t __zero_bits = 0;
  return *reinterpret_cast<_B32*>(&__zero_bits);
#  endif
}
#endif // __cccl_ptx_isa >= 810
1488
+
1489
// multimem_ld_reduce(sem, scope, op_and_op, const B32*)
//
// PTX: multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90
//   .sem   = { .relaxed, .acquire }
//   .scope = { .cta, .cluster, .gpu, .sys }
//   .op    = { .and }
//
// Declared interface:
//   template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true,
//             cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
//   __device__ static inline B32 multimem_ld_reduce(
//     cuda::ptx::sem_t<Sem> sem,
//     cuda::ptx::scope_t<Scope> scope,
//     cuda::ptx::op_and_op_t,
//     const B32* addr);
//
// B32 may be any 4-byte type. The (sem, scope) tag pair picks, at compile time, which of
// the eight instruction spellings is emitted; the result is captured in a uint32_t and
// reinterpreted as B32 for the return value.
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <typename _B32,
          ::cuda::std::enable_if_t<sizeof(_B32) == 4, bool> = true,
          ::cuda::ptx::dot_sem _Sem,
          ::cuda::ptx::dot_scope _Scope>
_CCCL_DEVICE static inline _B32 multimem_ld_reduce(
  ::cuda::ptx::sem_t<_Sem> __sem, ::cuda::ptx::scope_t<_Scope> __scope, ::cuda::ptx::op_and_op_t, const _B32* __addr)
{
  // Reject tag values outside the sets listed above; op_and_op is enforced by the parameter type.
  static_assert(__sem == sem_relaxed || __sem == sem_acquire, "");
  static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, "");
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::uint32_t __bits;
  // Every instruction form takes the same address operand, so compute it once.
  const auto __gaddr = __as_ptr_gmem(__addr);
  if constexpr (__sem == sem_relaxed)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.relaxed.cta.global.and.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.relaxed.cluster.global.and.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.relaxed.gpu.global.and.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.relaxed.sys.global.and.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
  }
  else if constexpr (__sem == sem_acquire)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.acquire.cta.global.and.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.acquire.cluster.global.and.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.acquire.gpu.global.and.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.acquire.sys.global.and.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
  }
  return *reinterpret_cast<_B32*>(&__bits);
#  else
  // Only a declaration of this function is visible here; calling it is meant to surface
  // as a link error whose symbol name tells the user the instruction needs SM_90.
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  // Zero-filled placeholder so the function still returns a _B32.
  ::cuda::std::uint32_t __zero_bits = 0;
  return *reinterpret_cast<_B32*>(&__zero_bits);
#  endif
}
#endif // __cccl_ptx_isa >= 810
1581
+
1582
// multimem_ld_reduce(sem_weak, op_or_op, const B32*)
//
// PTX: multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90
//   .sem = { .weak }
//   .op  = { .or }
//
// Declared interface:
//   template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
//   __device__ static inline B32 multimem_ld_reduce(
//     cuda::ptx::sem_weak_t,
//     cuda::ptx::op_or_op_t,
//     const B32* addr);
//
// B32 may be any 4-byte type. The instruction result is captured in a uint32_t and
// reinterpreted as B32 for the return value.
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <typename _B32, ::cuda::std::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline _B32 multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_or_op_t, const _B32* __addr)
{
  // .sem and .op are both pinned by the tag parameter types; only the operand width is checked.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::uint32_t __bits;
  asm("multimem.ld_reduce.weak.global.or.b32 %0, [%1];"
      : "=r"(__bits)
      : "l"(__as_ptr_gmem(__addr))
      : "memory");
  return *reinterpret_cast<_B32*>(&__bits);
#  else
  // Only a declaration of this function is visible here; calling it is meant to surface
  // as a link error whose symbol name tells the user the instruction needs SM_90.
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  // Zero-filled placeholder so the function still returns a _B32.
  ::cuda::std::uint32_t __zero_bits = 0;
  return *reinterpret_cast<_B32*>(&__zero_bits);
#  endif
}
#endif // __cccl_ptx_isa >= 810
1612
+
1613
// multimem_ld_reduce(sem, scope, op_or_op, const B32*)
//
// PTX: multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90
//   .sem   = { .relaxed, .acquire }
//   .scope = { .cta, .cluster, .gpu, .sys }
//   .op    = { .or }
//
// Declared interface:
//   template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true,
//             cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
//   __device__ static inline B32 multimem_ld_reduce(
//     cuda::ptx::sem_t<Sem> sem,
//     cuda::ptx::scope_t<Scope> scope,
//     cuda::ptx::op_or_op_t,
//     const B32* addr);
//
// B32 may be any 4-byte type. The (sem, scope) tag pair picks, at compile time, which of
// the eight instruction spellings is emitted; the result is captured in a uint32_t and
// reinterpreted as B32 for the return value.
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <typename _B32,
          ::cuda::std::enable_if_t<sizeof(_B32) == 4, bool> = true,
          ::cuda::ptx::dot_sem _Sem,
          ::cuda::ptx::dot_scope _Scope>
_CCCL_DEVICE static inline _B32 multimem_ld_reduce(
  ::cuda::ptx::sem_t<_Sem> __sem, ::cuda::ptx::scope_t<_Scope> __scope, ::cuda::ptx::op_or_op_t, const _B32* __addr)
{
  // Reject tag values outside the sets listed above; op_or_op is enforced by the parameter type.
  static_assert(__sem == sem_relaxed || __sem == sem_acquire, "");
  static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, "");
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::uint32_t __bits;
  // Every instruction form takes the same address operand, so compute it once.
  const auto __gaddr = __as_ptr_gmem(__addr);
  if constexpr (__sem == sem_relaxed)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.relaxed.cta.global.or.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.relaxed.cluster.global.or.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.relaxed.gpu.global.or.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.relaxed.sys.global.or.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
  }
  else if constexpr (__sem == sem_acquire)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.acquire.cta.global.or.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.acquire.cluster.global.or.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.acquire.gpu.global.or.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.acquire.sys.global.or.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
  }
  return *reinterpret_cast<_B32*>(&__bits);
#  else
  // Only a declaration of this function is visible here; calling it is meant to surface
  // as a link error whose symbol name tells the user the instruction needs SM_90.
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  // Zero-filled placeholder so the function still returns a _B32.
  ::cuda::std::uint32_t __zero_bits = 0;
  return *reinterpret_cast<_B32*>(&__zero_bits);
#  endif
}
#endif // __cccl_ptx_isa >= 810
1705
+
1706
// multimem_ld_reduce(sem_weak, op_xor_op, const B32*)
//
// PTX: multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90
//   .sem = { .weak }
//   .op  = { .xor }
//
// Declared interface:
//   template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
//   __device__ static inline B32 multimem_ld_reduce(
//     cuda::ptx::sem_weak_t,
//     cuda::ptx::op_xor_op_t,
//     const B32* addr);
//
// B32 may be any 4-byte type. The instruction result is captured in a uint32_t and
// reinterpreted as B32 for the return value.
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <typename _B32, ::cuda::std::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline _B32 multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_xor_op_t, const _B32* __addr)
{
  // .sem and .op are both pinned by the tag parameter types; only the operand width is checked.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::uint32_t __bits;
  asm("multimem.ld_reduce.weak.global.xor.b32 %0, [%1];"
      : "=r"(__bits)
      : "l"(__as_ptr_gmem(__addr))
      : "memory");
  return *reinterpret_cast<_B32*>(&__bits);
#  else
  // Only a declaration of this function is visible here; calling it is meant to surface
  // as a link error whose symbol name tells the user the instruction needs SM_90.
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  // Zero-filled placeholder so the function still returns a _B32.
  ::cuda::std::uint32_t __zero_bits = 0;
  return *reinterpret_cast<_B32*>(&__zero_bits);
#  endif
}
#endif // __cccl_ptx_isa >= 810
1736
+
1737
// multimem_ld_reduce(sem, scope, op_xor_op, const B32*)
//
// PTX: multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90
//   .sem   = { .relaxed, .acquire }
//   .scope = { .cta, .cluster, .gpu, .sys }
//   .op    = { .xor }
//
// Declared interface:
//   template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true,
//             cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
//   __device__ static inline B32 multimem_ld_reduce(
//     cuda::ptx::sem_t<Sem> sem,
//     cuda::ptx::scope_t<Scope> scope,
//     cuda::ptx::op_xor_op_t,
//     const B32* addr);
//
// B32 may be any 4-byte type. The (sem, scope) tag pair picks, at compile time, which of
// the eight instruction spellings is emitted; the result is captured in a uint32_t and
// reinterpreted as B32 for the return value.
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <typename _B32,
          ::cuda::std::enable_if_t<sizeof(_B32) == 4, bool> = true,
          ::cuda::ptx::dot_sem _Sem,
          ::cuda::ptx::dot_scope _Scope>
_CCCL_DEVICE static inline _B32 multimem_ld_reduce(
  ::cuda::ptx::sem_t<_Sem> __sem, ::cuda::ptx::scope_t<_Scope> __scope, ::cuda::ptx::op_xor_op_t, const _B32* __addr)
{
  // Reject tag values outside the sets listed above; op_xor_op is enforced by the parameter type.
  static_assert(__sem == sem_relaxed || __sem == sem_acquire, "");
  static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, "");
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::uint32_t __bits;
  // Every instruction form takes the same address operand, so compute it once.
  const auto __gaddr = __as_ptr_gmem(__addr);
  if constexpr (__sem == sem_relaxed)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.relaxed.cta.global.xor.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.relaxed.cluster.global.xor.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.relaxed.gpu.global.xor.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.relaxed.sys.global.xor.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
  }
  else if constexpr (__sem == sem_acquire)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.acquire.cta.global.xor.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.acquire.cluster.global.xor.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.acquire.gpu.global.xor.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.acquire.sys.global.xor.b32 %0, [%1];" : "=r"(__bits) : "l"(__gaddr) : "memory");
    }
  }
  return *reinterpret_cast<_B32*>(&__bits);
#  else
  // Only a declaration of this function is visible here; calling it is meant to surface
  // as a link error whose symbol name tells the user the instruction needs SM_90.
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  // Zero-filled placeholder so the function still returns a _B32.
  ::cuda::std::uint32_t __zero_bits = 0;
  return *reinterpret_cast<_B32*>(&__zero_bits);
#  endif
}
#endif // __cccl_ptx_isa >= 810
1829
+
1830
// multimem_ld_reduce(sem_weak, op_and_op, const B64*)
//
// PTX: multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90
//   .sem = { .weak }
//   .op  = { .and }
//
// Declared interface:
//   template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
//   __device__ static inline B64 multimem_ld_reduce(
//     cuda::ptx::sem_weak_t,
//     cuda::ptx::op_and_op_t,
//     const B64* addr);
//
// B64 may be any 8-byte type. The instruction result is captured in a uint64_t and
// reinterpreted as B64 for the return value.
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <typename _B64, ::cuda::std::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline _B64 multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_and_op_t, const _B64* __addr)
{
  // .sem and .op are both pinned by the tag parameter types; only the operand width is checked.
  static_assert(sizeof(_B64) == 8, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::uint64_t __bits;
  asm("multimem.ld_reduce.weak.global.and.b64 %0, [%1];"
      : "=l"(__bits)
      : "l"(__as_ptr_gmem(__addr))
      : "memory");
  return *reinterpret_cast<_B64*>(&__bits);
#  else
  // Only a declaration of this function is visible here; calling it is meant to surface
  // as a link error whose symbol name tells the user the instruction needs SM_90.
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  // Zero-filled placeholder so the function still returns a _B64.
  ::cuda::std::uint64_t __zero_bits = 0;
  return *reinterpret_cast<_B64*>(&__zero_bits);
#  endif
}
#endif // __cccl_ptx_isa >= 810
1860
+
1861
/*
// multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90
// .sem       = { .relaxed, .acquire }
// .scope     = { .cta, .cluster, .gpu, .sys }
// .op        = { .and }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline B64 multimem_ld_reduce(
  cuda::ptx::sem_t<Sem> sem,
  cuda::ptx::scope_t<Scope> scope,
  cuda::ptx::op_and_op_t,
  const B64* addr);

Selects, at compile time, the `.and.b64` instruction string matching the (sem, scope) tag pair, executes it on
`addr`, and returns the 64-bit register result reinterpreted as B64.
*/
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <typename _B64,
          ::cuda::std::enable_if_t<sizeof(_B64) == 8, bool> = true,
          ::cuda::ptx::dot_sem _Sem,
          ::cuda::ptx::dot_scope _Scope>
_CCCL_DEVICE static inline _B64 multimem_ld_reduce(
  ::cuda::ptx::sem_t<_Sem> __sem, ::cuda::ptx::scope_t<_Scope> __scope, ::cuda::ptx::op_and_op_t, const _B64* __addr)
{
  // Compile-time validation of the tag arguments; the operation (.and) is pinned by the op_and_op_t parameter type.
  static_assert(sizeof(_B64) == 8, "");
  static_assert(__sem == sem_relaxed || __sem == sem_acquire, "");
  static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::uint64_t __bits;
  // Dispatch first on the memory semantics, then on the scope. Exactly one asm statement survives instantiation.
  if constexpr (__sem == sem_relaxed)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.relaxed.cta.global.and.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.relaxed.cluster.global.and.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.relaxed.gpu.global.and.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.relaxed.sys.global.and.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
  }
  else if constexpr (__sem == sem_acquire)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.acquire.cta.global.and.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.acquire.cluster.global.and.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.acquire.gpu.global.and.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.acquire.sys.global.and.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
  }
  return *reinterpret_cast<_B64*>(&__bits);
#  else
  // The callee below is declared but never defined, so reaching this path on an unsupported
  // architecture surfaces as a link error whose symbol name explains the problem.
  ::cuda::std::uint64_t __placeholder = 0;
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  return *reinterpret_cast<_B64*>(&__placeholder);
#  endif
}
#endif // __cccl_ptx_isa >= 810
1953
+
1954
/*
// multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90
// .sem       = { .weak }
// .op        = { .or }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
__device__ static inline B64 multimem_ld_reduce(
  cuda::ptx::sem_weak_t,
  cuda::ptx::op_or_op_t,
  const B64* addr);

Executes the `.weak ... .or.b64` form on `addr` and returns the 64-bit register result reinterpreted as B64.
*/
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <typename _B64, ::cuda::std::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline _B64 multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_or_op_t, const _B64* __addr)
{
  // Both the semantics (.weak) and the operation (.or) are pinned by the tag parameter types,
  // so only the operand width needs checking here.
  static_assert(sizeof(_B64) == 8, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::uint64_t __bits;
  asm("multimem.ld_reduce.weak.global.or.b64 %0, [%1];"
      : "=l"(__bits)
      : "l"(__as_ptr_gmem(__addr))
      : "memory");
  return *reinterpret_cast<_B64*>(&__bits);
#  else
  // The callee below is declared but never defined, so reaching this path on an unsupported
  // architecture surfaces as a link error whose symbol name explains the problem.
  ::cuda::std::uint64_t __placeholder = 0;
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  return *reinterpret_cast<_B64*>(&__placeholder);
#  endif
}
#endif // __cccl_ptx_isa >= 810
1984
+
1985
/*
// multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90
// .sem       = { .relaxed, .acquire }
// .scope     = { .cta, .cluster, .gpu, .sys }
// .op        = { .or }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline B64 multimem_ld_reduce(
  cuda::ptx::sem_t<Sem> sem,
  cuda::ptx::scope_t<Scope> scope,
  cuda::ptx::op_or_op_t,
  const B64* addr);

Selects, at compile time, the `.or.b64` instruction string matching the (sem, scope) tag pair, executes it on
`addr`, and returns the 64-bit register result reinterpreted as B64.
*/
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <typename _B64,
          ::cuda::std::enable_if_t<sizeof(_B64) == 8, bool> = true,
          ::cuda::ptx::dot_sem _Sem,
          ::cuda::ptx::dot_scope _Scope>
_CCCL_DEVICE static inline _B64 multimem_ld_reduce(
  ::cuda::ptx::sem_t<_Sem> __sem, ::cuda::ptx::scope_t<_Scope> __scope, ::cuda::ptx::op_or_op_t, const _B64* __addr)
{
  // Compile-time validation of the tag arguments; the operation (.or) is pinned by the op_or_op_t parameter type.
  static_assert(sizeof(_B64) == 8, "");
  static_assert(__sem == sem_relaxed || __sem == sem_acquire, "");
  static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::uint64_t __bits;
  // Dispatch first on the memory semantics, then on the scope. Exactly one asm statement survives instantiation.
  if constexpr (__sem == sem_relaxed)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.relaxed.cta.global.or.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.relaxed.cluster.global.or.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.relaxed.gpu.global.or.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.relaxed.sys.global.or.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
  }
  else if constexpr (__sem == sem_acquire)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.acquire.cta.global.or.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.acquire.cluster.global.or.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.acquire.gpu.global.or.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.acquire.sys.global.or.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
  }
  return *reinterpret_cast<_B64*>(&__bits);
#  else
  // The callee below is declared but never defined, so reaching this path on an unsupported
  // architecture surfaces as a link error whose symbol name explains the problem.
  ::cuda::std::uint64_t __placeholder = 0;
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  return *reinterpret_cast<_B64*>(&__placeholder);
#  endif
}
#endif // __cccl_ptx_isa >= 810
2077
+
2078
/*
// multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90
// .sem       = { .weak }
// .op        = { .xor }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
__device__ static inline B64 multimem_ld_reduce(
  cuda::ptx::sem_weak_t,
  cuda::ptx::op_xor_op_t,
  const B64* addr);

Executes the `.weak ... .xor.b64` form on `addr` and returns the 64-bit register result reinterpreted as B64.
*/
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <typename _B64, ::cuda::std::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline _B64 multimem_ld_reduce(::cuda::ptx::sem_weak_t, ::cuda::ptx::op_xor_op_t, const _B64* __addr)
{
  // Both the semantics (.weak) and the operation (.xor) are pinned by the tag parameter types,
  // so only the operand width needs checking here.
  static_assert(sizeof(_B64) == 8, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::uint64_t __bits;
  asm("multimem.ld_reduce.weak.global.xor.b64 %0, [%1];"
      : "=l"(__bits)
      : "l"(__as_ptr_gmem(__addr))
      : "memory");
  return *reinterpret_cast<_B64*>(&__bits);
#  else
  // The callee below is declared but never defined, so reaching this path on an unsupported
  // architecture surfaces as a link error whose symbol name explains the problem.
  ::cuda::std::uint64_t __placeholder = 0;
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  return *reinterpret_cast<_B64*>(&__placeholder);
#  endif
}
#endif // __cccl_ptx_isa >= 810
2108
+
2109
/*
// multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90
// .sem       = { .relaxed, .acquire }
// .scope     = { .cta, .cluster, .gpu, .sys }
// .op        = { .xor }
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope>
__device__ static inline B64 multimem_ld_reduce(
  cuda::ptx::sem_t<Sem> sem,
  cuda::ptx::scope_t<Scope> scope,
  cuda::ptx::op_xor_op_t,
  const B64* addr);

Selects, at compile time, the `.xor.b64` instruction string matching the (sem, scope) tag pair, executes it on
`addr`, and returns the 64-bit register result reinterpreted as B64.
*/
#if __cccl_ptx_isa >= 810
extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
template <typename _B64,
          ::cuda::std::enable_if_t<sizeof(_B64) == 8, bool> = true,
          ::cuda::ptx::dot_sem _Sem,
          ::cuda::ptx::dot_scope _Scope>
_CCCL_DEVICE static inline _B64 multimem_ld_reduce(
  ::cuda::ptx::sem_t<_Sem> __sem, ::cuda::ptx::scope_t<_Scope> __scope, ::cuda::ptx::op_xor_op_t, const _B64* __addr)
{
  // Compile-time validation of the tag arguments; the operation (.xor) is pinned by the op_xor_op_t parameter type.
  static_assert(sizeof(_B64) == 8, "");
  static_assert(__sem == sem_relaxed || __sem == sem_acquire, "");
  static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
  ::cuda::std::uint64_t __bits;
  // Dispatch first on the memory semantics, then on the scope. Exactly one asm statement survives instantiation.
  if constexpr (__sem == sem_relaxed)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.relaxed.cta.global.xor.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.relaxed.cluster.global.xor.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.relaxed.gpu.global.xor.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.relaxed.sys.global.xor.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
  }
  else if constexpr (__sem == sem_acquire)
  {
    if constexpr (__scope == scope_cta)
    {
      asm("multimem.ld_reduce.acquire.cta.global.xor.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_cluster)
    {
      asm("multimem.ld_reduce.acquire.cluster.global.xor.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_gpu)
    {
      asm("multimem.ld_reduce.acquire.gpu.global.xor.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
    else if constexpr (__scope == scope_sys)
    {
      asm("multimem.ld_reduce.acquire.sys.global.xor.b64 %0, [%1];"
          : "=l"(__bits)
          : "l"(__as_ptr_gmem(__addr))
          : "memory");
    }
  }
  return *reinterpret_cast<_B64*>(&__bits);
#  else
  // The callee below is declared but never defined, so reaching this path on an unsupported
  // architecture surfaces as a link error whose symbol name explains the problem.
  ::cuda::std::uint64_t __placeholder = 0;
  __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__();
  return *reinterpret_cast<_B64*>(&__placeholder);
#  endif
}
#endif // __cccl_ptx_isa >= 810
2201
+
2202
+ #endif // _CUDA_PTX_GENERATED_MULTIMEM_LD_REDUCE_H_