cuda_cccl-0.1.3.1.0.dev1486-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic.

Files changed (1819)
  1. cuda/cccl/__init__.py +14 -0
  2. cuda/cccl/cooperative/__init__.py +3 -0
  3. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  4. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  5. cuda/cccl/cooperative/experimental/_common.py +276 -0
  6. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  7. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  8. cuda/cccl/cooperative/experimental/_types.py +953 -0
  9. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  10. cuda/cccl/cooperative/experimental/block/__init__.py +33 -0
  11. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  12. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  13. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  14. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  15. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  16. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  17. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +98 -0
  18. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  20. cuda/cccl/headers/__init__.py +7 -0
  21. cuda/cccl/headers/include/__init__.py +1 -0
  22. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +261 -0
  23. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
  24. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  25. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +919 -0
  26. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +227 -0
  27. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +752 -0
  28. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  29. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
  32. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +678 -0
  33. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
  34. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +997 -0
  35. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
  36. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
  37. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  38. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1032 -0
  39. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +342 -0
  40. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
  41. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  42. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1346 -0
  43. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  44. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  45. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
  46. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  47. cuda/cccl/headers/include/cub/block/block_load.cuh +1259 -0
  48. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
  49. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
  50. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
  51. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  52. cuda/cccl/headers/include/cub/block/block_reduce.cuh +629 -0
  53. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
  54. cuda/cccl/headers/include/cub/block/block_scan.cuh +2600 -0
  55. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  56. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  57. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
  58. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  59. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  60. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +259 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  65. cuda/cccl/headers/include/cub/config.cuh +60 -0
  66. cuda/cccl/headers/include/cub/cub.cuh +112 -0
  67. cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
  68. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
  69. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
  70. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  71. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
  72. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +120 -0
  73. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +74 -0
  74. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  75. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  76. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  77. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
  78. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  79. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  82. cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
  83. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  84. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  85. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +355 -0
  86. cuda/cccl/headers/include/cub/detail/type_traits.cuh +206 -0
  87. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
  88. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  89. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  90. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  91. cuda/cccl/headers/include/cub/device/device_for.cuh +994 -0
  92. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
  93. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  94. cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
  95. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  96. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  97. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3431 -0
  98. cuda/cccl/headers/include/cub/device/device_reduce.cuh +1387 -0
  99. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +368 -0
  100. cuda/cccl/headers/include/cub/device/device_scan.cuh +1901 -0
  101. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  102. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
  103. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  104. cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
  105. cuda/cccl/headers/include/cub/device/device_transform.cuh +313 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1051 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1748 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +625 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +502 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +548 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +1374 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +439 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +552 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +397 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +338 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +523 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +437 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +397 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +555 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +283 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  154. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +215 -0
  155. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  156. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  157. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
  158. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +238 -0
  159. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
  160. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
  161. cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
  162. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +629 -0
  163. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +504 -0
  164. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +340 -0
  165. cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
  166. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +406 -0
  167. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
  168. cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
  169. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  170. cuda/cccl/headers/include/cub/util_arch.cuh +163 -0
  171. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  172. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  173. cuda/cccl/headers/include/cub/util_device.cuh +779 -0
  174. cuda/cccl/headers/include/cub/util_macro.cuh +91 -0
  175. cuda/cccl/headers/include/cub/util_math.cuh +115 -0
  176. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  177. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  178. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  179. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  180. cuda/cccl/headers/include/cub/util_type.cuh +1111 -0
  181. cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
  182. cuda/cccl/headers/include/cub/version.cuh +89 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  184. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  185. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
  186. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
  187. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +688 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +437 -0
  189. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  190. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  191. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  192. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
  193. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1156 -0
  194. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +169 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +210 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +84 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +127 -0
  201. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +209 -0
  202. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  203. cuda/cccl/headers/include/cuda/__barrier/aligned_size.h +61 -0
  204. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier.h +66 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +100 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +454 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +72 -0
  209. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  210. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +61 -0
  211. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  212. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  213. cuda/cccl/headers/include/cuda/__bit/bitmask.h +88 -0
  214. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +126 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  217. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  218. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +104 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +106 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__execution/determinism.h +90 -0
  225. cuda/cccl/headers/include/cuda/__execution/require.h +67 -0
  226. cuda/cccl/headers/include/cuda/__execution/tune.h +62 -0
  227. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  228. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +279 -0
  229. cuda/cccl/headers/include/cuda/__functional/get_device_address.h +58 -0
  230. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  231. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  232. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  233. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  234. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  235. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  236. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  237. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +261 -0
  238. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +407 -0
  239. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
  240. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +323 -0
  241. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +481 -0
  242. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  243. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +457 -0
  244. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  245. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +123 -0
  246. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  247. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  248. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  249. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  250. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  251. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +98 -0
  252. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +162 -0
  253. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +49 -0
  254. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  255. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  256. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +99 -0
  257. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  258. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  259. cuda/cccl/headers/include/cuda/__memory/address_space.h +86 -0
  260. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +94 -0
  261. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +158 -0
  262. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +73 -0
  263. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +129 -0
  264. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +653 -0
  265. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  266. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +57 -0
  267. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  268. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +101 -0
  269. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  270. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  271. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  272. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  273. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  274. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  275. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  276. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  277. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  278. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  279. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  280. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  281. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  282. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  283. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  284. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  285. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  286. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  287. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  288. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +193 -0
  289. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  290. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  291. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +957 -0
  292. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +288 -0
  293. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +596 -0
  294. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  295. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  296. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  297. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1445 -0
  298. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  299. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +117 -0
  300. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  301. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  302. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  303. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  304. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  305. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  306. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +62 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +101 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +62 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +15074 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +385 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +176 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +94 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +137 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +138 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +280 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +282 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2148 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1272 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +228 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +430 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1830 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +105 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +81 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +612 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +44 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4446 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4061 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6438 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +36 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4582 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +44 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +67 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +750 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +275 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  376. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  377. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +151 -0
  378. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  379. cuda/cccl/headers/include/cuda/__stream/get_stream.h +97 -0
  380. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +165 -0
  381. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  382. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  383. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +66 -0
  384. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +249 -0
  385. cuda/cccl/headers/include/cuda/access_property +26 -0
  386. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  387. cuda/cccl/headers/include/cuda/atomic +27 -0
  388. cuda/cccl/headers/include/cuda/barrier +262 -0
  389. cuda/cccl/headers/include/cuda/bit +29 -0
  390. cuda/cccl/headers/include/cuda/cmath +35 -0
  391. cuda/cccl/headers/include/cuda/discard_memory +61 -0
  392. cuda/cccl/headers/include/cuda/functional +31 -0
  393. cuda/cccl/headers/include/cuda/iterator +31 -0
  394. cuda/cccl/headers/include/cuda/latch +27 -0
  395. cuda/cccl/headers/include/cuda/mdspan +28 -0
  396. cuda/cccl/headers/include/cuda/memory +28 -0
  397. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  398. cuda/cccl/headers/include/cuda/numeric +28 -0
  399. cuda/cccl/headers/include/cuda/pipeline +579 -0
  400. cuda/cccl/headers/include/cuda/ptx +118 -0
  401. cuda/cccl/headers/include/cuda/semaphore +31 -0
  402. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +60 -0
  403. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +46 -0
  404. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +46 -0
  405. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +52 -0
  406. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  407. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
  408. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  409. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +143 -0
  410. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +79 -0
  411. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  412. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +74 -0
  413. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  414. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  415. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +129 -0
  416. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  417. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  418. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  419. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +64 -0
  420. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  421. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  422. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  423. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  424. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  425. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  426. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  427. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  428. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  429. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  430. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +51 -0
  431. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  432. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +58 -0
  433. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  434. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +50 -0
  435. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +69 -0
  436. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  437. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +188 -0
  438. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  439. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +83 -0
  440. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +72 -0
  441. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  442. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  443. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +70 -0
  444. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  445. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  446. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +88 -0
  447. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +71 -0
  448. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +141 -0
  449. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  450. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +88 -0
  451. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  452. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +89 -0
  453. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +46 -0
  454. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  455. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  456. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +121 -0
  457. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  458. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  459. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +95 -0
  460. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +89 -0
  461. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +103 -0
  462. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  463. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +99 -0
  464. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +69 -0
  465. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  466. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  467. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  468. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  469. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  470. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  471. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  472. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  473. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  474. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +264 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +123 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +135 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +129 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +72 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +77 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +156 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +96 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +127 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  495. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  496. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
  497. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  498. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  499. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +218 -0
  500. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  501. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  502. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  503. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  504. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  505. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  506. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  507. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  508. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  509. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +250 -0
  510. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +105 -0
  511. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  512. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +73 -0
  513. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  514. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  515. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  516. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  517. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  518. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  519. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +84 -0
  520. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +77 -0
  521. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +183 -0
  522. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  523. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  524. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  525. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  526. cuda/cccl/headers/include/cuda/std/__bit/integral.h +124 -0
  527. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  528. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1274 -0
  529. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  530. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  531. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +146 -0
  532. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +207 -0
  533. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +1343 -0
  534. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +216 -0
  535. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +43 -0
  536. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
  537. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
  538. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +129 -0
  539. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +124 -0
  540. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +326 -0
  541. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +35 -0
  542. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  543. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +129 -0
  544. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  545. cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
  546. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1234 -0
  547. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +267 -0
  548. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +176 -0
  549. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  550. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  551. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  552. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  553. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  554. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +112 -0
  555. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  556. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  557. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  558. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  559. cuda/cccl/headers/include/cuda/std/__charconv_ +30 -0
  560. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +240 -0
  561. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +187 -0
  562. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +620 -0
  563. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +207 -0
  564. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +181 -0
  565. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +250 -0
  566. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +213 -0
  567. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +250 -0
  568. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +323 -0
  569. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +163 -0
  570. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +201 -0
  571. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +176 -0
  572. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +129 -0
  573. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +106 -0
  574. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +503 -0
  575. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +236 -0
  576. cuda/cccl/headers/include/cuda/std/__cmath/nvbf16.h +58 -0
  577. cuda/cccl/headers/include/cuda/std/__cmath/nvfp16.h +58 -0
  578. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +180 -0
  579. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +877 -0
  580. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +155 -0
  581. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +170 -0
  582. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +292 -0
  583. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +351 -0
  584. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +350 -0
  585. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +135 -0
  586. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  587. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  588. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  589. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +46 -0
  590. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  591. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  592. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +274 -0
  593. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
  594. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +71 -0
  595. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  596. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +57 -0
  597. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  598. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  599. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  600. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  601. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  602. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  603. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  604. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  605. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
  606. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  607. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  608. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  609. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  610. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  611. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  612. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  613. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  614. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  615. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  616. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  617. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +143 -0
  618. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  619. cuda/cccl/headers/include/cuda/std/__execution/env.h +436 -0
  620. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  621. cuda/cccl/headers/include/cuda/std/__expected/expected.h +2002 -0
  622. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1078 -0
  623. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  624. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +178 -0
  625. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  626. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  627. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  628. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  629. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +172 -0
  630. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +103 -0
  631. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  632. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
  633. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +64 -0
  634. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  635. cuda/cccl/headers/include/cuda/std/__floating_point/nvfp_types.h +58 -0
  636. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  637. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  638. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  639. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  640. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  641. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  642. cuda/cccl/headers/include/cuda/std/__functional/bind.h +352 -0
  643. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +88 -0
  644. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  645. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +75 -0
  646. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +75 -0
  647. cuda/cccl/headers/include/cuda/std/__functional/compose.h +69 -0
  648. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  649. cuda/cccl/headers/include/cuda/std/__functional/function.h +1277 -0
  650. cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
  651. cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
  652. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +560 -0
  653. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
  654. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  655. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +214 -0
  656. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +121 -0
  657. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  658. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +127 -0
  659. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
  660. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
  661. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  662. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  663. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  664. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +67 -0
  665. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  666. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +278 -0
  667. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  668. cuda/cccl/headers/include/cuda/std/__fwd/array.h +36 -0
  669. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  670. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
  671. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  672. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  673. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  674. cuda/cccl/headers/include/cuda/std/__fwd/iterator_traits.h +40 -0
  675. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +73 -0
  676. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  677. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  678. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  679. cuda/cccl/headers/include/cuda/std/__fwd/span.h +38 -0
  680. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  681. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  682. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  683. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  684. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  685. cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
  686. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +102 -0
  687. cuda/cccl/headers/include/cuda/std/__iterator/access.h +132 -0
  688. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +230 -0
  689. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +103 -0
  690. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +264 -0
  691. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +608 -0
  692. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +469 -0
  693. cuda/cccl/headers/include/cuda/std/__iterator/data.h +63 -0
  694. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  695. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  696. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +54 -0
  697. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  698. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +98 -0
  699. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +152 -0
  700. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  701. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +105 -0
  702. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +141 -0
  703. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  704. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  705. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  706. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  707. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +935 -0
  708. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  709. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +401 -0
  710. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  711. cuda/cccl/headers/include/cuda/std/__iterator/next.h +102 -0
  712. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +99 -0
  713. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +101 -0
  714. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  715. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +92 -0
  716. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  717. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
  718. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +146 -0
  719. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +615 -0
  720. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  721. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  722. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +88 -0
  723. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +259 -0
  724. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  725. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  726. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +781 -0
  727. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  728. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +55 -0
  729. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +140 -0
  730. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +134 -0
  731. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +328 -0
  732. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +100 -0
  733. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  734. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +74 -0
  735. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +363 -0
  736. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +765 -0
  737. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +317 -0
  738. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +310 -0
  739. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +615 -0
  740. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  741. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  742. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +190 -0
  743. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +347 -0
  744. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
  745. cuda/cccl/headers/include/cuda/std/__memory/align.h +87 -0
  746. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  747. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  748. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  749. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  750. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  751. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +569 -0
  752. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  753. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  754. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +231 -0
  755. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  756. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  757. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
  758. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +260 -0
  759. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  760. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +686 -0
  761. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +771 -0
  762. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +55 -0
  763. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  764. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  765. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  766. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  767. cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
  768. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  769. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +57 -0
  770. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  771. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  772. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +80 -0
  773. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  774. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  775. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  776. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +100 -0
  777. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +70 -0
  778. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +61 -0
  779. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  780. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  781. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  782. cuda/cccl/headers/include/cuda/std/__ranges/access.h +304 -0
  783. cuda/cccl/headers/include/cuda/std/__ranges/all.h +97 -0
  784. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +313 -0
  785. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  786. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  787. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  788. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +111 -0
  789. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  790. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  791. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +77 -0
  792. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  793. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +271 -0
  794. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  795. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  796. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +114 -0
  797. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  798. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  799. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  800. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +343 -0
  801. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +156 -0
  802. cuda/cccl/headers/include/cuda/std/__ranges/size.h +200 -0
  803. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  804. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +263 -0
  805. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +531 -0
  806. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  807. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +181 -0
  808. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  809. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
  810. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  811. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  812. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +591 -0
  813. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +299 -0
  814. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  815. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  816. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  817. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  818. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
  819. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  820. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  821. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +144 -0
  822. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  823. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  824. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  825. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +236 -0
  826. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +216 -0
  827. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  828. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  829. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
  830. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
  831. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  832. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  833. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +242 -0
  834. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  835. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  836. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  837. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  838. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  839. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  840. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  841. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  842. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  843. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  844. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
  845. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  846. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +174 -0
  847. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  848. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  849. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  850. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  851. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  852. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  853. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  854. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  855. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  856. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  857. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
  858. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
  859. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  860. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
  861. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
  862. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  863. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  864. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  865. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  866. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
  867. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  868. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  869. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  870. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
  871. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
  872. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  873. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  874. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  875. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
  876. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  877. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  878. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  879. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +43 -0
  880. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  881. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  882. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
  883. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
  884. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  885. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
  886. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
  887. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  888. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  889. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  890. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  891. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  892. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  893. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  894. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
  895. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  896. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  897. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  898. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  899. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  900. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  901. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  902. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  903. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  904. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  905. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  906. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +79 -0
  907. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  908. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  909. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  910. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  911. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  912. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
  913. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  914. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +87 -0
  915. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +43 -0
  916. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
  917. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  918. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  919. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  920. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  921. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
  922. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  923. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  924. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  925. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  926. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
  927. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +203 -0
  928. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
  929. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
  930. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
  931. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
  932. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
  933. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
  934. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
  935. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
  936. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
  937. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
  938. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  939. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
  940. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  941. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  942. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  943. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  944. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  945. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  946. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  947. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  948. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  949. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  950. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  951. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  952. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  953. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
  954. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  955. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  956. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  957. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  958. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  959. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  960. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  961. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  962. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  963. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  964. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  965. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  966. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  967. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  968. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  969. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1069 -0
  970. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +132 -0
  971. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
  972. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  973. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  974. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +32 -0
  975. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  976. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +103 -0
  977. cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
  978. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +162 -0
  979. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  980. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  981. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +56 -0
  982. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
  983. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  984. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  985. cuda/cccl/headers/include/cuda/std/__utility/move.h +75 -0
  986. cuda/cccl/headers/include/cuda/std/__utility/pair.h +808 -0
  987. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  988. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +763 -0
  989. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  990. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  991. cuda/cccl/headers/include/cuda/std/__utility/swap.h +65 -0
  992. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  993. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +425 -0
  994. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  995. cuda/cccl/headers/include/cuda/std/array +527 -0
  996. cuda/cccl/headers/include/cuda/std/atomic +823 -0
  997. cuda/cccl/headers/include/cuda/std/barrier +43 -0
  998. cuda/cccl/headers/include/cuda/std/bit +35 -0
  999. cuda/cccl/headers/include/cuda/std/bitset +1026 -0
  1000. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1001. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1002. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1003. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1004. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1005. cuda/cccl/headers/include/cuda/std/cmath +25 -0
  1006. cuda/cccl/headers/include/cuda/std/complex +25 -0
  1007. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1008. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1009. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1010. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1011. cuda/cccl/headers/include/cuda/std/cstring +111 -0
  1012. cuda/cccl/headers/include/cuda/std/ctime +147 -0
  1013. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1014. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +258 -0
  1015. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +2692 -0
  1016. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3689 -0
  1017. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +685 -0
  1018. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/complex +1610 -0
  1019. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1020. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/optional +1786 -0
  1021. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1022. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1378 -0
  1023. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2160 -0
  1024. cuda/cccl/headers/include/cuda/std/execution +27 -0
  1025. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1026. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1027. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1028. cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
  1029. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1030. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1031. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1032. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1033. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1034. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1035. cuda/cccl/headers/include/cuda/std/numbers +335 -0
  1036. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1037. cuda/cccl/headers/include/cuda/std/optional +25 -0
  1038. cuda/cccl/headers/include/cuda/std/ranges +68 -0
  1039. cuda/cccl/headers/include/cuda/std/ratio +417 -0
  1040. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1041. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1042. cuda/cccl/headers/include/cuda/std/span +640 -0
  1043. cuda/cccl/headers/include/cuda/std/string_view +814 -0
  1044. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1045. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1046. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1047. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1048. cuda/cccl/headers/include/cuda/std/version +245 -0
  1049. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1050. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1051. cuda/cccl/headers/include/cuda/version +16 -0
  1052. cuda/cccl/headers/include/cuda/warp +28 -0
  1053. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1054. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1055. cuda/cccl/headers/include/nv/detail/__target_macros +599 -0
  1056. cuda/cccl/headers/include/nv/target +229 -0
  1057. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1058. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1059. cuda/cccl/headers/include/thrust/advance.h +59 -0
  1060. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1061. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1062. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1063. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1064. cuda/cccl/headers/include/thrust/count.h +245 -0
  1065. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1066. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1067. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1068. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1069. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1070. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1071. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1072. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1073. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1074. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1075. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1076. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1077. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1078. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1079. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1080. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1081. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1082. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +81 -0
  1083. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1084. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1085. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1086. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1087. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1088. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1089. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1090. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1091. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1092. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1093. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1094. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1095. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1096. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1097. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1098. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1099. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1100. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1101. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1102. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1103. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1104. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1105. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1106. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1107. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1108. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1109. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1110. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1111. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1112. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1113. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1114. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1115. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1116. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1117. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1118. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1119. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1120. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1121. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1122. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1123. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1124. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1125. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1126. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1127. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1128. cuda/cccl/headers/include/thrust/detail/device_malloc.inl +60 -0
  1129. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1130. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1131. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1132. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1133. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1134. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1135. cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
  1136. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1137. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1138. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1139. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1140. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1141. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1142. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1143. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1144. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1145. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1146. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1147. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1148. cuda/cccl/headers/include/thrust/detail/internal_functional.h +285 -0
  1149. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1150. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +92 -0
  1151. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1152. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1153. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1154. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1155. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1156. cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
  1157. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1158. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1159. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1160. cuda/cccl/headers/include/thrust/detail/pointer.h +217 -0
  1161. cuda/cccl/headers/include/thrust/detail/pointer.inl +172 -0
  1162. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1163. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1164. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1165. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1166. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1167. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +189 -0
  1168. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1169. cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
  1170. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1171. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1172. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1173. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1174. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1175. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1176. cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
  1177. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1178. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1179. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1180. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1181. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1182. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1183. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1184. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1185. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1186. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +138 -0
  1187. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1188. cuda/cccl/headers/include/thrust/detail/transform.inl +250 -0
  1189. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1190. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1191. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +131 -0
  1192. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1193. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1194. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1195. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1196. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1197. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1198. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1199. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +60 -0
  1200. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
  1201. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1202. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1203. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1204. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1205. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1206. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1207. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1208. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1209. cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
  1210. cuda/cccl/headers/include/thrust/detail/vector_base.h +630 -0
  1211. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1242 -0
  1212. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1213. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1214. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1215. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1216. cuda/cccl/headers/include/thrust/device_malloc.h +108 -0
  1217. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1218. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1219. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1220. cuda/cccl/headers/include/thrust/device_ptr.h +202 -0
  1221. cuda/cccl/headers/include/thrust/device_reference.h +986 -0
  1222. cuda/cccl/headers/include/thrust/device_vector.h +574 -0
  1223. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1224. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1225. cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
  1226. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1227. cuda/cccl/headers/include/thrust/fill.h +201 -0
  1228. cuda/cccl/headers/include/thrust/find.h +382 -0
  1229. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1230. cuda/cccl/headers/include/thrust/functional.h +396 -0
  1231. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1232. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1233. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1234. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1235. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +219 -0
  1236. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1237. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1238. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1239. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1240. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1241. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1242. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
  1243. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
  1244. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1245. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
  1246. cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
  1247. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
  1248. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1249. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1250. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1251. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1252. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +164 -0
  1253. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1254. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
  1255. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
  1256. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +245 -0
  1257. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +192 -0
  1258. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1259. cuda/cccl/headers/include/thrust/iterator/retag.h +74 -0
  1260. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +221 -0
  1261. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +184 -0
  1262. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1263. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1264. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1265. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1266. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1267. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +357 -0
  1268. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1269. cuda/cccl/headers/include/thrust/memory.h +395 -0
  1270. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1271. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1272. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1273. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1274. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1275. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1276. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +68 -0
  1277. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1278. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1279. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1280. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1281. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1282. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1283. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1284. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1285. cuda/cccl/headers/include/thrust/mr/tls_pool.h +65 -0
  1286. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1287. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1288. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1289. cuda/cccl/headers/include/thrust/partition.h +1383 -0
  1290. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1291. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1292. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1293. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1294. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1295. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1296. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1297. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1298. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1299. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1300. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1301. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1302. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1303. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1304. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
  1305. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1306. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1307. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1308. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1309. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1310. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1311. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1312. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1313. cuda/cccl/headers/include/thrust/random.h +120 -0
  1314. cuda/cccl/headers/include/thrust/reduce.h +1112 -0
  1315. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1316. cuda/cccl/headers/include/thrust/replace.h +827 -0
  1317. cuda/cccl/headers/include/thrust/reverse.h +213 -0
  1318. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1319. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1320. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1321. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1322. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1323. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1324. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1325. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1326. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1327. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1328. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1329. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1330. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1331. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1332. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
  1333. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1334. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1335. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1336. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1337. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1338. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1339. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1340. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1341. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1342. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1343. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1344. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1345. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1346. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1347. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
  1348. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1349. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1350. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1351. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1352. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1353. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1354. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1355. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1356. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1357. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1358. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1359. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1360. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1361. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1362. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1363. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1364. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1365. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1366. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1367. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1368. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1369. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1370. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1371. cuda/cccl/headers/include/thrust/system/cpp/detail/vector.inl +130 -0
  1372. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +161 -0
  1373. cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
  1374. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
  1375. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +119 -0
  1376. cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
  1377. cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
  1378. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1379. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1380. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1381. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1382. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
  1383. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1384. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1385. cuda/cccl/headers/include/thrust/system/cuda/detail/core/load_iterator.h +58 -0
  1386. cuda/cccl/headers/include/thrust/system/cuda/detail/core/make_load_iterator.h +60 -0
  1387. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1388. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +630 -0
  1389. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1390. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1391. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1392. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1393. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1394. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
  1395. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1396. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
  1397. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1398. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1399. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1400. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
  1401. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1402. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1403. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
  1404. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +98 -0
  1405. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
  1406. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1407. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1408. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1409. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1410. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1411. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1412. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
  1413. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
  1414. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1415. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1416. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1417. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +961 -0
  1418. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1000 -0
  1419. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1420. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +164 -0
  1421. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +88 -0
  1422. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1423. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
  1424. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
  1425. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1426. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1736 -0
  1427. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
  1428. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +75 -0
  1429. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1430. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1431. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1432. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +403 -0
  1433. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1434. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1435. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +94 -0
  1436. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
  1437. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +648 -0
  1438. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
  1439. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
  1440. cuda/cccl/headers/include/thrust/system/cuda/error.h +175 -0
  1441. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
  1442. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1443. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1444. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +140 -0
  1445. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1446. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1447. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1448. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1449. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1450. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1451. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1452. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1453. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1454. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1455. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1456. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1457. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1458. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1459. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1460. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1461. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1462. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1463. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1464. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1465. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1466. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1467. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +48 -0
  1468. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1469. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1470. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1471. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1472. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1473. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1474. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1475. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1476. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1477. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1478. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1479. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1480. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1481. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1482. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1483. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1484. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1485. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1486. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1487. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1488. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1489. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
  1490. cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
  1491. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1492. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1493. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1494. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
  1495. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
  1496. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
  1497. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
  1498. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
  1499. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
  1500. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
  1501. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
  1502. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
  1503. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
  1504. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
  1505. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
  1506. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
  1507. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
  1508. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
  1509. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
  1510. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
  1511. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
  1512. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
  1513. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
  1514. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
  1515. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1516. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
  1517. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
  1518. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
  1519. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
  1520. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1521. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
  1522. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
  1523. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
  1524. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
  1525. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
  1526. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
  1527. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
  1528. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
  1529. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
  1530. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
  1531. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
  1532. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
  1533. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
  1534. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
  1535. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
  1536. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
  1537. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +72 -0
  1538. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
  1539. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
  1540. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
  1541. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
  1542. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
  1543. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
  1544. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
  1545. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
  1546. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
  1547. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +55 -0
  1548. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.inl +95 -0
  1549. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
  1550. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
  1551. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
  1552. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
  1553. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
  1554. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
  1555. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
  1556. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
  1557. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
  1558. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
  1559. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
  1560. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
  1561. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
  1562. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +109 -0
  1563. cuda/cccl/headers/include/thrust/system/detail/generic/transform.inl +185 -0
  1564. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
  1565. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
  1566. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
  1567. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
  1568. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +187 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1635. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1636. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1637. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1638. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1639. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1640. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1641. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1642. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1643. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1644. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1645. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1646. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1647. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
  1648. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1649. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1650. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1651. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1652. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1653. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1654. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1655. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1656. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1657. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1658. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1659. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1660. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1661. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1662. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1663. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
  1664. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1665. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1666. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1667. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1668. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1669. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1670. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1671. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1672. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1673. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1674. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1675. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1676. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1677. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1678. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1679. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1680. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1681. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1682. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1683. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1684. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +259 -0
  1685. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1686. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1687. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1688. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1689. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1690. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1691. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1692. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1693. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1694. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1695. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1696. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1697. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +160 -0
  1698. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1699. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1700. cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
  1701. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1702. cuda/cccl/headers/include/thrust/system/system_error.h +184 -0
  1703. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1704. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1705. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1706. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1707. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1708. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1709. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1710. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1711. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1712. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
  1713. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1714. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1715. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1716. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1717. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1718. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1719. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1720. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1721. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1722. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1723. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1724. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1725. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1726. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1727. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1728. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1729. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
  1730. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1731. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1732. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1733. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1734. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1735. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1736. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1737. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1738. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1739. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1740. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1741. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1742. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1743. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1744. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1745. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1746. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1747. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1748. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1749. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1750. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1751. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1752. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1753. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1754. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1755. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1756. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1757. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1758. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1759. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1760. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1761. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1762. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +160 -0
  1763. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1764. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1765. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +120 -0
  1766. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1767. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1768. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1769. cuda/cccl/headers/include/thrust/transform.h +903 -0
  1770. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1771. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1772. cuda/cccl/headers/include/thrust/tuple.h +142 -0
  1773. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1774. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +182 -0
  1775. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1776. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1777. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1778. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +306 -0
  1779. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1780. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +93 -0
  1781. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1782. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1783. cuda/cccl/headers/include/thrust/unique.h +1090 -0
  1784. cuda/cccl/headers/include/thrust/universal_allocator.h +90 -0
  1785. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1786. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1787. cuda/cccl/headers/include/thrust/version.h +93 -0
  1788. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1789. cuda/cccl/headers/include_paths.py +72 -0
  1790. cuda/cccl/parallel/__init__.py +3 -0
  1791. cuda/cccl/parallel/experimental/__init__.py +3 -0
  1792. cuda/cccl/parallel/experimental/_bindings.py +24 -0
  1793. cuda/cccl/parallel/experimental/_bindings.pyi +388 -0
  1794. cuda/cccl/parallel/experimental/_bindings_impl.cpython-311-x86_64-linux-gnu.so +0 -0
  1795. cuda/cccl/parallel/experimental/_bindings_impl.pyx +2158 -0
  1796. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1797. cuda/cccl/parallel/experimental/_cccl_interop.py +371 -0
  1798. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1799. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1800. cuda/cccl/parallel/experimental/algorithms/__init__.py +28 -0
  1801. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +172 -0
  1802. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +244 -0
  1803. cuda/cccl/parallel/experimental/algorithms/_reduce.py +136 -0
  1804. cuda/cccl/parallel/experimental/algorithms/_scan.py +179 -0
  1805. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +183 -0
  1806. cuda/cccl/parallel/experimental/algorithms/_transform.py +213 -0
  1807. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +179 -0
  1808. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1809. cuda/cccl/parallel/experimental/cccl/libcccl.c.parallel.so +0 -0
  1810. cuda/cccl/parallel/experimental/iterators/__init__.py +157 -0
  1811. cuda/cccl/parallel/experimental/iterators/_iterators.py +650 -0
  1812. cuda/cccl/parallel/experimental/numba_utils.py +6 -0
  1813. cuda/cccl/parallel/experimental/struct.py +150 -0
  1814. cuda/cccl/parallel/experimental/typing.py +27 -0
  1815. cuda/cccl/py.typed +0 -0
  1816. cuda_cccl-0.1.3.1.0.dev1486.dist-info/METADATA +29 -0
  1817. cuda_cccl-0.1.3.1.0.dev1486.dist-info/RECORD +1819 -0
  1818. cuda_cccl-0.1.3.1.0.dev1486.dist-info/WHEEL +6 -0
  1819. cuda_cccl-0.1.3.1.0.dev1486.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,4061 @@
+ // This file was automatically generated. Do not edit.
+
+ #ifndef _CUDA_PTX_GENERATED_TCGEN05_MMA_H_
+ #define _CUDA_PTX_GENERATED_TCGEN05_MMA_H_
+
+ /*
+ // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; //
+ PTX ISA 86, SM_100a
+ // .kind = { .kind::f16, .kind::tf32 }
+ // .cta_group = { .cta_group::1 }
+ template <int N32, cuda::ptx::dot_kind Kind>
+ __device__ static inline void tcgen05_mma(
+ cuda::ptx::kind_t<Kind> kind,
+ cuda::ptx::cta_group_1_t,
+ uint32_t d_tmem,
+ uint64_t a_desc,
+ uint64_t b_desc,
+ uint32_t idesc,
+ const uint32_t (&disable_output_lane)[4],
+ bool enable_input_d,
+ cuda::ptx::n32_t<N32> scale_input_d);
+ */
+ #if __cccl_ptx_isa >= 860
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__();
+ template <int _N32, dot_kind _Kind>
+ _CCCL_DEVICE static inline void tcgen05_mma(
+ kind_t<_Kind> __kind,
+ cta_group_1_t,
+ _CUDA_VSTD::uint32_t __d_tmem,
+ _CUDA_VSTD::uint64_t __a_desc,
+ _CUDA_VSTD::uint64_t __b_desc,
+ _CUDA_VSTD::uint32_t __idesc,
+ const _CUDA_VSTD::uint32_t (&__disable_output_lane)[4],
+ bool __enable_input_d,
+ n32_t<_N32> __scale_input_d)
+ {
+ static_assert(__kind == kind_f16 || __kind == kind_tf32, "");
+ // __cta_group == cta_group_1 (due to parameter type constraint)
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL
+ if constexpr (__kind == kind_f16)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
+ "n"(__scale_input_d.value)
+ : "memory");
+ }
+ else if constexpr (__kind == kind_tf32)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
+ "n"(__scale_input_d.value)
+ : "memory");
+ }
+ # else
+ // Unsupported architectures will have a linker error with a semi-decent error message
+ __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__();
+ # endif
+ }
+ #endif // __cccl_ptx_isa >= 860
+
+ /*
+ // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; //
+ PTX ISA 86, SM_100a
+ // .kind = { .kind::f16, .kind::tf32 }
+ // .cta_group = { .cta_group::2 }
+ template <int N32, cuda::ptx::dot_kind Kind>
+ __device__ static inline void tcgen05_mma(
+ cuda::ptx::kind_t<Kind> kind,
+ cuda::ptx::cta_group_2_t,
+ uint32_t d_tmem,
+ uint64_t a_desc,
+ uint64_t b_desc,
+ uint32_t idesc,
+ const uint32_t (&disable_output_lane)[8],
+ bool enable_input_d,
+ cuda::ptx::n32_t<N32> scale_input_d);
+ */
+ #if __cccl_ptx_isa >= 860
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__();
+ template <int _N32, dot_kind _Kind>
+ _CCCL_DEVICE static inline void tcgen05_mma(
+ kind_t<_Kind> __kind,
+ cta_group_2_t,
+ _CUDA_VSTD::uint32_t __d_tmem,
+ _CUDA_VSTD::uint64_t __a_desc,
+ _CUDA_VSTD::uint64_t __b_desc,
+ _CUDA_VSTD::uint32_t __idesc,
+ const _CUDA_VSTD::uint32_t (&__disable_output_lane)[8],
+ bool __enable_input_d,
+ n32_t<_N32> __scale_input_d)
+ {
+ static_assert(__kind == kind_f16 || __kind == kind_tf32, "");
+ // __cta_group == cta_group_2 (due to parameter type constraint)
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL
+ if constexpr (__kind == kind_f16)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, PRED_enable_input_d, "
+ "%13;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(__disable_output_lane[4]),
+ "r"(__disable_output_lane[5]),
+ "r"(__disable_output_lane[6]),
+ "r"(__disable_output_lane[7]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
+ "n"(__scale_input_d.value)
+ : "memory");
+ }
+ else if constexpr (__kind == kind_tf32)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, PRED_enable_input_d, "
+ "%13;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(__disable_output_lane[4]),
+ "r"(__disable_output_lane[5]),
+ "r"(__disable_output_lane[6]),
+ "r"(__disable_output_lane[7]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
+ "n"(__scale_input_d.value)
+ : "memory");
+ }
+ # else
+ // Unsupported architectures will have a linker error with a semi-decent error message
+ __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__();
+ # endif
+ }
+ #endif // __cccl_ptx_isa >= 860
+
+ /*
+ // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86,
+ SM_100a, SM_101a
+ // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
+ // .cta_group = { .cta_group::1 }
+ template <cuda::ptx::dot_kind Kind>
+ __device__ static inline void tcgen05_mma(
+ cuda::ptx::kind_t<Kind> kind,
+ cuda::ptx::cta_group_1_t,
+ uint32_t d_tmem,
+ uint64_t a_desc,
+ uint64_t b_desc,
+ uint32_t idesc,
+ const uint32_t (&disable_output_lane)[4],
+ bool enable_input_d);
+ */
+ #if __cccl_ptx_isa >= 860
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__();
+ template <dot_kind _Kind>
+ _CCCL_DEVICE static inline void tcgen05_mma(
+ kind_t<_Kind> __kind,
+ cta_group_1_t,
+ _CUDA_VSTD::uint32_t __d_tmem,
+ _CUDA_VSTD::uint64_t __a_desc,
+ _CUDA_VSTD::uint64_t __b_desc,
+ _CUDA_VSTD::uint32_t __idesc,
+ const _CUDA_VSTD::uint32_t (&__disable_output_lane)[4],
+ bool __enable_input_d)
+ {
+ static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, "");
+ // __cta_group == cta_group_1 (due to parameter type constraint)
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+ if constexpr (__kind == kind_f16)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_tf32)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_f8f6f4)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_i8)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::i8 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ # else
+ // Unsupported architectures will have a linker error with a semi-decent error message
+ __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__();
+ # endif
+ }
+ #endif // __cccl_ptx_isa >= 860
+
+ /*
+ // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86,
+ SM_100a, SM_101a
+ // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
+ // .cta_group = { .cta_group::2 }
+ template <cuda::ptx::dot_kind Kind>
+ __device__ static inline void tcgen05_mma(
+ cuda::ptx::kind_t<Kind> kind,
+ cuda::ptx::cta_group_2_t,
+ uint32_t d_tmem,
+ uint64_t a_desc,
+ uint64_t b_desc,
+ uint32_t idesc,
+ const uint32_t (&disable_output_lane)[8],
+ bool enable_input_d);
+ */
+ #if __cccl_ptx_isa >= 860
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__();
+ template <dot_kind _Kind>
+ _CCCL_DEVICE static inline void tcgen05_mma(
+ kind_t<_Kind> __kind,
+ cta_group_2_t,
+ _CUDA_VSTD::uint32_t __d_tmem,
+ _CUDA_VSTD::uint64_t __a_desc,
+ _CUDA_VSTD::uint64_t __b_desc,
+ _CUDA_VSTD::uint32_t __idesc,
+ const _CUDA_VSTD::uint32_t (&__disable_output_lane)[8],
+ bool __enable_input_d)
+ {
+ static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, "");
+ // __cta_group == cta_group_2 (due to parameter type constraint)
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+ if constexpr (__kind == kind_f16)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, "
+ "PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(__disable_output_lane[4]),
+ "r"(__disable_output_lane[5]),
+ "r"(__disable_output_lane[6]),
+ "r"(__disable_output_lane[7]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_tf32)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, "
+ "PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(__disable_output_lane[4]),
+ "r"(__disable_output_lane[5]),
+ "r"(__disable_output_lane[6]),
+ "r"(__disable_output_lane[7]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_f8f6f4)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], %1, %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, "
+ "PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(__disable_output_lane[4]),
+ "r"(__disable_output_lane[5]),
+ "r"(__disable_output_lane[6]),
+ "r"(__disable_output_lane[7]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_i8)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::i8 [%0], %1, %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(__disable_output_lane[4]),
+ "r"(__disable_output_lane[5]),
+ "r"(__disable_output_lane[6]),
+ "r"(__disable_output_lane[7]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ # else
+ // Unsupported architectures will have a linker error with a semi-decent error message
+ __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__();
+ # endif
+ }
+ #endif // __cccl_ptx_isa >= 860
+
+ /*
+ // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a
+ // .kind = { .kind::f16, .kind::tf32 }
+ // .cta_group = { .cta_group::1, .cta_group::2 }
+ template <int N32, cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
+ __device__ static inline void tcgen05_mma(
+ cuda::ptx::kind_t<Kind> kind,
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
+ uint32_t d_tmem,
+ uint64_t a_desc,
+ uint64_t b_desc,
+ uint32_t idesc,
+ bool enable_input_d,
+ cuda::ptx::n32_t<N32> scale_input_d);
+ */
+ #if __cccl_ptx_isa >= 860
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__();
+ template <int _N32, dot_kind _Kind, dot_cta_group _Cta_Group>
+ _CCCL_DEVICE static inline void tcgen05_mma(
+ kind_t<_Kind> __kind,
+ cta_group_t<_Cta_Group> __cta_group,
+ _CUDA_VSTD::uint32_t __d_tmem,
+ _CUDA_VSTD::uint64_t __a_desc,
+ _CUDA_VSTD::uint64_t __b_desc,
+ _CUDA_VSTD::uint32_t __idesc,
+ bool __enable_input_d,
+ n32_t<_N32> __scale_input_d)
+ {
+ static_assert(__kind == kind_f16 || __kind == kind_tf32, "");
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL
+ if constexpr (__kind == kind_f16 && __cta_group == cta_group_1)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
+ "n"(__scale_input_d.value)
+ : "memory");
+ }
+ else if constexpr (__kind == kind_f16 && __cta_group == cta_group_2)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
+ "n"(__scale_input_d.value)
+ : "memory");
+ }
+ else if constexpr (__kind == kind_tf32 && __cta_group == cta_group_1)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
+ "n"(__scale_input_d.value)
+ : "memory");
+ }
+ else if constexpr (__kind == kind_tf32 && __cta_group == cta_group_2)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
+ "n"(__scale_input_d.value)
+ : "memory");
+ }
+ # else
+ // Unsupported architectures will have a linker error with a semi-decent error message
+ __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__();
+ # endif
+ }
+ #endif // __cccl_ptx_isa >= 860
+
+ /*
+ // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
+ // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
+ // .cta_group = { .cta_group::1, .cta_group::2 }
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
+ __device__ static inline void tcgen05_mma(
+ cuda::ptx::kind_t<Kind> kind,
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
+ uint32_t d_tmem,
+ uint64_t a_desc,
+ uint64_t b_desc,
+ uint32_t idesc,
+ bool enable_input_d);
+ */
+ #if __cccl_ptx_isa >= 860
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__();
+ template <dot_kind _Kind, dot_cta_group _Cta_Group>
+ _CCCL_DEVICE static inline void tcgen05_mma(
+ kind_t<_Kind> __kind,
+ cta_group_t<_Cta_Group> __cta_group,
+ _CUDA_VSTD::uint32_t __d_tmem,
+ _CUDA_VSTD::uint64_t __a_desc,
+ _CUDA_VSTD::uint64_t __b_desc,
+ _CUDA_VSTD::uint32_t __idesc,
+ bool __enable_input_d)
+ {
+ static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, "");
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+ if constexpr (__kind == kind_f16 && __cta_group == cta_group_1)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_f16 && __cta_group == cta_group_2)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_tf32 && __cta_group == cta_group_1)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_tf32 && __cta_group == cta_group_2)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_f8f6f4 && __cta_group == cta_group_1)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], %1, %2, %3, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_f8f6f4 && __cta_group == cta_group_2)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], %1, %2, %3, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_i8 && __cta_group == cta_group_1)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::i8 [%0], %1, %2, %3, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_i8 && __cta_group == cta_group_2)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::i8 [%0], %1, %2, %3, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ # else
+ // Unsupported architectures will have a linker error with a semi-decent error message
+ __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__();
+ # endif
+ }
+ #endif // __cccl_ptx_isa >= 860
+
+ /*
+ // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; //
+ PTX ISA 86, SM_100a
+ // .kind = { .kind::f16, .kind::tf32 }
+ // .cta_group = { .cta_group::1 }
+ template <int N32, cuda::ptx::dot_kind Kind>
+ __device__ static inline void tcgen05_mma_tmem_a(
+ cuda::ptx::kind_t<Kind> kind,
+ cuda::ptx::cta_group_1_t,
+ uint32_t d_tmem,
+ uint32_t a_tmem,
+ uint64_t b_desc,
+ uint32_t idesc,
+ const uint32_t (&disable_output_lane)[4],
+ bool enable_input_d,
+ cuda::ptx::n32_t<N32> scale_input_d);
+ */
+ #if __cccl_ptx_isa >= 860
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__();
+ template <int _N32, dot_kind _Kind>
+ _CCCL_DEVICE static inline void tcgen05_mma_tmem_a(
+ kind_t<_Kind> __kind,
+ cta_group_1_t,
+ _CUDA_VSTD::uint32_t __d_tmem,
+ _CUDA_VSTD::uint32_t __a_tmem,
+ _CUDA_VSTD::uint64_t __b_desc,
+ _CUDA_VSTD::uint32_t __idesc,
+ const _CUDA_VSTD::uint32_t (&__disable_output_lane)[4],
+ bool __enable_input_d,
+ n32_t<_N32> __scale_input_d)
+ {
+ static_assert(__kind == kind_f16 || __kind == kind_tf32, "");
+ // __cta_group == cta_group_1 (due to parameter type constraint)
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL
+ if constexpr (__kind == kind_f16)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "r"(__a_tmem),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
+ "n"(__scale_input_d.value)
+ : "memory");
+ }
+ else if constexpr (__kind == kind_tf32)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "r"(__a_tmem),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
+ "n"(__scale_input_d.value)
+ : "memory");
+ }
+ # else
+ // Unsupported architectures will have a linker error with a semi-decent error message
+ __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__();
+ # endif
+ }
+ #endif // __cccl_ptx_isa >= 860
+
+ /*
+ // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; //
+ PTX ISA 86, SM_100a
+ // .kind = { .kind::f16, .kind::tf32 }
+ // .cta_group = { .cta_group::2 }
+ template <int N32, cuda::ptx::dot_kind Kind>
+ __device__ static inline void tcgen05_mma_tmem_a(
+ cuda::ptx::kind_t<Kind> kind,
+ cuda::ptx::cta_group_2_t,
+ uint32_t d_tmem,
+ uint32_t a_tmem,
+ uint64_t b_desc,
+ uint32_t idesc,
+ const uint32_t (&disable_output_lane)[8],
+ bool enable_input_d,
+ cuda::ptx::n32_t<N32> scale_input_d);
+ */
+ #if __cccl_ptx_isa >= 860
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__();
+ template <int _N32, dot_kind _Kind>
+ _CCCL_DEVICE static inline void tcgen05_mma_tmem_a(
+ kind_t<_Kind> __kind,
+ cta_group_2_t,
+ _CUDA_VSTD::uint32_t __d_tmem,
+ _CUDA_VSTD::uint32_t __a_tmem,
+ _CUDA_VSTD::uint64_t __b_desc,
+ _CUDA_VSTD::uint32_t __idesc,
+ const _CUDA_VSTD::uint32_t (&__disable_output_lane)[8],
+ bool __enable_input_d,
+ n32_t<_N32> __scale_input_d)
+ {
+ static_assert(__kind == kind_f16 || __kind == kind_tf32, "");
+ // __cta_group == cta_group_2 (due to parameter type constraint)
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL
+ if constexpr (__kind == kind_f16)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, PRED_enable_input_d, "
+ "%13;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "r"(__a_tmem),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(__disable_output_lane[4]),
+ "r"(__disable_output_lane[5]),
+ "r"(__disable_output_lane[6]),
+ "r"(__disable_output_lane[7]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
+ "n"(__scale_input_d.value)
+ : "memory");
+ }
+ else if constexpr (__kind == kind_tf32)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, "
+ "PRED_enable_input_d, %13;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "r"(__a_tmem),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(__disable_output_lane[4]),
+ "r"(__disable_output_lane[5]),
+ "r"(__disable_output_lane[6]),
+ "r"(__disable_output_lane[7]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
+ "n"(__scale_input_d.value)
+ : "memory");
+ }
+ # else
+ // Unsupported architectures will have a linker error with a semi-decent error message
+ __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__();
+ # endif
+ }
+ #endif // __cccl_ptx_isa >= 860
+
+ /*
+ // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86,
+ SM_100a, SM_101a
+ // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
+ // .cta_group = { .cta_group::1 }
+ template <cuda::ptx::dot_kind Kind>
+ __device__ static inline void tcgen05_mma_tmem_a(
+ cuda::ptx::kind_t<Kind> kind,
+ cuda::ptx::cta_group_1_t,
+ uint32_t d_tmem,
+ uint32_t a_tmem,
+ uint64_t b_desc,
+ uint32_t idesc,
+ const uint32_t (&disable_output_lane)[4],
+ bool enable_input_d);
+ */
+ #if __cccl_ptx_isa >= 860
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__();
+ template <dot_kind _Kind>
+ _CCCL_DEVICE static inline void tcgen05_mma_tmem_a(
+ kind_t<_Kind> __kind,
+ cta_group_1_t,
+ _CUDA_VSTD::uint32_t __d_tmem,
+ _CUDA_VSTD::uint32_t __a_tmem,
+ _CUDA_VSTD::uint64_t __b_desc,
+ _CUDA_VSTD::uint32_t __idesc,
+ const _CUDA_VSTD::uint32_t (&__disable_output_lane)[4],
+ bool __enable_input_d)
+ {
+ static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, "");
+ // __cta_group == cta_group_1 (due to parameter type constraint)
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+ if constexpr (__kind == kind_f16)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "r"(__a_tmem),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_tf32)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "r"(__a_tmem),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_f8f6f4)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "r"(__a_tmem),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_i8)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::i8 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "r"(__a_tmem),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__disable_output_lane[0]),
+ "r"(__disable_output_lane[1]),
+ "r"(__disable_output_lane[2]),
+ "r"(__disable_output_lane[3]),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ # else
+ // Unsupported architectures will have a linker error with a semi-decent error message
+ __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__();
+ # endif
+ }
+ #endif // __cccl_ptx_isa >= 860
971
+
972
+ /*
973
+ // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86,
974
+ SM_100a, SM_101a
975
+ // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
976
+ // .cta_group = { .cta_group::2 }
977
+ template <cuda::ptx::dot_kind Kind>
978
+ __device__ static inline void tcgen05_mma_tmem_a(
979
+ cuda::ptx::kind_t<Kind> kind,
980
+ cuda::ptx::cta_group_2_t,
981
+ uint32_t d_tmem,
982
+ uint32_t a_tmem,
983
+ uint64_t b_desc,
984
+ uint32_t idesc,
985
+ const uint32_t (&disable_output_lane)[8],
986
+ bool enable_input_d);
987
+ */
988
+ #if __cccl_ptx_isa >= 860
989
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__();
990
+ template <dot_kind _Kind>
991
+ _CCCL_DEVICE static inline void tcgen05_mma_tmem_a(
992
+ kind_t<_Kind> __kind,
993
+ cta_group_2_t,
994
+ _CUDA_VSTD::uint32_t __d_tmem,
995
+ _CUDA_VSTD::uint32_t __a_tmem,
996
+ _CUDA_VSTD::uint64_t __b_desc,
997
+ _CUDA_VSTD::uint32_t __idesc,
998
+ const _CUDA_VSTD::uint32_t (&__disable_output_lane)[8],
999
+ bool __enable_input_d)
1000
+ {
1001
+ static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, "");
1002
+ // __cta_group == cta_group_2 (due to parameter type constraint)
1003
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1004
+ if constexpr (__kind == kind_f16)
1005
+ {
1006
+ asm volatile(
1007
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1008
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
1009
+ "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, "
1010
+ "PRED_enable_input_d;\n\t"
1011
+ "}"
1012
+ :
1013
+ : "r"(__d_tmem),
1014
+ "r"(__a_tmem),
1015
+ "l"(__b_desc),
1016
+ "r"(__idesc),
1017
+ "r"(__disable_output_lane[0]),
1018
+ "r"(__disable_output_lane[1]),
1019
+ "r"(__disable_output_lane[2]),
1020
+ "r"(__disable_output_lane[3]),
1021
+ "r"(__disable_output_lane[4]),
1022
+ "r"(__disable_output_lane[5]),
1023
+ "r"(__disable_output_lane[6]),
1024
+ "r"(__disable_output_lane[7]),
1025
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1026
+ : "memory");
1027
+ }
1028
+ else if constexpr (__kind == kind_tf32)
1029
+ {
1030
+ asm volatile(
1031
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1032
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
1033
+ "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, "
1034
+ "PRED_enable_input_d;\n\t"
1035
+ "}"
1036
+ :
1037
+ : "r"(__d_tmem),
1038
+ "r"(__a_tmem),
1039
+ "l"(__b_desc),
1040
+ "r"(__idesc),
1041
+ "r"(__disable_output_lane[0]),
1042
+ "r"(__disable_output_lane[1]),
1043
+ "r"(__disable_output_lane[2]),
1044
+ "r"(__disable_output_lane[3]),
1045
+ "r"(__disable_output_lane[4]),
1046
+ "r"(__disable_output_lane[5]),
1047
+ "r"(__disable_output_lane[6]),
1048
+ "r"(__disable_output_lane[7]),
1049
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1050
+ : "memory");
1051
+ }
1052
+ else if constexpr (__kind == kind_f8f6f4)
1053
+ {
1054
+ asm volatile(
1055
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1056
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
1057
+ "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], [%1], %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, "
1058
+ "PRED_enable_input_d;\n\t"
1059
+ "}"
1060
+ :
1061
+ : "r"(__d_tmem),
1062
+ "r"(__a_tmem),
1063
+ "l"(__b_desc),
1064
+ "r"(__idesc),
1065
+ "r"(__disable_output_lane[0]),
1066
+ "r"(__disable_output_lane[1]),
1067
+ "r"(__disable_output_lane[2]),
1068
+ "r"(__disable_output_lane[3]),
1069
+ "r"(__disable_output_lane[4]),
1070
+ "r"(__disable_output_lane[5]),
1071
+ "r"(__disable_output_lane[6]),
1072
+ "r"(__disable_output_lane[7]),
1073
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1074
+ : "memory");
1075
+ }
1076
+ else if constexpr (__kind == kind_i8)
1077
+ {
1078
+ asm volatile(
1079
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1080
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
1081
+ "tcgen05.mma.cta_group::2.kind::i8 [%0], [%1], %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, "
1082
+ "PRED_enable_input_d;\n\t"
1083
+ "}"
1084
+ :
1085
+ : "r"(__d_tmem),
1086
+ "r"(__a_tmem),
1087
+ "l"(__b_desc),
1088
+ "r"(__idesc),
1089
+ "r"(__disable_output_lane[0]),
1090
+ "r"(__disable_output_lane[1]),
1091
+ "r"(__disable_output_lane[2]),
1092
+ "r"(__disable_output_lane[3]),
1093
+ "r"(__disable_output_lane[4]),
1094
+ "r"(__disable_output_lane[5]),
1095
+ "r"(__disable_output_lane[6]),
1096
+ "r"(__disable_output_lane[7]),
1097
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1098
+ : "memory");
1099
+ }
1100
+ # else
1101
+ // Unsupported architectures will have a linker error with a semi-decent error message
1102
+ __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__();
1103
+ # endif
1104
+ }
1105
+ #endif // __cccl_ptx_isa >= 860
1106
+
1107
+ /*
1108
+ // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a
1109
+ // .kind = { .kind::f16, .kind::tf32 }
1110
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1111
+ template <int N32, cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
1112
+ __device__ static inline void tcgen05_mma_tmem_a(
1113
+ cuda::ptx::kind_t<Kind> kind,
1114
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1115
+ uint32_t d_tmem,
1116
+ uint32_t a_tmem,
1117
+ uint64_t b_desc,
1118
+ uint32_t idesc,
1119
+ bool enable_input_d,
1120
+ cuda::ptx::n32_t<N32> scale_input_d);
1121
+ */
1122
+ #if __cccl_ptx_isa >= 860
1123
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__();
1124
+ template <int _N32, dot_kind _Kind, dot_cta_group _Cta_Group>
1125
+ _CCCL_DEVICE static inline void tcgen05_mma_tmem_a(
1126
+ kind_t<_Kind> __kind,
1127
+ cta_group_t<_Cta_Group> __cta_group,
1128
+ _CUDA_VSTD::uint32_t __d_tmem,
1129
+ _CUDA_VSTD::uint32_t __a_tmem,
1130
+ _CUDA_VSTD::uint64_t __b_desc,
1131
+ _CUDA_VSTD::uint32_t __idesc,
1132
+ bool __enable_input_d,
1133
+ n32_t<_N32> __scale_input_d)
1134
+ {
1135
+ static_assert(__kind == kind_f16 || __kind == kind_tf32, "");
1136
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1137
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL
1138
+ if constexpr (__kind == kind_f16 && __cta_group == cta_group_1)
1139
+ {
1140
+ asm volatile(
1141
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1142
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1143
+ "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t"
1144
+ "}"
1145
+ :
1146
+ : "r"(__d_tmem),
1147
+ "r"(__a_tmem),
1148
+ "l"(__b_desc),
1149
+ "r"(__idesc),
1150
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
1151
+ "n"(__scale_input_d.value)
1152
+ : "memory");
1153
+ }
1154
+ else if constexpr (__kind == kind_f16 && __cta_group == cta_group_2)
1155
+ {
1156
+ asm volatile(
1157
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1158
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1159
+ "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t"
1160
+ "}"
1161
+ :
1162
+ : "r"(__d_tmem),
1163
+ "r"(__a_tmem),
1164
+ "l"(__b_desc),
1165
+ "r"(__idesc),
1166
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
1167
+ "n"(__scale_input_d.value)
1168
+ : "memory");
1169
+ }
1170
+ else if constexpr (__kind == kind_tf32 && __cta_group == cta_group_1)
1171
+ {
1172
+ asm volatile(
1173
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1174
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1175
+ "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t"
1176
+ "}"
1177
+ :
1178
+ : "r"(__d_tmem),
1179
+ "r"(__a_tmem),
1180
+ "l"(__b_desc),
1181
+ "r"(__idesc),
1182
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
1183
+ "n"(__scale_input_d.value)
1184
+ : "memory");
1185
+ }
1186
+ else if constexpr (__kind == kind_tf32 && __cta_group == cta_group_2)
1187
+ {
1188
+ asm volatile(
1189
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1190
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1191
+ "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t"
1192
+ "}"
1193
+ :
1194
+ : "r"(__d_tmem),
1195
+ "r"(__a_tmem),
1196
+ "l"(__b_desc),
1197
+ "r"(__idesc),
1198
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
1199
+ "n"(__scale_input_d.value)
1200
+ : "memory");
1201
+ }
1202
+ # else
1203
+ // Unsupported architectures will have a linker error with a semi-decent error message
1204
+ __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__();
1205
+ # endif
1206
+ }
1207
+ #endif // __cccl_ptx_isa >= 860
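A second sketch, this time for the overload above that takes no disable_output_lane mask. Because the kind and cta_group arguments are compile-time tag values, each call resolves to exactly one of the `if constexpr` branches and emits a single tcgen05.mma instruction. Same assumptions and placeholders as the earlier sketch.

// Illustrative sketch only; assumed tag objects, placeholder operands.
#include <cuda/ptx>
#include <cstdint>

__global__ void mma_tf32_cg1(std::uint32_t d_tmem, std::uint32_t a_tmem,
                             std::uint64_t b_desc, std::uint32_t idesc)
{
  if (threadIdx.x == 0)
  {
    cuda::ptx::tcgen05_mma_tmem_a(
      cuda::ptx::kind_tf32,     // picks the .kind::tf32 branch
      cuda::ptx::cta_group_1,   // picks the .cta_group::1 branch
      d_tmem, a_tmem, b_desc, idesc,
      /*enable_input_d=*/false, // overwrite D instead of accumulating
      cuda::ptx::n32_t<0>{});   // scale_input_d (placeholder immediate)
  }
}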
1208
+
1209
+ /*
1210
+ // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a
1211
+ // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
1212
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1213
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
1214
+ __device__ static inline void tcgen05_mma_tmem_a(
1215
+ cuda::ptx::kind_t<Kind> kind,
1216
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1217
+ uint32_t d_tmem,
1218
+ uint32_t a_tmem,
1219
+ uint64_t b_desc,
1220
+ uint32_t idesc,
1221
+ bool enable_input_d);
1222
+ */
1223
+ #if __cccl_ptx_isa >= 860
1224
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__();
1225
+ template <dot_kind _Kind, dot_cta_group _Cta_Group>
1226
+ _CCCL_DEVICE static inline void tcgen05_mma_tmem_a(
1227
+ kind_t<_Kind> __kind,
1228
+ cta_group_t<_Cta_Group> __cta_group,
1229
+ _CUDA_VSTD::uint32_t __d_tmem,
1230
+ _CUDA_VSTD::uint32_t __a_tmem,
1231
+ _CUDA_VSTD::uint64_t __b_desc,
1232
+ _CUDA_VSTD::uint32_t __idesc,
1233
+ bool __enable_input_d)
1234
+ {
1235
+ static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, "");
1236
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1237
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1238
+ if constexpr (__kind == kind_f16 && __cta_group == cta_group_1)
1239
+ {
1240
+ asm volatile(
1241
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1242
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1243
+ "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t"
1244
+ "}"
1245
+ :
1246
+ : "r"(__d_tmem),
1247
+ "r"(__a_tmem),
1248
+ "l"(__b_desc),
1249
+ "r"(__idesc),
1250
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1251
+ : "memory");
1252
+ }
1253
+ else if constexpr (__kind == kind_f16 && __cta_group == cta_group_2)
1254
+ {
1255
+ asm volatile(
1256
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1257
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1258
+ "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t"
1259
+ "}"
1260
+ :
1261
+ : "r"(__d_tmem),
1262
+ "r"(__a_tmem),
1263
+ "l"(__b_desc),
1264
+ "r"(__idesc),
1265
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1266
+ : "memory");
1267
+ }
1268
+ else if constexpr (__kind == kind_tf32 && __cta_group == cta_group_1)
1269
+ {
1270
+ asm volatile(
1271
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1272
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1273
+ "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t"
1274
+ "}"
1275
+ :
1276
+ : "r"(__d_tmem),
1277
+ "r"(__a_tmem),
1278
+ "l"(__b_desc),
1279
+ "r"(__idesc),
1280
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1281
+ : "memory");
1282
+ }
1283
+ else if constexpr (__kind == kind_tf32 && __cta_group == cta_group_2)
1284
+ {
1285
+ asm volatile(
1286
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1287
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1288
+ "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t"
1289
+ "}"
1290
+ :
1291
+ : "r"(__d_tmem),
1292
+ "r"(__a_tmem),
1293
+ "l"(__b_desc),
1294
+ "r"(__idesc),
1295
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1296
+ : "memory");
1297
+ }
1298
+ else if constexpr (__kind == kind_f8f6f4 && __cta_group == cta_group_1)
1299
+ {
1300
+ asm volatile(
1301
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1302
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1303
+ "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t"
1304
+ "}"
1305
+ :
1306
+ : "r"(__d_tmem),
1307
+ "r"(__a_tmem),
1308
+ "l"(__b_desc),
1309
+ "r"(__idesc),
1310
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1311
+ : "memory");
1312
+ }
1313
+ else if constexpr (__kind == kind_f8f6f4 && __cta_group == cta_group_2)
1314
+ {
1315
+ asm volatile(
1316
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1317
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1318
+ "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t"
1319
+ "}"
1320
+ :
1321
+ : "r"(__d_tmem),
1322
+ "r"(__a_tmem),
1323
+ "l"(__b_desc),
1324
+ "r"(__idesc),
1325
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1326
+ : "memory");
1327
+ }
1328
+ else if constexpr (__kind == kind_i8 && __cta_group == cta_group_1)
1329
+ {
1330
+ asm volatile(
1331
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1332
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1333
+ "tcgen05.mma.cta_group::1.kind::i8 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t"
1334
+ "}"
1335
+ :
1336
+ : "r"(__d_tmem),
1337
+ "r"(__a_tmem),
1338
+ "l"(__b_desc),
1339
+ "r"(__idesc),
1340
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1341
+ : "memory");
1342
+ }
1343
+ else if constexpr (__kind == kind_i8 && __cta_group == cta_group_2)
1344
+ {
1345
+ asm volatile(
1346
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1347
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1348
+ "tcgen05.mma.cta_group::2.kind::i8 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t"
1349
+ "}"
1350
+ :
1351
+ : "r"(__d_tmem),
1352
+ "r"(__a_tmem),
1353
+ "l"(__b_desc),
1354
+ "r"(__idesc),
1355
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1356
+ : "memory");
1357
+ }
1358
+ # else
1359
+ // Unsupported architectures will have a linker error with a semi-decent error message
1360
+ __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__();
1361
+ # endif
1362
+ }
1363
+ #endif // __cccl_ptx_isa >= 860
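A sketch of the 8-bit integer path through the overload above. Its static_assert restricts this entry point to f16, tf32, f8f6f4 and i8; the block-scaled kinds (mxf8f6f4, mxf4, mxf4nvf4) are served by the tcgen05_mma_block_scale_vec_* wrappers that follow. Assumed tag objects and placeholder operands as before.

// Illustrative sketch only.
#include <cuda/ptx>
#include <cstdint>

__global__ void mma_i8_cg2(std::uint32_t d_tmem, std::uint32_t a_tmem,
                           std::uint64_t b_desc, std::uint32_t idesc)
{
  if (threadIdx.x == 0)
  {
    cuda::ptx::tcgen05_mma_tmem_a(
      cuda::ptx::kind_i8,       // integer MMA, .kind::i8
      cuda::ptx::cta_group_2,   // .cta_group::2
      d_tmem, a_tmem, b_desc, idesc,
      /*enable_input_d=*/true);
  }
}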
1364
+
1365
+ /*
1366
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem],
1367
+ enable_input_d; // PTX ISA 86, SM_100a, SM_101a
1368
+ // .kind = { .kind::mxf8f6f4 }
1369
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1370
+ template <cuda::ptx::dot_cta_group Cta_Group>
1371
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x(
1372
+ cuda::ptx::kind_mxf8f6f4_t,
1373
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1374
+ uint32_t d_tmem,
1375
+ uint64_t a_desc,
1376
+ uint64_t b_desc,
1377
+ uint32_t idesc,
1378
+ uint32_t scale_A_tmem,
1379
+ uint32_t scale_B_tmem,
1380
+ bool enable_input_d);
1381
+ */
1382
+ #if __cccl_ptx_isa >= 860
1383
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_block_scale_vec_1x_is_not_supported_before_SM_100a_SM_101a__();
1384
+ template <dot_cta_group _Cta_Group>
1385
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x(
1386
+ kind_mxf8f6f4_t,
1387
+ cta_group_t<_Cta_Group> __cta_group,
1388
+ _CUDA_VSTD::uint32_t __d_tmem,
1389
+ _CUDA_VSTD::uint64_t __a_desc,
1390
+ _CUDA_VSTD::uint64_t __b_desc,
1391
+ _CUDA_VSTD::uint32_t __idesc,
1392
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
1393
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
1394
+ bool __enable_input_d)
1395
+ {
1396
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
1397
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1398
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1399
+ if constexpr (__cta_group == cta_group_1)
1400
+ {
1401
+ asm volatile(
1402
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1403
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1404
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], "
1405
+ "PRED_enable_input_d;\n\t"
1406
+ "}"
1407
+ :
1408
+ : "r"(__d_tmem),
1409
+ "l"(__a_desc),
1410
+ "l"(__b_desc),
1411
+ "r"(__idesc),
1412
+ "r"(__scale_A_tmem),
1413
+ "r"(__scale_B_tmem),
1414
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1415
+ : "memory");
1416
+ }
1417
+ else if constexpr (__cta_group == cta_group_2)
1418
+ {
1419
+ asm volatile(
1420
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1421
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1422
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], "
1423
+ "PRED_enable_input_d;\n\t"
1424
+ "}"
1425
+ :
1426
+ : "r"(__d_tmem),
1427
+ "l"(__a_desc),
1428
+ "l"(__b_desc),
1429
+ "r"(__idesc),
1430
+ "r"(__scale_A_tmem),
1431
+ "r"(__scale_B_tmem),
1432
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1433
+ : "memory");
1434
+ }
1435
+ # else
1436
+ // Unsupported architectures will have a linker error with a semi-decent error message
1437
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_is_not_supported_before_SM_100a_SM_101a__();
1438
+ # endif
1439
+ }
1440
+ #endif // __cccl_ptx_isa >= 860
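A sketch for the block-scaled entry point above: A and B arrive through 64-bit matrix descriptors, and the per-block scale factors are read from tensor memory at [scale_A_tmem] and [scale_B_tmem]. Only .kind::mxf8f6f4 is accepted for scale_vec::1X here. The kind_mxf8f6f4 and cta_group_1 tag objects are assumed; all operands are placeholders.

// Illustrative sketch only.
#include <cuda/ptx>
#include <cstdint>

__global__ void mma_mxf8f6f4_scaled(std::uint32_t d_tmem,
                                    std::uint64_t a_desc, std::uint64_t b_desc,
                                    std::uint32_t idesc,
                                    std::uint32_t scale_A_tmem,
                                    std::uint32_t scale_B_tmem)
{
  if (threadIdx.x == 0)
  {
    cuda::ptx::tcgen05_mma_block_scale_vec_1x(
      cuda::ptx::kind_mxf8f6f4, // only kind accepted by this overload
      cuda::ptx::cta_group_1,
      d_tmem, a_desc, b_desc, idesc,
      scale_A_tmem, scale_B_tmem,
      /*enable_input_d=*/true);
  }
}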
1441
+
1442
+ /*
1443
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem],
1444
+ enable_input_d; // PTX ISA 86, SM_100a, SM_101a
1445
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
1446
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1447
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
1448
+ __device__ static inline void tcgen05_mma_block_scale_vec_2x(
1449
+ cuda::ptx::kind_t<Kind> kind,
1450
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1451
+ uint32_t d_tmem,
1452
+ uint64_t a_desc,
1453
+ uint64_t b_desc,
1454
+ uint32_t idesc,
1455
+ uint32_t scale_A_tmem,
1456
+ uint32_t scale_B_tmem,
1457
+ bool enable_input_d);
1458
+ */
1459
+ #if __cccl_ptx_isa >= 860
1460
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_block_scale_vec_2x_is_not_supported_before_SM_100a_SM_101a__();
1461
+ template <dot_kind _Kind, dot_cta_group _Cta_Group>
1462
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x(
1463
+ kind_t<_Kind> __kind,
1464
+ cta_group_t<_Cta_Group> __cta_group,
1465
+ _CUDA_VSTD::uint32_t __d_tmem,
1466
+ _CUDA_VSTD::uint64_t __a_desc,
1467
+ _CUDA_VSTD::uint64_t __b_desc,
1468
+ _CUDA_VSTD::uint32_t __idesc,
1469
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
1470
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
1471
+ bool __enable_input_d)
1472
+ {
1473
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
1474
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1475
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1476
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
1477
+ {
1478
+ asm volatile(
1479
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1480
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1481
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], "
1482
+ "PRED_enable_input_d;\n\t"
1483
+ "}"
1484
+ :
1485
+ : "r"(__d_tmem),
1486
+ "l"(__a_desc),
1487
+ "l"(__b_desc),
1488
+ "r"(__idesc),
1489
+ "r"(__scale_A_tmem),
1490
+ "r"(__scale_B_tmem),
1491
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1492
+ : "memory");
1493
+ }
1494
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
1495
+ {
1496
+ asm volatile(
1497
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1498
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1499
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], "
1500
+ "PRED_enable_input_d;\n\t"
1501
+ "}"
1502
+ :
1503
+ : "r"(__d_tmem),
1504
+ "l"(__a_desc),
1505
+ "l"(__b_desc),
1506
+ "r"(__idesc),
1507
+ "r"(__scale_A_tmem),
1508
+ "r"(__scale_B_tmem),
1509
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1510
+ : "memory");
1511
+ }
1512
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
1513
+ {
1514
+ asm volatile(
1515
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1516
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1517
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], "
1518
+ "PRED_enable_input_d;\n\t"
1519
+ "}"
1520
+ :
1521
+ : "r"(__d_tmem),
1522
+ "l"(__a_desc),
1523
+ "l"(__b_desc),
1524
+ "r"(__idesc),
1525
+ "r"(__scale_A_tmem),
1526
+ "r"(__scale_B_tmem),
1527
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1528
+ : "memory");
1529
+ }
1530
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
1531
+ {
1532
+ asm volatile(
1533
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1534
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1535
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], "
1536
+ "PRED_enable_input_d;\n\t"
1537
+ "}"
1538
+ :
1539
+ : "r"(__d_tmem),
1540
+ "l"(__a_desc),
1541
+ "l"(__b_desc),
1542
+ "r"(__idesc),
1543
+ "r"(__scale_A_tmem),
1544
+ "r"(__scale_B_tmem),
1545
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1546
+ : "memory");
1547
+ }
1548
+ # else
1549
+ // Unsupported architectures will have a linker error with a semi-decent error message
1550
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_is_not_supported_before_SM_100a_SM_101a__();
1551
+ # endif
1552
+ }
1553
+ #endif // __cccl_ptx_isa >= 860
1554
+
1555
+ /*
1556
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem],
1557
+ enable_input_d; // PTX ISA 86, SM_100a, SM_101a
1558
+ // .kind = { .kind::mxf4nvf4 }
1559
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1560
+ template <cuda::ptx::dot_cta_group Cta_Group>
1561
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x(
1562
+ cuda::ptx::kind_mxf4nvf4_t,
1563
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1564
+ uint32_t d_tmem,
1565
+ uint64_t a_desc,
1566
+ uint64_t b_desc,
1567
+ uint32_t idesc,
1568
+ uint32_t scale_A_tmem,
1569
+ uint32_t scale_B_tmem,
1570
+ bool enable_input_d);
1571
+ */
1572
+ #if __cccl_ptx_isa >= 860
1573
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_block_scale_vec_4x_is_not_supported_before_SM_100a_SM_101a__();
1574
+ template <dot_cta_group _Cta_Group>
1575
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x(
1576
+ kind_mxf4nvf4_t,
1577
+ cta_group_t<_Cta_Group> __cta_group,
1578
+ _CUDA_VSTD::uint32_t __d_tmem,
1579
+ _CUDA_VSTD::uint64_t __a_desc,
1580
+ _CUDA_VSTD::uint64_t __b_desc,
1581
+ _CUDA_VSTD::uint32_t __idesc,
1582
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
1583
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
1584
+ bool __enable_input_d)
1585
+ {
1586
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
1587
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1588
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1589
+ if constexpr (__cta_group == cta_group_1)
1590
+ {
1591
+ asm volatile(
1592
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1593
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1594
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], "
1595
+ "PRED_enable_input_d;\n\t"
1596
+ "}"
1597
+ :
1598
+ : "r"(__d_tmem),
1599
+ "l"(__a_desc),
1600
+ "l"(__b_desc),
1601
+ "r"(__idesc),
1602
+ "r"(__scale_A_tmem),
1603
+ "r"(__scale_B_tmem),
1604
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1605
+ : "memory");
1606
+ }
1607
+ else if constexpr (__cta_group == cta_group_2)
1608
+ {
1609
+ asm volatile(
1610
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1611
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1612
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], "
1613
+ "PRED_enable_input_d;\n\t"
1614
+ "}"
1615
+ :
1616
+ : "r"(__d_tmem),
1617
+ "l"(__a_desc),
1618
+ "l"(__b_desc),
1619
+ "r"(__idesc),
1620
+ "r"(__scale_A_tmem),
1621
+ "r"(__scale_B_tmem),
1622
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1623
+ : "memory");
1624
+ }
1625
+ # else
1626
+ // Unsupported architectures will have a linker error with a semi-decent error message
1627
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_is_not_supported_before_SM_100a_SM_101a__();
1628
+ # endif
1629
+ }
1630
+ #endif // __cccl_ptx_isa >= 860
1631
+
1632
+ /*
1633
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem],
1634
+ enable_input_d; // PTX ISA 86, SM_100a, SM_101a
1635
+ // .kind = { .kind::mxf8f6f4 }
1636
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1637
+ template <cuda::ptx::dot_cta_group Cta_Group>
1638
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a(
1639
+ cuda::ptx::kind_mxf8f6f4_t,
1640
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1641
+ uint32_t d_tmem,
1642
+ uint64_t a_desc,
1643
+ uint64_t b_desc,
1644
+ uint32_t idesc,
1645
+ uint32_t scale_A_tmem,
1646
+ uint32_t scale_B_tmem,
1647
+ bool enable_input_d);
1648
+ */
1649
+ #if __cccl_ptx_isa >= 860
1650
+ extern "C" _CCCL_DEVICE void
1651
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_is_not_supported_before_SM_100a_SM_101a__();
1652
+ template <dot_cta_group _Cta_Group>
1653
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a(
1654
+ kind_mxf8f6f4_t,
1655
+ cta_group_t<_Cta_Group> __cta_group,
1656
+ _CUDA_VSTD::uint32_t __d_tmem,
1657
+ _CUDA_VSTD::uint64_t __a_desc,
1658
+ _CUDA_VSTD::uint64_t __b_desc,
1659
+ _CUDA_VSTD::uint32_t __idesc,
1660
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
1661
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
1662
+ bool __enable_input_d)
1663
+ {
1664
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
1665
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1666
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1667
+ if constexpr (__cta_group == cta_group_1)
1668
+ {
1669
+ asm volatile(
1670
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1671
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1672
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], "
1673
+ "PRED_enable_input_d;\n\t"
1674
+ "}"
1675
+ :
1676
+ : "r"(__d_tmem),
1677
+ "l"(__a_desc),
1678
+ "l"(__b_desc),
1679
+ "r"(__idesc),
1680
+ "r"(__scale_A_tmem),
1681
+ "r"(__scale_B_tmem),
1682
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1683
+ : "memory");
1684
+ }
1685
+ else if constexpr (__cta_group == cta_group_2)
1686
+ {
1687
+ asm volatile(
1688
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1689
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1690
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], "
1691
+ "PRED_enable_input_d;\n\t"
1692
+ "}"
1693
+ :
1694
+ : "r"(__d_tmem),
1695
+ "l"(__a_desc),
1696
+ "l"(__b_desc),
1697
+ "r"(__idesc),
1698
+ "r"(__scale_A_tmem),
1699
+ "r"(__scale_B_tmem),
1700
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1701
+ : "memory");
1702
+ }
1703
+ # else
1704
+ // Unsupported architectures will have a linker error with a semi-decent error message
1705
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_is_not_supported_before_SM_100a_SM_101a__();
1706
+ # endif
1707
+ }
1708
+ #endif // __cccl_ptx_isa >= 860
1709
+
1710
+ /*
1711
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem],
1712
+ enable_input_d; // PTX ISA 86, SM_100a, SM_101a
1713
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
1714
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1715
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
1716
+ __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a(
1717
+ cuda::ptx::kind_t<Kind> kind,
1718
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1719
+ uint32_t d_tmem,
1720
+ uint64_t a_desc,
1721
+ uint64_t b_desc,
1722
+ uint32_t idesc,
1723
+ uint32_t scale_A_tmem,
1724
+ uint32_t scale_B_tmem,
1725
+ bool enable_input_d);
1726
+ */
1727
+ #if __cccl_ptx_isa >= 860
1728
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_is_not_supported_before_SM_100a_SM_101a__();
1729
+ template <dot_kind _Kind, dot_cta_group _Cta_Group>
1730
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a(
1731
+ kind_t<_Kind> __kind,
1732
+ cta_group_t<_Cta_Group> __cta_group,
1733
+ _CUDA_VSTD::uint32_t __d_tmem,
1734
+ _CUDA_VSTD::uint64_t __a_desc,
1735
+ _CUDA_VSTD::uint64_t __b_desc,
1736
+ _CUDA_VSTD::uint32_t __idesc,
1737
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
1738
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
1739
+ bool __enable_input_d)
1740
+ {
1741
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
1742
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1743
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1744
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
1745
+ {
1746
+ asm volatile(
1747
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1748
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1749
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], "
1750
+ "PRED_enable_input_d;\n\t"
1751
+ "}"
1752
+ :
1753
+ : "r"(__d_tmem),
1754
+ "l"(__a_desc),
1755
+ "l"(__b_desc),
1756
+ "r"(__idesc),
1757
+ "r"(__scale_A_tmem),
1758
+ "r"(__scale_B_tmem),
1759
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1760
+ : "memory");
1761
+ }
1762
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
1763
+ {
1764
+ asm volatile(
1765
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1766
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1767
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], "
1768
+ "PRED_enable_input_d;\n\t"
1769
+ "}"
1770
+ :
1771
+ : "r"(__d_tmem),
1772
+ "l"(__a_desc),
1773
+ "l"(__b_desc),
1774
+ "r"(__idesc),
1775
+ "r"(__scale_A_tmem),
1776
+ "r"(__scale_B_tmem),
1777
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1778
+ : "memory");
1779
+ }
1780
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
1781
+ {
1782
+ asm volatile(
1783
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1784
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1785
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], "
1786
+ "PRED_enable_input_d;\n\t"
1787
+ "}"
1788
+ :
1789
+ : "r"(__d_tmem),
1790
+ "l"(__a_desc),
1791
+ "l"(__b_desc),
1792
+ "r"(__idesc),
1793
+ "r"(__scale_A_tmem),
1794
+ "r"(__scale_B_tmem),
1795
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1796
+ : "memory");
1797
+ }
1798
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
1799
+ {
1800
+ asm volatile(
1801
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1802
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1803
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], "
1804
+ "PRED_enable_input_d;\n\t"
1805
+ "}"
1806
+ :
1807
+ : "r"(__d_tmem),
1808
+ "l"(__a_desc),
1809
+ "l"(__b_desc),
1810
+ "r"(__idesc),
1811
+ "r"(__scale_A_tmem),
1812
+ "r"(__scale_B_tmem),
1813
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1814
+ : "memory");
1815
+ }
1816
+ # else
1817
+ // Unsupported architectures will have a linker error with a semi-decent error message
1818
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_is_not_supported_before_SM_100a_SM_101a__();
1819
+ # endif
1820
+ }
1821
+ #endif // __cccl_ptx_isa >= 860
1822
+
1823
+ /*
1824
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem],
1825
+ enable_input_d; // PTX ISA 86, SM_100a, SM_101a
1826
+ // .kind = { .kind::mxf4nvf4 }
1827
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1828
+ template <cuda::ptx::dot_cta_group Cta_Group>
1829
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a(
1830
+ cuda::ptx::kind_mxf4nvf4_t,
1831
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1832
+ uint32_t d_tmem,
1833
+ uint64_t a_desc,
1834
+ uint64_t b_desc,
1835
+ uint32_t idesc,
1836
+ uint32_t scale_A_tmem,
1837
+ uint32_t scale_B_tmem,
1838
+ bool enable_input_d);
1839
+ */
1840
+ #if __cccl_ptx_isa >= 860
1841
+ extern "C" _CCCL_DEVICE void
1842
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_is_not_supported_before_SM_100a_SM_101a__();
1843
+ template <dot_cta_group _Cta_Group>
1844
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a(
1845
+ kind_mxf4nvf4_t,
1846
+ cta_group_t<_Cta_Group> __cta_group,
1847
+ _CUDA_VSTD::uint32_t __d_tmem,
1848
+ _CUDA_VSTD::uint64_t __a_desc,
1849
+ _CUDA_VSTD::uint64_t __b_desc,
1850
+ _CUDA_VSTD::uint32_t __idesc,
1851
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
1852
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
1853
+ bool __enable_input_d)
1854
+ {
1855
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
1856
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1857
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1858
+ if constexpr (__cta_group == cta_group_1)
1859
+ {
1860
+ asm volatile(
1861
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1862
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1863
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], "
1864
+ "PRED_enable_input_d;\n\t"
1865
+ "}"
1866
+ :
1867
+ : "r"(__d_tmem),
1868
+ "l"(__a_desc),
1869
+ "l"(__b_desc),
1870
+ "r"(__idesc),
1871
+ "r"(__scale_A_tmem),
1872
+ "r"(__scale_B_tmem),
1873
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1874
+ : "memory");
1875
+ }
1876
+ else if constexpr (__cta_group == cta_group_2)
1877
+ {
1878
+ asm volatile(
1879
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1880
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1881
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], "
1882
+ "PRED_enable_input_d;\n\t"
1883
+ "}"
1884
+ :
1885
+ : "r"(__d_tmem),
1886
+ "l"(__a_desc),
1887
+ "l"(__b_desc),
1888
+ "r"(__idesc),
1889
+ "r"(__scale_A_tmem),
1890
+ "r"(__scale_B_tmem),
1891
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1892
+ : "memory");
1893
+ }
1894
+ # else
1895
+ // Unsupported architectures will have a linker error with a semi-decent error message
1896
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_is_not_supported_before_SM_100a_SM_101a__();
1897
+ # endif
1898
+ }
1899
+ #endif // __cccl_ptx_isa >= 860
1900
+
1901
+ /*
1902
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc,
1903
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
1904
+ // .kind = { .kind::mxf8f6f4 }
1905
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1906
+ template <cuda::ptx::dot_cta_group Cta_Group>
1907
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_fill(
1908
+ cuda::ptx::kind_mxf8f6f4_t,
1909
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1910
+ uint32_t d_tmem,
1911
+ uint64_t a_desc,
1912
+ uint64_t b_desc,
1913
+ uint32_t idesc,
1914
+ uint32_t scale_A_tmem,
1915
+ uint32_t scale_B_tmem,
1916
+ bool enable_input_d);
1917
+ */
1918
+ #if __cccl_ptx_isa >= 860
1919
+ extern "C" _CCCL_DEVICE void
1920
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__();
1921
+ template <dot_cta_group _Cta_Group>
1922
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_fill(
1923
+ kind_mxf8f6f4_t,
1924
+ cta_group_t<_Cta_Group> __cta_group,
1925
+ _CUDA_VSTD::uint32_t __d_tmem,
1926
+ _CUDA_VSTD::uint64_t __a_desc,
1927
+ _CUDA_VSTD::uint64_t __b_desc,
1928
+ _CUDA_VSTD::uint32_t __idesc,
1929
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
1930
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
1931
+ bool __enable_input_d)
1932
+ {
1933
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
1934
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1935
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1936
+ if constexpr (__cta_group == cta_group_1)
1937
+ {
1938
+ asm volatile(
1939
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1940
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1941
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], "
1942
+ "[%5], PRED_enable_input_d;\n\t"
1943
+ "}"
1944
+ :
1945
+ : "r"(__d_tmem),
1946
+ "l"(__a_desc),
1947
+ "l"(__b_desc),
1948
+ "r"(__idesc),
1949
+ "r"(__scale_A_tmem),
1950
+ "r"(__scale_B_tmem),
1951
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1952
+ : "memory");
1953
+ }
1954
+ else if constexpr (__cta_group == cta_group_2)
1955
+ {
1956
+ asm volatile(
1957
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1958
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1959
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], "
1960
+ "[%5], PRED_enable_input_d;\n\t"
1961
+ "}"
1962
+ :
1963
+ : "r"(__d_tmem),
1964
+ "l"(__a_desc),
1965
+ "l"(__b_desc),
1966
+ "r"(__idesc),
1967
+ "r"(__scale_A_tmem),
1968
+ "r"(__scale_B_tmem),
1969
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
1970
+ : "memory");
1971
+ }
1972
+ # else
1973
+ // Unsupported architectures will have a linker error with a semi-decent error message
1974
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__();
1975
+ # endif
1976
+ }
1977
+ #endif // __cccl_ptx_isa >= 860
1978
+
1979
+ /*
1980
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc,
1981
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
1982
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
1983
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1984
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
1985
+ __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill(
1986
+ cuda::ptx::kind_t<Kind> kind,
1987
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1988
+ uint32_t d_tmem,
1989
+ uint64_t a_desc,
1990
+ uint64_t b_desc,
1991
+ uint32_t idesc,
1992
+ uint32_t scale_A_tmem,
1993
+ uint32_t scale_B_tmem,
1994
+ bool enable_input_d);
1995
+ */
1996
+ #if __cccl_ptx_isa >= 860
1997
+ extern "C" _CCCL_DEVICE void
1998
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__();
1999
+ template <dot_kind _Kind, dot_cta_group _Cta_Group>
2000
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill(
2001
+ kind_t<_Kind> __kind,
2002
+ cta_group_t<_Cta_Group> __cta_group,
2003
+ _CUDA_VSTD::uint32_t __d_tmem,
2004
+ _CUDA_VSTD::uint64_t __a_desc,
2005
+ _CUDA_VSTD::uint64_t __b_desc,
2006
+ _CUDA_VSTD::uint32_t __idesc,
2007
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
2008
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
2009
+ bool __enable_input_d)
2010
+ {
2011
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
2012
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2013
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2014
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
2015
+ {
2016
+ asm volatile(
2017
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2018
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2019
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], "
2020
+ "PRED_enable_input_d;\n\t"
2021
+ "}"
2022
+ :
2023
+ : "r"(__d_tmem),
2024
+ "l"(__a_desc),
2025
+ "l"(__b_desc),
2026
+ "r"(__idesc),
2027
+ "r"(__scale_A_tmem),
2028
+ "r"(__scale_B_tmem),
2029
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2030
+ : "memory");
2031
+ }
2032
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
2033
+ {
2034
+ asm volatile(
2035
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2036
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2037
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], "
2038
+ "PRED_enable_input_d;\n\t"
2039
+ "}"
2040
+ :
2041
+ : "r"(__d_tmem),
2042
+ "l"(__a_desc),
2043
+ "l"(__b_desc),
2044
+ "r"(__idesc),
2045
+ "r"(__scale_A_tmem),
2046
+ "r"(__scale_B_tmem),
2047
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2048
+ : "memory");
2049
+ }
2050
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
2051
+ {
2052
+ asm volatile(
2053
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2054
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2055
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], "
2056
+ "[%5], PRED_enable_input_d;\n\t"
2057
+ "}"
2058
+ :
2059
+ : "r"(__d_tmem),
2060
+ "l"(__a_desc),
2061
+ "l"(__b_desc),
2062
+ "r"(__idesc),
2063
+ "r"(__scale_A_tmem),
2064
+ "r"(__scale_B_tmem),
2065
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2066
+ : "memory");
2067
+ }
2068
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
2069
+ {
2070
+ asm volatile(
2071
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2072
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2073
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], "
2074
+ "[%5], PRED_enable_input_d;\n\t"
2075
+ "}"
2076
+ :
2077
+ : "r"(__d_tmem),
2078
+ "l"(__a_desc),
2079
+ "l"(__b_desc),
2080
+ "r"(__idesc),
2081
+ "r"(__scale_A_tmem),
2082
+ "r"(__scale_B_tmem),
2083
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2084
+ : "memory");
2085
+ }
2086
+ # else
2087
+ // Unsupported architectures will have a linker error with a semi-decent error message
2088
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__();
2089
+ # endif
2090
+ }
2091
+ #endif // __cccl_ptx_isa >= 860
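One more sketch, for the .collector::a::fill flavor above: per the PTX ISA naming, this variant additionally fills the A collector buffer so that later MMAs can reuse the A operand, while the call site looks the same as the plain block-scaled scale_vec::2X wrapper. kind_mxf4 and cta_group_1 are assumed tag objects; operands remain placeholders.

// Illustrative sketch only.
#include <cuda/ptx>
#include <cstdint>

__global__ void mma_mxf4_fill_a(std::uint32_t d_tmem,
                                std::uint64_t a_desc, std::uint64_t b_desc,
                                std::uint32_t idesc,
                                std::uint32_t scale_A_tmem,
                                std::uint32_t scale_B_tmem)
{
  if (threadIdx.x == 0)
  {
    cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill(
      cuda::ptx::kind_mxf4,   // .kind::mxf4 (.kind::mxf4nvf4 is also accepted)
      cuda::ptx::cta_group_1,
      d_tmem, a_desc, b_desc, idesc,
      scale_A_tmem, scale_B_tmem,
      /*enable_input_d=*/false);
  }
}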
2092
+
2093
+ /*
2094
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc,
2095
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
2096
+ // .kind = { .kind::mxf4nvf4 }
2097
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2098
+ template <cuda::ptx::dot_cta_group Cta_Group>
2099
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_fill(
2100
+ cuda::ptx::kind_mxf4nvf4_t,
2101
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2102
+ uint32_t d_tmem,
2103
+ uint64_t a_desc,
2104
+ uint64_t b_desc,
2105
+ uint32_t idesc,
2106
+ uint32_t scale_A_tmem,
2107
+ uint32_t scale_B_tmem,
2108
+ bool enable_input_d);
2109
+ */
2110
+ #if __cccl_ptx_isa >= 860
2111
+ extern "C" _CCCL_DEVICE void
2112
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__();
2113
+ template <dot_cta_group _Cta_Group>
2114
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_fill(
2115
+ kind_mxf4nvf4_t,
2116
+ cta_group_t<_Cta_Group> __cta_group,
2117
+ _CUDA_VSTD::uint32_t __d_tmem,
2118
+ _CUDA_VSTD::uint64_t __a_desc,
2119
+ _CUDA_VSTD::uint64_t __b_desc,
2120
+ _CUDA_VSTD::uint32_t __idesc,
2121
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
2122
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
2123
+ bool __enable_input_d)
2124
+ {
2125
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
2126
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2127
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2128
+ if constexpr (__cta_group == cta_group_1)
2129
+ {
2130
+ asm volatile(
2131
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2132
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2133
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], "
2134
+ "[%5], PRED_enable_input_d;\n\t"
2135
+ "}"
2136
+ :
2137
+ : "r"(__d_tmem),
2138
+ "l"(__a_desc),
2139
+ "l"(__b_desc),
2140
+ "r"(__idesc),
2141
+ "r"(__scale_A_tmem),
2142
+ "r"(__scale_B_tmem),
2143
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2144
+ : "memory");
2145
+ }
2146
+ else if constexpr (__cta_group == cta_group_2)
2147
+ {
2148
+ asm volatile(
2149
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2150
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2151
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], "
2152
+ "[%5], PRED_enable_input_d;\n\t"
2153
+ "}"
2154
+ :
2155
+ : "r"(__d_tmem),
2156
+ "l"(__a_desc),
2157
+ "l"(__b_desc),
2158
+ "r"(__idesc),
2159
+ "r"(__scale_A_tmem),
2160
+ "r"(__scale_B_tmem),
2161
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2162
+ : "memory");
2163
+ }
2164
+ # else
2165
+ // Unsupported architectures will have a linker error with a semi-decent error message
2166
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__();
2167
+ # endif
2168
+ }
2169
+ #endif // __cccl_ptx_isa >= 860
2170
+
2171
+ /*
2172
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc,
2173
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
2174
+ // .kind = { .kind::mxf8f6f4 }
2175
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2176
+ template <cuda::ptx::dot_cta_group Cta_Group>
2177
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill(
2178
+ cuda::ptx::kind_mxf8f6f4_t,
2179
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2180
+ uint32_t d_tmem,
2181
+ uint64_t a_desc,
2182
+ uint64_t b_desc,
2183
+ uint32_t idesc,
2184
+ uint32_t scale_A_tmem,
2185
+ uint32_t scale_B_tmem,
2186
+ bool enable_input_d);
2187
+ */
2188
+ #if __cccl_ptx_isa >= 860
2189
+ extern "C" _CCCL_DEVICE void
2190
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__();
2191
+ template <dot_cta_group _Cta_Group>
2192
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill(
2193
+ kind_mxf8f6f4_t,
2194
+ cta_group_t<_Cta_Group> __cta_group,
2195
+ _CUDA_VSTD::uint32_t __d_tmem,
2196
+ _CUDA_VSTD::uint64_t __a_desc,
2197
+ _CUDA_VSTD::uint64_t __b_desc,
2198
+ _CUDA_VSTD::uint32_t __idesc,
2199
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
2200
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
2201
+ bool __enable_input_d)
2202
+ {
2203
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
2204
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2205
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2206
+ if constexpr (__cta_group == cta_group_1)
2207
+ {
2208
+ asm volatile(
2209
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2210
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2211
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], "
2212
+ "[%5], PRED_enable_input_d;\n\t"
2213
+ "}"
2214
+ :
2215
+ : "r"(__d_tmem),
2216
+ "l"(__a_desc),
2217
+ "l"(__b_desc),
2218
+ "r"(__idesc),
2219
+ "r"(__scale_A_tmem),
2220
+ "r"(__scale_B_tmem),
2221
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2222
+ : "memory");
2223
+ }
2224
+ else if constexpr (__cta_group == cta_group_2)
2225
+ {
2226
+ asm volatile(
2227
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2228
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2229
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], "
2230
+ "[%5], PRED_enable_input_d;\n\t"
2231
+ "}"
2232
+ :
2233
+ : "r"(__d_tmem),
2234
+ "l"(__a_desc),
2235
+ "l"(__b_desc),
2236
+ "r"(__idesc),
2237
+ "r"(__scale_A_tmem),
2238
+ "r"(__scale_B_tmem),
2239
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2240
+ : "memory");
2241
+ }
2242
+ # else
2243
+ // Unsupported architectures will have a linker error with a semi-decent error message
2244
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__();
2245
+ # endif
2246
+ }
2247
+ #endif // __cccl_ptx_isa >= 860
2248
+
2249
+ /*
2250
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc,
2251
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
2252
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
2253
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2254
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
2255
+ __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill(
2256
+ cuda::ptx::kind_t<Kind> kind,
2257
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2258
+ uint32_t d_tmem,
2259
+ uint64_t a_desc,
2260
+ uint64_t b_desc,
2261
+ uint32_t idesc,
2262
+ uint32_t scale_A_tmem,
2263
+ uint32_t scale_B_tmem,
2264
+ bool enable_input_d);
2265
+ */
2266
+ #if __cccl_ptx_isa >= 860
2267
+ extern "C" _CCCL_DEVICE void
2268
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__();
2269
+ template <dot_kind _Kind, dot_cta_group _Cta_Group>
2270
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill(
2271
+ kind_t<_Kind> __kind,
2272
+ cta_group_t<_Cta_Group> __cta_group,
2273
+ _CUDA_VSTD::uint32_t __d_tmem,
2274
+ _CUDA_VSTD::uint64_t __a_desc,
2275
+ _CUDA_VSTD::uint64_t __b_desc,
2276
+ _CUDA_VSTD::uint32_t __idesc,
2277
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
2278
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
2279
+ bool __enable_input_d)
2280
+ {
2281
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
2282
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2283
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2284
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
2285
+ {
2286
+ asm volatile(
2287
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2288
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2289
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], "
2290
+ "PRED_enable_input_d;\n\t"
2291
+ "}"
2292
+ :
2293
+ : "r"(__d_tmem),
2294
+ "l"(__a_desc),
2295
+ "l"(__b_desc),
2296
+ "r"(__idesc),
2297
+ "r"(__scale_A_tmem),
2298
+ "r"(__scale_B_tmem),
2299
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2300
+ : "memory");
2301
+ }
2302
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
2303
+ {
2304
+ asm volatile(
2305
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2306
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2307
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], "
2308
+ "PRED_enable_input_d;\n\t"
2309
+ "}"
2310
+ :
2311
+ : "r"(__d_tmem),
2312
+ "l"(__a_desc),
2313
+ "l"(__b_desc),
2314
+ "r"(__idesc),
2315
+ "r"(__scale_A_tmem),
2316
+ "r"(__scale_B_tmem),
2317
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2318
+ : "memory");
2319
+ }
2320
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
2321
+ {
2322
+ asm volatile(
2323
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2324
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2325
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], "
2326
+ "[%5], PRED_enable_input_d;\n\t"
2327
+ "}"
2328
+ :
2329
+ : "r"(__d_tmem),
2330
+ "l"(__a_desc),
2331
+ "l"(__b_desc),
2332
+ "r"(__idesc),
2333
+ "r"(__scale_A_tmem),
2334
+ "r"(__scale_B_tmem),
2335
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2336
+ : "memory");
2337
+ }
2338
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
2339
+ {
2340
+ asm volatile(
2341
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2342
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2343
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], "
2344
+ "[%5], PRED_enable_input_d;\n\t"
2345
+ "}"
2346
+ :
2347
+ : "r"(__d_tmem),
2348
+ "l"(__a_desc),
2349
+ "l"(__b_desc),
2350
+ "r"(__idesc),
2351
+ "r"(__scale_A_tmem),
2352
+ "r"(__scale_B_tmem),
2353
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2354
+ : "memory");
2355
+ }
2356
+ # else
2357
+ // Unsupported architectures will have a linker error with a semi-decent error message
2358
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__();
2359
+ # endif
2360
+ }
2361
+ #endif // __cccl_ptx_isa >= 860
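Editorial note, not part of the package diff: the kind-templated overload above is dispatched with the cuda::ptx tag values named in its static_asserts (kind_mxf4 / kind_mxf4nvf4, cta_group_1 / cta_group_2). A minimal, hedged device-side call sketch follows; it assumes compilation for an SM_100a/SM_101a target with PTX ISA 8.6, and that the caller has already produced valid tensor-memory addresses and matrix descriptors (the d_tmem, a_desc, b_desc, idesc and scale tmem arguments are placeholders, not values derived from this diff).

// Hedged usage sketch (editorial); assumes SM_100a/SM_101a compilation and
// descriptors/tmem addresses prepared elsewhere by the caller.
#include <cstdint>
#include <cuda/ptx>

__device__ void example_mma_block_scale_2x_fill(
  uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc,
  uint32_t idesc, uint32_t scale_A_tmem, uint32_t scale_B_tmem)
{
  // Select .kind::mxf4 and .cta_group::1 via the cuda::ptx tag objects;
  // enable_input_d = true keeps the existing accumulator contents of [d_tmem].
  cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill(
    cuda::ptx::kind_mxf4, cuda::ptx::cta_group_1,
    d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem,
    /*enable_input_d=*/true);
}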
2362
+
2363
+ /*
2364
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc,
2365
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
2366
+ // .kind = { .kind::mxf4nvf4 }
2367
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2368
+ template <cuda::ptx::dot_cta_group Cta_Group>
2369
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill(
2370
+ cuda::ptx::kind_mxf4nvf4_t,
2371
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2372
+ uint32_t d_tmem,
2373
+ uint64_t a_desc,
2374
+ uint64_t b_desc,
2375
+ uint32_t idesc,
2376
+ uint32_t scale_A_tmem,
2377
+ uint32_t scale_B_tmem,
2378
+ bool enable_input_d);
2379
+ */
2380
+ #if __cccl_ptx_isa >= 860
2381
+ extern "C" _CCCL_DEVICE void
2382
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__();
2383
+ template <dot_cta_group _Cta_Group>
2384
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill(
2385
+ kind_mxf4nvf4_t,
2386
+ cta_group_t<_Cta_Group> __cta_group,
2387
+ _CUDA_VSTD::uint32_t __d_tmem,
2388
+ _CUDA_VSTD::uint64_t __a_desc,
2389
+ _CUDA_VSTD::uint64_t __b_desc,
2390
+ _CUDA_VSTD::uint32_t __idesc,
2391
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
2392
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
2393
+ bool __enable_input_d)
2394
+ {
2395
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
2396
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2397
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2398
+ if constexpr (__cta_group == cta_group_1)
2399
+ {
2400
+ asm volatile(
2401
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2402
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2403
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], "
2404
+ "[%5], PRED_enable_input_d;\n\t"
2405
+ "}"
2406
+ :
2407
+ : "r"(__d_tmem),
2408
+ "l"(__a_desc),
2409
+ "l"(__b_desc),
2410
+ "r"(__idesc),
2411
+ "r"(__scale_A_tmem),
2412
+ "r"(__scale_B_tmem),
2413
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2414
+ : "memory");
2415
+ }
2416
+ else if constexpr (__cta_group == cta_group_2)
2417
+ {
2418
+ asm volatile(
2419
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2420
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2421
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], "
2422
+ "[%5], PRED_enable_input_d;\n\t"
2423
+ "}"
2424
+ :
2425
+ : "r"(__d_tmem),
2426
+ "l"(__a_desc),
2427
+ "l"(__b_desc),
2428
+ "r"(__idesc),
2429
+ "r"(__scale_A_tmem),
2430
+ "r"(__scale_B_tmem),
2431
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2432
+ : "memory");
2433
+ }
2434
+ # else
2435
+ // Unsupported architectures will have a linker error with a semi-decent error message
2436
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__();
2437
+ # endif
2438
+ }
2439
+ #endif // __cccl_ptx_isa >= 860
2440
+
2441
+ /*
2442
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc,
2443
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
2444
+ // .kind = { .kind::mxf8f6f4 }
2445
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2446
+ template <cuda::ptx::dot_cta_group Cta_Group>
2447
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_use(
2448
+ cuda::ptx::kind_mxf8f6f4_t,
2449
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2450
+ uint32_t d_tmem,
2451
+ uint64_t a_desc,
2452
+ uint64_t b_desc,
2453
+ uint32_t idesc,
2454
+ uint32_t scale_A_tmem,
2455
+ uint32_t scale_B_tmem,
2456
+ bool enable_input_d);
2457
+ */
2458
+ #if __cccl_ptx_isa >= 860
2459
+ extern "C" _CCCL_DEVICE void
2460
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__();
2461
+ template <dot_cta_group _Cta_Group>
2462
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_use(
2463
+ kind_mxf8f6f4_t,
2464
+ cta_group_t<_Cta_Group> __cta_group,
2465
+ _CUDA_VSTD::uint32_t __d_tmem,
2466
+ _CUDA_VSTD::uint64_t __a_desc,
2467
+ _CUDA_VSTD::uint64_t __b_desc,
2468
+ _CUDA_VSTD::uint32_t __idesc,
2469
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
2470
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
2471
+ bool __enable_input_d)
2472
+ {
2473
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
2474
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2475
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2476
+ if constexpr (__cta_group == cta_group_1)
2477
+ {
2478
+ asm volatile(
2479
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2480
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2481
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], "
2482
+ "[%5], PRED_enable_input_d;\n\t"
2483
+ "}"
2484
+ :
2485
+ : "r"(__d_tmem),
2486
+ "l"(__a_desc),
2487
+ "l"(__b_desc),
2488
+ "r"(__idesc),
2489
+ "r"(__scale_A_tmem),
2490
+ "r"(__scale_B_tmem),
2491
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2492
+ : "memory");
2493
+ }
2494
+ else if constexpr (__cta_group == cta_group_2)
2495
+ {
2496
+ asm volatile(
2497
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2498
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2499
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], "
2500
+ "[%5], PRED_enable_input_d;\n\t"
2501
+ "}"
2502
+ :
2503
+ : "r"(__d_tmem),
2504
+ "l"(__a_desc),
2505
+ "l"(__b_desc),
2506
+ "r"(__idesc),
2507
+ "r"(__scale_A_tmem),
2508
+ "r"(__scale_B_tmem),
2509
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2510
+ : "memory");
2511
+ }
2512
+ # else
2513
+ // Unsupported architectures will have a linker error with a semi-decent error message
2514
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__();
2515
+ # endif
2516
+ }
2517
+ #endif // __cccl_ptx_isa >= 860
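Editorial note, not part of the package diff: the mxf8f6f4 overload above takes the fixed kind tag directly, so only the CTA group remains a compile-time choice. A hedged sketch of that shape, under the same assumptions as the earlier example (SM_100a/SM_101a target, PTX ISA 8.6, caller-provided descriptors), might look like:

// Hedged usage sketch (editorial); argument values are placeholders.
#include <cstdint>
#include <cuda/ptx>

__device__ void example_mma_block_scale_1x_use(
  uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc,
  uint32_t idesc, uint32_t scale_A_tmem, uint32_t scale_B_tmem)
{
  // Fixed .kind::mxf8f6f4 tag, .cta_group::2 selected via the cta_group_2 tag;
  // enable_input_d = false ignores the prior contents of [d_tmem].
  cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_use(
    cuda::ptx::kind_mxf8f6f4, cuda::ptx::cta_group_2,
    d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem,
    /*enable_input_d=*/false);
}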
2518
+
2519
+ /*
2520
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc,
2521
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
2522
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
2523
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2524
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
2525
+ __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use(
2526
+ cuda::ptx::kind_t<Kind> kind,
2527
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2528
+ uint32_t d_tmem,
2529
+ uint64_t a_desc,
2530
+ uint64_t b_desc,
2531
+ uint32_t idesc,
2532
+ uint32_t scale_A_tmem,
2533
+ uint32_t scale_B_tmem,
2534
+ bool enable_input_d);
2535
+ */
2536
+ #if __cccl_ptx_isa >= 860
2537
+ extern "C" _CCCL_DEVICE void
2538
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__();
2539
+ template <dot_kind _Kind, dot_cta_group _Cta_Group>
2540
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use(
2541
+ kind_t<_Kind> __kind,
2542
+ cta_group_t<_Cta_Group> __cta_group,
2543
+ _CUDA_VSTD::uint32_t __d_tmem,
2544
+ _CUDA_VSTD::uint64_t __a_desc,
2545
+ _CUDA_VSTD::uint64_t __b_desc,
2546
+ _CUDA_VSTD::uint32_t __idesc,
2547
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
2548
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
2549
+ bool __enable_input_d)
2550
+ {
2551
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
2552
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2553
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2554
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
2555
+ {
2556
+ asm volatile(
2557
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2558
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2559
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], "
2560
+ "PRED_enable_input_d;\n\t"
2561
+ "}"
2562
+ :
2563
+ : "r"(__d_tmem),
2564
+ "l"(__a_desc),
2565
+ "l"(__b_desc),
2566
+ "r"(__idesc),
2567
+ "r"(__scale_A_tmem),
2568
+ "r"(__scale_B_tmem),
2569
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2570
+ : "memory");
2571
+ }
2572
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
2573
+ {
2574
+ asm volatile(
2575
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2576
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2577
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], "
2578
+ "PRED_enable_input_d;\n\t"
2579
+ "}"
2580
+ :
2581
+ : "r"(__d_tmem),
2582
+ "l"(__a_desc),
2583
+ "l"(__b_desc),
2584
+ "r"(__idesc),
2585
+ "r"(__scale_A_tmem),
2586
+ "r"(__scale_B_tmem),
2587
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2588
+ : "memory");
2589
+ }
2590
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
2591
+ {
2592
+ asm volatile(
2593
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2594
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2595
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], "
2596
+ "[%5], PRED_enable_input_d;\n\t"
2597
+ "}"
2598
+ :
2599
+ : "r"(__d_tmem),
2600
+ "l"(__a_desc),
2601
+ "l"(__b_desc),
2602
+ "r"(__idesc),
2603
+ "r"(__scale_A_tmem),
2604
+ "r"(__scale_B_tmem),
2605
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2606
+ : "memory");
2607
+ }
2608
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
2609
+ {
2610
+ asm volatile(
2611
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2612
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2613
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], "
2614
+ "[%5], PRED_enable_input_d;\n\t"
2615
+ "}"
2616
+ :
2617
+ : "r"(__d_tmem),
2618
+ "l"(__a_desc),
2619
+ "l"(__b_desc),
2620
+ "r"(__idesc),
2621
+ "r"(__scale_A_tmem),
2622
+ "r"(__scale_B_tmem),
2623
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2624
+ : "memory");
2625
+ }
2626
+ # else
2627
+ // Unsupported architectures will have a linker error with a semi-decent error message
2628
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__();
2629
+ # endif
2630
+ }
2631
+ #endif // __cccl_ptx_isa >= 860
2632
+
2633
+ /*
2634
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc,
2635
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
2636
+ // .kind = { .kind::mxf4nvf4 }
2637
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2638
+ template <cuda::ptx::dot_cta_group Cta_Group>
2639
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_use(
2640
+ cuda::ptx::kind_mxf4nvf4_t,
2641
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2642
+ uint32_t d_tmem,
2643
+ uint64_t a_desc,
2644
+ uint64_t b_desc,
2645
+ uint32_t idesc,
2646
+ uint32_t scale_A_tmem,
2647
+ uint32_t scale_B_tmem,
2648
+ bool enable_input_d);
2649
+ */
2650
+ #if __cccl_ptx_isa >= 860
2651
+ extern "C" _CCCL_DEVICE void
2652
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__();
2653
+ template <dot_cta_group _Cta_Group>
2654
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_use(
2655
+ kind_mxf4nvf4_t,
2656
+ cta_group_t<_Cta_Group> __cta_group,
2657
+ _CUDA_VSTD::uint32_t __d_tmem,
2658
+ _CUDA_VSTD::uint64_t __a_desc,
2659
+ _CUDA_VSTD::uint64_t __b_desc,
2660
+ _CUDA_VSTD::uint32_t __idesc,
2661
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
2662
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
2663
+ bool __enable_input_d)
2664
+ {
2665
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
2666
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2667
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2668
+ if constexpr (__cta_group == cta_group_1)
2669
+ {
2670
+ asm volatile(
2671
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2672
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2673
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], "
2674
+ "[%5], PRED_enable_input_d;\n\t"
2675
+ "}"
2676
+ :
2677
+ : "r"(__d_tmem),
2678
+ "l"(__a_desc),
2679
+ "l"(__b_desc),
2680
+ "r"(__idesc),
2681
+ "r"(__scale_A_tmem),
2682
+ "r"(__scale_B_tmem),
2683
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2684
+ : "memory");
2685
+ }
2686
+ else if constexpr (__cta_group == cta_group_2)
2687
+ {
2688
+ asm volatile(
2689
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2690
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2691
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], "
2692
+ "[%5], PRED_enable_input_d;\n\t"
2693
+ "}"
2694
+ :
2695
+ : "r"(__d_tmem),
2696
+ "l"(__a_desc),
2697
+ "l"(__b_desc),
2698
+ "r"(__idesc),
2699
+ "r"(__scale_A_tmem),
2700
+ "r"(__scale_B_tmem),
2701
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2702
+ : "memory");
2703
+ }
2704
+ # else
2705
+ // Unsupported architectures will have a linker error with a semi-decent error message
2706
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__();
2707
+ # endif
2708
+ }
2709
+ #endif // __cccl_ptx_isa >= 860
2710
+
2711
+ /*
2712
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc,
2713
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
2714
+ // .kind = { .kind::mxf8f6f4 }
2715
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2716
+ template <cuda::ptx::dot_cta_group Cta_Group>
2717
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use(
2718
+ cuda::ptx::kind_mxf8f6f4_t,
2719
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2720
+ uint32_t d_tmem,
2721
+ uint64_t a_desc,
2722
+ uint64_t b_desc,
2723
+ uint32_t idesc,
2724
+ uint32_t scale_A_tmem,
2725
+ uint32_t scale_B_tmem,
2726
+ bool enable_input_d);
2727
+ */
2728
+ #if __cccl_ptx_isa >= 860
2729
+ extern "C" _CCCL_DEVICE void
2730
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__();
2731
+ template <dot_cta_group _Cta_Group>
2732
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use(
2733
+ kind_mxf8f6f4_t,
2734
+ cta_group_t<_Cta_Group> __cta_group,
2735
+ _CUDA_VSTD::uint32_t __d_tmem,
2736
+ _CUDA_VSTD::uint64_t __a_desc,
2737
+ _CUDA_VSTD::uint64_t __b_desc,
2738
+ _CUDA_VSTD::uint32_t __idesc,
2739
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
2740
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
2741
+ bool __enable_input_d)
2742
+ {
2743
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
2744
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2745
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2746
+ if constexpr (__cta_group == cta_group_1)
2747
+ {
2748
+ asm volatile(
2749
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2750
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2751
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], "
2752
+ "[%5], PRED_enable_input_d;\n\t"
2753
+ "}"
2754
+ :
2755
+ : "r"(__d_tmem),
2756
+ "l"(__a_desc),
2757
+ "l"(__b_desc),
2758
+ "r"(__idesc),
2759
+ "r"(__scale_A_tmem),
2760
+ "r"(__scale_B_tmem),
2761
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2762
+ : "memory");
2763
+ }
2764
+ else if constexpr (__cta_group == cta_group_2)
2765
+ {
2766
+ asm volatile(
2767
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2768
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2769
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], "
2770
+ "[%5], PRED_enable_input_d;\n\t"
2771
+ "}"
2772
+ :
2773
+ : "r"(__d_tmem),
2774
+ "l"(__a_desc),
2775
+ "l"(__b_desc),
2776
+ "r"(__idesc),
2777
+ "r"(__scale_A_tmem),
2778
+ "r"(__scale_B_tmem),
2779
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2780
+ : "memory");
2781
+ }
2782
+ # else
2783
+ // Unsupported architectures will have a linker error with a semi-decent error message
2784
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__();
2785
+ # endif
2786
+ }
2787
+ #endif // __cccl_ptx_isa >= 860
2788
+
2789
+ /*
2790
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc,
2791
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
2792
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
2793
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2794
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
2795
+ __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use(
2796
+ cuda::ptx::kind_t<Kind> kind,
2797
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2798
+ uint32_t d_tmem,
2799
+ uint64_t a_desc,
2800
+ uint64_t b_desc,
2801
+ uint32_t idesc,
2802
+ uint32_t scale_A_tmem,
2803
+ uint32_t scale_B_tmem,
2804
+ bool enable_input_d);
2805
+ */
2806
+ #if __cccl_ptx_isa >= 860
2807
+ extern "C" _CCCL_DEVICE void
2808
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__();
2809
+ template <dot_kind _Kind, dot_cta_group _Cta_Group>
2810
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use(
2811
+ kind_t<_Kind> __kind,
2812
+ cta_group_t<_Cta_Group> __cta_group,
2813
+ _CUDA_VSTD::uint32_t __d_tmem,
2814
+ _CUDA_VSTD::uint64_t __a_desc,
2815
+ _CUDA_VSTD::uint64_t __b_desc,
2816
+ _CUDA_VSTD::uint32_t __idesc,
2817
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
2818
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
2819
+ bool __enable_input_d)
2820
+ {
2821
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
2822
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2823
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2824
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
2825
+ {
2826
+ asm volatile(
2827
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2828
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2829
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], "
2830
+ "PRED_enable_input_d;\n\t"
2831
+ "}"
2832
+ :
2833
+ : "r"(__d_tmem),
2834
+ "l"(__a_desc),
2835
+ "l"(__b_desc),
2836
+ "r"(__idesc),
2837
+ "r"(__scale_A_tmem),
2838
+ "r"(__scale_B_tmem),
2839
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2840
+ : "memory");
2841
+ }
2842
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
2843
+ {
2844
+ asm volatile(
2845
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2846
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2847
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], "
2848
+ "PRED_enable_input_d;\n\t"
2849
+ "}"
2850
+ :
2851
+ : "r"(__d_tmem),
2852
+ "l"(__a_desc),
2853
+ "l"(__b_desc),
2854
+ "r"(__idesc),
2855
+ "r"(__scale_A_tmem),
2856
+ "r"(__scale_B_tmem),
2857
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2858
+ : "memory");
2859
+ }
2860
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
2861
+ {
2862
+ asm volatile(
2863
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2864
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2865
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], "
2866
+ "[%5], PRED_enable_input_d;\n\t"
2867
+ "}"
2868
+ :
2869
+ : "r"(__d_tmem),
2870
+ "l"(__a_desc),
2871
+ "l"(__b_desc),
2872
+ "r"(__idesc),
2873
+ "r"(__scale_A_tmem),
2874
+ "r"(__scale_B_tmem),
2875
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2876
+ : "memory");
2877
+ }
2878
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
2879
+ {
2880
+ asm volatile(
2881
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2882
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2883
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], "
2884
+ "[%5], PRED_enable_input_d;\n\t"
2885
+ "}"
2886
+ :
2887
+ : "r"(__d_tmem),
2888
+ "l"(__a_desc),
2889
+ "l"(__b_desc),
2890
+ "r"(__idesc),
2891
+ "r"(__scale_A_tmem),
2892
+ "r"(__scale_B_tmem),
2893
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2894
+ : "memory");
2895
+ }
2896
+ # else
2897
+ // Unsupported architectures will have a linker error with a semi-decent error message
2898
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__();
2899
+ # endif
2900
+ }
2901
+ #endif // __cccl_ptx_isa >= 860
2902
+
2903
+ /*
2904
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc,
2905
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
2906
+ // .kind = { .kind::mxf4nvf4 }
2907
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2908
+ template <cuda::ptx::dot_cta_group Cta_Group>
2909
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use(
2910
+ cuda::ptx::kind_mxf4nvf4_t,
2911
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2912
+ uint32_t d_tmem,
2913
+ uint64_t a_desc,
2914
+ uint64_t b_desc,
2915
+ uint32_t idesc,
2916
+ uint32_t scale_A_tmem,
2917
+ uint32_t scale_B_tmem,
2918
+ bool enable_input_d);
2919
+ */
2920
+ #if __cccl_ptx_isa >= 860
2921
+ extern "C" _CCCL_DEVICE void
2922
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__();
2923
+ template <dot_cta_group _Cta_Group>
2924
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use(
2925
+ kind_mxf4nvf4_t,
2926
+ cta_group_t<_Cta_Group> __cta_group,
2927
+ _CUDA_VSTD::uint32_t __d_tmem,
2928
+ _CUDA_VSTD::uint64_t __a_desc,
2929
+ _CUDA_VSTD::uint64_t __b_desc,
2930
+ _CUDA_VSTD::uint32_t __idesc,
2931
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
2932
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
2933
+ bool __enable_input_d)
2934
+ {
2935
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
2936
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2937
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2938
+ if constexpr (__cta_group == cta_group_1)
2939
+ {
2940
+ asm volatile(
2941
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2942
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2943
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], "
2944
+ "[%5], PRED_enable_input_d;\n\t"
2945
+ "}"
2946
+ :
2947
+ : "r"(__d_tmem),
2948
+ "l"(__a_desc),
2949
+ "l"(__b_desc),
2950
+ "r"(__idesc),
2951
+ "r"(__scale_A_tmem),
2952
+ "r"(__scale_B_tmem),
2953
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2954
+ : "memory");
2955
+ }
2956
+ else if constexpr (__cta_group == cta_group_2)
2957
+ {
2958
+ asm volatile(
2959
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2960
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2961
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], "
2962
+ "[%5], PRED_enable_input_d;\n\t"
2963
+ "}"
2964
+ :
2965
+ : "r"(__d_tmem),
2966
+ "l"(__a_desc),
2967
+ "l"(__b_desc),
2968
+ "r"(__idesc),
2969
+ "r"(__scale_A_tmem),
2970
+ "r"(__scale_B_tmem),
2971
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
2972
+ : "memory");
2973
+ }
2974
+ # else
2975
+ // Unsupported architectures will have a linker error with a semi-decent error message
2976
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__();
2977
+ # endif
2978
+ }
2979
+ #endif // __cccl_ptx_isa >= 860
2980
+
2981
+ /*
2982
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc,
2983
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
2984
+ // .kind = { .kind::mxf8f6f4 }
2985
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2986
+ template <cuda::ptx::dot_cta_group Cta_Group>
2987
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_lastuse(
2988
+ cuda::ptx::kind_mxf8f6f4_t,
2989
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2990
+ uint32_t d_tmem,
2991
+ uint64_t a_desc,
2992
+ uint64_t b_desc,
2993
+ uint32_t idesc,
2994
+ uint32_t scale_A_tmem,
2995
+ uint32_t scale_B_tmem,
2996
+ bool enable_input_d);
2997
+ */
2998
+ #if __cccl_ptx_isa >= 860
2999
+ extern "C" _CCCL_DEVICE void
3000
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__();
3001
+ template <dot_cta_group _Cta_Group>
3002
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_lastuse(
3003
+ kind_mxf8f6f4_t,
3004
+ cta_group_t<_Cta_Group> __cta_group,
3005
+ _CUDA_VSTD::uint32_t __d_tmem,
3006
+ _CUDA_VSTD::uint64_t __a_desc,
3007
+ _CUDA_VSTD::uint64_t __b_desc,
3008
+ _CUDA_VSTD::uint32_t __idesc,
3009
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
3010
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
3011
+ bool __enable_input_d)
3012
+ {
3013
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
3014
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3015
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3016
+ if constexpr (__cta_group == cta_group_1)
3017
+ {
3018
+ asm volatile(
3019
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3020
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3021
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3022
+ "[%5], PRED_enable_input_d;\n\t"
3023
+ "}"
3024
+ :
3025
+ : "r"(__d_tmem),
3026
+ "l"(__a_desc),
3027
+ "l"(__b_desc),
3028
+ "r"(__idesc),
3029
+ "r"(__scale_A_tmem),
3030
+ "r"(__scale_B_tmem),
3031
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3032
+ : "memory");
3033
+ }
3034
+ else if constexpr (__cta_group == cta_group_2)
3035
+ {
3036
+ asm volatile(
3037
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3038
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3039
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3040
+ "[%5], PRED_enable_input_d;\n\t"
3041
+ "}"
3042
+ :
3043
+ : "r"(__d_tmem),
3044
+ "l"(__a_desc),
3045
+ "l"(__b_desc),
3046
+ "r"(__idesc),
3047
+ "r"(__scale_A_tmem),
3048
+ "r"(__scale_B_tmem),
3049
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3050
+ : "memory");
3051
+ }
3052
+ # else
3053
+ // Unsupported architectures will have a linker error with a semi-decent error message
3054
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__();
3055
+ # endif
3056
+ }
3057
+ #endif // __cccl_ptx_isa >= 860
3058
+
3059
+ /*
3060
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc,
3061
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
3062
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
3063
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3064
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
3065
+ __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse(
3066
+ cuda::ptx::kind_t<Kind> kind,
3067
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3068
+ uint32_t d_tmem,
3069
+ uint64_t a_desc,
3070
+ uint64_t b_desc,
3071
+ uint32_t idesc,
3072
+ uint32_t scale_A_tmem,
3073
+ uint32_t scale_B_tmem,
3074
+ bool enable_input_d);
3075
+ */
3076
+ #if __cccl_ptx_isa >= 860
3077
+ extern "C" _CCCL_DEVICE void
3078
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__();
3079
+ template <dot_kind _Kind, dot_cta_group _Cta_Group>
3080
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse(
3081
+ kind_t<_Kind> __kind,
3082
+ cta_group_t<_Cta_Group> __cta_group,
3083
+ _CUDA_VSTD::uint32_t __d_tmem,
3084
+ _CUDA_VSTD::uint64_t __a_desc,
3085
+ _CUDA_VSTD::uint64_t __b_desc,
3086
+ _CUDA_VSTD::uint32_t __idesc,
3087
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
3088
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
3089
+ bool __enable_input_d)
3090
+ {
3091
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
3092
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3093
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3094
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
3095
+ {
3096
+ asm volatile(
3097
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3098
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3099
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3100
+ "[%5], PRED_enable_input_d;\n\t"
3101
+ "}"
3102
+ :
3103
+ : "r"(__d_tmem),
3104
+ "l"(__a_desc),
3105
+ "l"(__b_desc),
3106
+ "r"(__idesc),
3107
+ "r"(__scale_A_tmem),
3108
+ "r"(__scale_B_tmem),
3109
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3110
+ : "memory");
3111
+ }
3112
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
3113
+ {
3114
+ asm volatile(
3115
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3116
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3117
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3118
+ "[%5], PRED_enable_input_d;\n\t"
3119
+ "}"
3120
+ :
3121
+ : "r"(__d_tmem),
3122
+ "l"(__a_desc),
3123
+ "l"(__b_desc),
3124
+ "r"(__idesc),
3125
+ "r"(__scale_A_tmem),
3126
+ "r"(__scale_B_tmem),
3127
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3128
+ : "memory");
3129
+ }
3130
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
3131
+ {
3132
+ asm volatile(
3133
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3134
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3135
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3136
+ "[%5], PRED_enable_input_d;\n\t"
3137
+ "}"
3138
+ :
3139
+ : "r"(__d_tmem),
3140
+ "l"(__a_desc),
3141
+ "l"(__b_desc),
3142
+ "r"(__idesc),
3143
+ "r"(__scale_A_tmem),
3144
+ "r"(__scale_B_tmem),
3145
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3146
+ : "memory");
3147
+ }
3148
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
3149
+ {
3150
+ asm volatile(
3151
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3152
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3153
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3154
+ "[%5], PRED_enable_input_d;\n\t"
3155
+ "}"
3156
+ :
3157
+ : "r"(__d_tmem),
3158
+ "l"(__a_desc),
3159
+ "l"(__b_desc),
3160
+ "r"(__idesc),
3161
+ "r"(__scale_A_tmem),
3162
+ "r"(__scale_B_tmem),
3163
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3164
+ : "memory");
3165
+ }
3166
+ # else
3167
+ // Unsupported architectures will have a linker error with a semi-decent error message
3168
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__();
3169
+ # endif
3170
+ }
3171
+ #endif // __cccl_ptx_isa >= 860
3172
+
3173
+ /*
3174
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc,
3175
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
3176
+ // .kind = { .kind::mxf4nvf4 }
3177
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3178
+ template <cuda::ptx::dot_cta_group Cta_Group>
3179
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_lastuse(
3180
+ cuda::ptx::kind_mxf4nvf4_t,
3181
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3182
+ uint32_t d_tmem,
3183
+ uint64_t a_desc,
3184
+ uint64_t b_desc,
3185
+ uint32_t idesc,
3186
+ uint32_t scale_A_tmem,
3187
+ uint32_t scale_B_tmem,
3188
+ bool enable_input_d);
3189
+ */
3190
+ #if __cccl_ptx_isa >= 860
3191
+ extern "C" _CCCL_DEVICE void
3192
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__();
3193
+ template <dot_cta_group _Cta_Group>
3194
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_lastuse(
3195
+ kind_mxf4nvf4_t,
3196
+ cta_group_t<_Cta_Group> __cta_group,
3197
+ _CUDA_VSTD::uint32_t __d_tmem,
3198
+ _CUDA_VSTD::uint64_t __a_desc,
3199
+ _CUDA_VSTD::uint64_t __b_desc,
3200
+ _CUDA_VSTD::uint32_t __idesc,
3201
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
3202
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
3203
+ bool __enable_input_d)
3204
+ {
3205
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
3206
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3207
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3208
+ if constexpr (__cta_group == cta_group_1)
3209
+ {
3210
+ asm volatile(
3211
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3212
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3213
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3214
+ "[%5], PRED_enable_input_d;\n\t"
3215
+ "}"
3216
+ :
3217
+ : "r"(__d_tmem),
3218
+ "l"(__a_desc),
3219
+ "l"(__b_desc),
3220
+ "r"(__idesc),
3221
+ "r"(__scale_A_tmem),
3222
+ "r"(__scale_B_tmem),
3223
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3224
+ : "memory");
3225
+ }
3226
+ else if constexpr (__cta_group == cta_group_2)
3227
+ {
3228
+ asm volatile(
3229
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3230
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3231
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3232
+ "[%5], PRED_enable_input_d;\n\t"
3233
+ "}"
3234
+ :
3235
+ : "r"(__d_tmem),
3236
+ "l"(__a_desc),
3237
+ "l"(__b_desc),
3238
+ "r"(__idesc),
3239
+ "r"(__scale_A_tmem),
3240
+ "r"(__scale_B_tmem),
3241
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3242
+ : "memory");
3243
+ }
3244
+ # else
3245
+ // Unsupported architectures will have a linker error with a semi-decent error message
3246
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__();
3247
+ # endif
3248
+ }
3249
+ #endif // __cccl_ptx_isa >= 860
3250
+
3251
+ /*
3252
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc,
3253
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
3254
+ // .kind = { .kind::mxf8f6f4 }
3255
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3256
+ template <cuda::ptx::dot_cta_group Cta_Group>
3257
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse(
3258
+ cuda::ptx::kind_mxf8f6f4_t,
3259
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3260
+ uint32_t d_tmem,
3261
+ uint64_t a_desc,
3262
+ uint64_t b_desc,
3263
+ uint32_t idesc,
3264
+ uint32_t scale_A_tmem,
3265
+ uint32_t scale_B_tmem,
3266
+ bool enable_input_d);
3267
+ */
3268
+ #if __cccl_ptx_isa >= 860
3269
+ extern "C" _CCCL_DEVICE void
3270
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__();
3271
+ template <dot_cta_group _Cta_Group>
3272
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse(
3273
+ kind_mxf8f6f4_t,
3274
+ cta_group_t<_Cta_Group> __cta_group,
3275
+ _CUDA_VSTD::uint32_t __d_tmem,
3276
+ _CUDA_VSTD::uint64_t __a_desc,
3277
+ _CUDA_VSTD::uint64_t __b_desc,
3278
+ _CUDA_VSTD::uint32_t __idesc,
3279
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
3280
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
3281
+ bool __enable_input_d)
3282
+ {
3283
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
3284
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3285
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3286
+ if constexpr (__cta_group == cta_group_1)
3287
+ {
3288
+ asm volatile(
3289
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3290
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3291
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3292
+ "[%5], PRED_enable_input_d;\n\t"
3293
+ "}"
3294
+ :
3295
+ : "r"(__d_tmem),
3296
+ "l"(__a_desc),
3297
+ "l"(__b_desc),
3298
+ "r"(__idesc),
3299
+ "r"(__scale_A_tmem),
3300
+ "r"(__scale_B_tmem),
3301
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3302
+ : "memory");
3303
+ }
3304
+ else if constexpr (__cta_group == cta_group_2)
3305
+ {
3306
+ asm volatile(
3307
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3308
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3309
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3310
+ "[%5], PRED_enable_input_d;\n\t"
3311
+ "}"
3312
+ :
3313
+ : "r"(__d_tmem),
3314
+ "l"(__a_desc),
3315
+ "l"(__b_desc),
3316
+ "r"(__idesc),
3317
+ "r"(__scale_A_tmem),
3318
+ "r"(__scale_B_tmem),
3319
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3320
+ : "memory");
3321
+ }
3322
+ # else
3323
+ // Unsupported architectures will have a linker error with a semi-decent error message
3324
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__();
3325
+ # endif
3326
+ }
3327
+ #endif // __cccl_ptx_isa >= 860
3328
+
3329
+ /*
3330
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc,
3331
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
3332
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
3333
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3334
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
3335
+ __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse(
3336
+ cuda::ptx::kind_t<Kind> kind,
3337
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3338
+ uint32_t d_tmem,
3339
+ uint64_t a_desc,
3340
+ uint64_t b_desc,
3341
+ uint32_t idesc,
3342
+ uint32_t scale_A_tmem,
3343
+ uint32_t scale_B_tmem,
3344
+ bool enable_input_d);
3345
+ */
3346
+ #if __cccl_ptx_isa >= 860
3347
+ extern "C" _CCCL_DEVICE void
3348
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__();
3349
+ template <dot_kind _Kind, dot_cta_group _Cta_Group>
3350
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse(
3351
+ kind_t<_Kind> __kind,
3352
+ cta_group_t<_Cta_Group> __cta_group,
3353
+ _CUDA_VSTD::uint32_t __d_tmem,
3354
+ _CUDA_VSTD::uint64_t __a_desc,
3355
+ _CUDA_VSTD::uint64_t __b_desc,
3356
+ _CUDA_VSTD::uint32_t __idesc,
3357
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
3358
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
3359
+ bool __enable_input_d)
3360
+ {
3361
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
3362
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3363
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3364
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
3365
+ {
3366
+ asm volatile(
3367
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3368
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3369
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3370
+ "[%5], PRED_enable_input_d;\n\t"
3371
+ "}"
3372
+ :
3373
+ : "r"(__d_tmem),
3374
+ "l"(__a_desc),
3375
+ "l"(__b_desc),
3376
+ "r"(__idesc),
3377
+ "r"(__scale_A_tmem),
3378
+ "r"(__scale_B_tmem),
3379
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3380
+ : "memory");
3381
+ }
3382
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
3383
+ {
3384
+ asm volatile(
3385
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3386
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3387
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3388
+ "[%5], PRED_enable_input_d;\n\t"
3389
+ "}"
3390
+ :
3391
+ : "r"(__d_tmem),
3392
+ "l"(__a_desc),
3393
+ "l"(__b_desc),
3394
+ "r"(__idesc),
3395
+ "r"(__scale_A_tmem),
3396
+ "r"(__scale_B_tmem),
3397
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3398
+ : "memory");
3399
+ }
3400
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
3401
+ {
3402
+ asm volatile(
3403
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3404
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3405
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3406
+ "[%5], PRED_enable_input_d;\n\t"
3407
+ "}"
3408
+ :
3409
+ : "r"(__d_tmem),
3410
+ "l"(__a_desc),
3411
+ "l"(__b_desc),
3412
+ "r"(__idesc),
3413
+ "r"(__scale_A_tmem),
3414
+ "r"(__scale_B_tmem),
3415
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3416
+ : "memory");
3417
+ }
3418
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
3419
+ {
3420
+ asm volatile(
3421
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3422
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3423
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3424
+ "[%5], PRED_enable_input_d;\n\t"
3425
+ "}"
3426
+ :
3427
+ : "r"(__d_tmem),
3428
+ "l"(__a_desc),
3429
+ "l"(__b_desc),
3430
+ "r"(__idesc),
3431
+ "r"(__scale_A_tmem),
3432
+ "r"(__scale_B_tmem),
3433
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3434
+ : "memory");
3435
+ }
3436
+ # else
3437
+ // Unsupported architectures will have a linker error with a semi-decent error message
3438
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__();
3439
+ # endif
3440
+ }
3441
+ #endif // __cccl_ptx_isa >= 860
3442
+
3443
+ /*
3444
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc,
3445
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
3446
+ // .kind = { .kind::mxf4nvf4 }
3447
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3448
+ template <cuda::ptx::dot_cta_group Cta_Group>
3449
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse(
3450
+ cuda::ptx::kind_mxf4nvf4_t,
3451
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3452
+ uint32_t d_tmem,
3453
+ uint64_t a_desc,
3454
+ uint64_t b_desc,
3455
+ uint32_t idesc,
3456
+ uint32_t scale_A_tmem,
3457
+ uint32_t scale_B_tmem,
3458
+ bool enable_input_d);
3459
+ */
3460
+ #if __cccl_ptx_isa >= 860
3461
+ extern "C" _CCCL_DEVICE void
3462
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__();
3463
+ template <dot_cta_group _Cta_Group>
3464
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse(
3465
+ kind_mxf4nvf4_t,
3466
+ cta_group_t<_Cta_Group> __cta_group,
3467
+ _CUDA_VSTD::uint32_t __d_tmem,
3468
+ _CUDA_VSTD::uint64_t __a_desc,
3469
+ _CUDA_VSTD::uint64_t __b_desc,
3470
+ _CUDA_VSTD::uint32_t __idesc,
3471
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
3472
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
3473
+ bool __enable_input_d)
3474
+ {
3475
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
3476
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3477
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3478
+ if constexpr (__cta_group == cta_group_1)
3479
+ {
3480
+ asm volatile(
3481
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3482
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3483
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3484
+ "[%5], PRED_enable_input_d;\n\t"
3485
+ "}"
3486
+ :
3487
+ : "r"(__d_tmem),
3488
+ "l"(__a_desc),
3489
+ "l"(__b_desc),
3490
+ "r"(__idesc),
3491
+ "r"(__scale_A_tmem),
3492
+ "r"(__scale_B_tmem),
3493
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3494
+ : "memory");
3495
+ }
3496
+ else if constexpr (__cta_group == cta_group_2)
3497
+ {
3498
+ asm volatile(
3499
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3500
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3501
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3502
+ "[%5], PRED_enable_input_d;\n\t"
3503
+ "}"
3504
+ :
3505
+ : "r"(__d_tmem),
3506
+ "l"(__a_desc),
3507
+ "l"(__b_desc),
3508
+ "r"(__idesc),
3509
+ "r"(__scale_A_tmem),
3510
+ "r"(__scale_B_tmem),
3511
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3512
+ : "memory");
3513
+ }
3514
+ # else
3515
+ // Unsupported architectures will have a linker error with a semi-decent error message
3516
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__();
3517
+ # endif
3518
+ }
3519
+ #endif // __cccl_ptx_isa >= 860
3520
+
3521
+ /*
3522
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc,
3523
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
3524
+ // .kind = { .kind::mxf8f6f4 }
3525
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3526
+ template <cuda::ptx::dot_cta_group Cta_Group>
3527
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_discard(
3528
+ cuda::ptx::kind_mxf8f6f4_t,
3529
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3530
+ uint32_t d_tmem,
3531
+ uint64_t a_desc,
3532
+ uint64_t b_desc,
3533
+ uint32_t idesc,
3534
+ uint32_t scale_A_tmem,
3535
+ uint32_t scale_B_tmem,
3536
+ bool enable_input_d);
3537
+ */
3538
+ #if __cccl_ptx_isa >= 860
3539
+ extern "C" _CCCL_DEVICE void
3540
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__();
3541
+ template <dot_cta_group _Cta_Group>
3542
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_discard(
3543
+ kind_mxf8f6f4_t,
3544
+ cta_group_t<_Cta_Group> __cta_group,
3545
+ _CUDA_VSTD::uint32_t __d_tmem,
3546
+ _CUDA_VSTD::uint64_t __a_desc,
3547
+ _CUDA_VSTD::uint64_t __b_desc,
3548
+ _CUDA_VSTD::uint32_t __idesc,
3549
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
3550
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
3551
+ bool __enable_input_d)
3552
+ {
3553
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
3554
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3555
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3556
+ if constexpr (__cta_group == cta_group_1)
3557
+ {
3558
+ asm volatile(
3559
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3560
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3561
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], "
3562
+ "[%5], PRED_enable_input_d;\n\t"
3563
+ "}"
3564
+ :
3565
+ : "r"(__d_tmem),
3566
+ "l"(__a_desc),
3567
+ "l"(__b_desc),
3568
+ "r"(__idesc),
3569
+ "r"(__scale_A_tmem),
3570
+ "r"(__scale_B_tmem),
3571
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3572
+ : "memory");
3573
+ }
3574
+ else if constexpr (__cta_group == cta_group_2)
3575
+ {
3576
+ asm volatile(
3577
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3578
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3579
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], "
3580
+ "[%5], PRED_enable_input_d;\n\t"
3581
+ "}"
3582
+ :
3583
+ : "r"(__d_tmem),
3584
+ "l"(__a_desc),
3585
+ "l"(__b_desc),
3586
+ "r"(__idesc),
3587
+ "r"(__scale_A_tmem),
3588
+ "r"(__scale_B_tmem),
3589
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3590
+ : "memory");
3591
+ }
3592
+ # else
3593
+ // Unsupported architectures will have a linker error with a semi-decent error message
3594
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__();
3595
+ # endif
3596
+ }
3597
+ #endif // __cccl_ptx_isa >= 860
3598
+
3599
+ /*
3600
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc,
3601
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
3602
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
3603
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3604
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
3605
+ __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard(
3606
+ cuda::ptx::kind_t<Kind> kind,
3607
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3608
+ uint32_t d_tmem,
3609
+ uint64_t a_desc,
3610
+ uint64_t b_desc,
3611
+ uint32_t idesc,
3612
+ uint32_t scale_A_tmem,
3613
+ uint32_t scale_B_tmem,
3614
+ bool enable_input_d);
3615
+ */
3616
+ #if __cccl_ptx_isa >= 860
3617
+ extern "C" _CCCL_DEVICE void
3618
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__();
3619
+ template <dot_kind _Kind, dot_cta_group _Cta_Group>
3620
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard(
3621
+ kind_t<_Kind> __kind,
3622
+ cta_group_t<_Cta_Group> __cta_group,
3623
+ _CUDA_VSTD::uint32_t __d_tmem,
3624
+ _CUDA_VSTD::uint64_t __a_desc,
3625
+ _CUDA_VSTD::uint64_t __b_desc,
3626
+ _CUDA_VSTD::uint32_t __idesc,
3627
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
3628
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
3629
+ bool __enable_input_d)
3630
+ {
3631
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
3632
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3633
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3634
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
3635
+ {
3636
+ asm volatile(
3637
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3638
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3639
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], "
3640
+ "[%5], PRED_enable_input_d;\n\t"
3641
+ "}"
3642
+ :
3643
+ : "r"(__d_tmem),
3644
+ "l"(__a_desc),
3645
+ "l"(__b_desc),
3646
+ "r"(__idesc),
3647
+ "r"(__scale_A_tmem),
3648
+ "r"(__scale_B_tmem),
3649
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3650
+ : "memory");
3651
+ }
3652
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
3653
+ {
3654
+ asm volatile(
3655
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3656
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3657
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], "
3658
+ "[%5], PRED_enable_input_d;\n\t"
3659
+ "}"
3660
+ :
3661
+ : "r"(__d_tmem),
3662
+ "l"(__a_desc),
3663
+ "l"(__b_desc),
3664
+ "r"(__idesc),
3665
+ "r"(__scale_A_tmem),
3666
+ "r"(__scale_B_tmem),
3667
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3668
+ : "memory");
3669
+ }
3670
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
3671
+ {
3672
+ asm volatile(
3673
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3674
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3675
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], "
3676
+ "[%5], PRED_enable_input_d;\n\t"
3677
+ "}"
3678
+ :
3679
+ : "r"(__d_tmem),
3680
+ "l"(__a_desc),
3681
+ "l"(__b_desc),
3682
+ "r"(__idesc),
3683
+ "r"(__scale_A_tmem),
3684
+ "r"(__scale_B_tmem),
3685
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3686
+ : "memory");
3687
+ }
3688
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
3689
+ {
3690
+ asm volatile(
3691
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3692
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3693
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], "
3694
+ "[%5], PRED_enable_input_d;\n\t"
3695
+ "}"
3696
+ :
3697
+ : "r"(__d_tmem),
3698
+ "l"(__a_desc),
3699
+ "l"(__b_desc),
3700
+ "r"(__idesc),
3701
+ "r"(__scale_A_tmem),
3702
+ "r"(__scale_B_tmem),
3703
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
3704
+ : "memory");
3705
+ }
3706
+ # else
3707
+ // Unsupported architectures will have a linker error with a semi-decent error message
3708
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__();
3709
+ # endif
3710
+ }
3711
+ #endif // __cccl_ptx_isa >= 860
+
+ /*
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc,
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
+ // .kind = { .kind::mxf4nvf4 }
+ // .cta_group = { .cta_group::1, .cta_group::2 }
+ template <cuda::ptx::dot_cta_group Cta_Group>
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_discard(
+ cuda::ptx::kind_mxf4nvf4_t,
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
+ uint32_t d_tmem,
+ uint64_t a_desc,
+ uint64_t b_desc,
+ uint32_t idesc,
+ uint32_t scale_A_tmem,
+ uint32_t scale_B_tmem,
+ bool enable_input_d);
+ */
+ #if __cccl_ptx_isa >= 860
+ extern "C" _CCCL_DEVICE void
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__();
+ template <dot_cta_group _Cta_Group>
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_discard(
+ kind_mxf4nvf4_t,
+ cta_group_t<_Cta_Group> __cta_group,
+ _CUDA_VSTD::uint32_t __d_tmem,
+ _CUDA_VSTD::uint64_t __a_desc,
+ _CUDA_VSTD::uint64_t __b_desc,
+ _CUDA_VSTD::uint32_t __idesc,
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
+ bool __enable_input_d)
+ {
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+ if constexpr (__cta_group == cta_group_1)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], "
+ "[%5], PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__scale_A_tmem),
+ "r"(__scale_B_tmem),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__cta_group == cta_group_2)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], "
+ "[%5], PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__scale_A_tmem),
+ "r"(__scale_B_tmem),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ # else
+ // Unsupported architectures will have a linker error with a semi-decent error message
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__();
+ # endif
+ }
+ #endif // __cccl_ptx_isa >= 860
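
For orientation, here is a minimal sketch of how the wrapper documented above might be called from device code. It assumes a build targeting SM_100a or SM_101a with PTX ISA 8.6 support, and that the library exposes the usual tag values cuda::ptx::kind_mxf4nvf4 and cuda::ptx::cta_group_1 matching the tag types in the signature; the my_-prefixed helper and all descriptor/tensor-memory arguments are hypothetical placeholders that a real kernel would obtain from tcgen05 tensor-memory allocation and shared-memory matrix descriptors (not shown here).

#include <cstdint>
#include <cuda/ptx>

// Hypothetical helper: issues one .kind::mxf4nvf4 block-scaled MMA with
// scale_vec::4X. All inputs are assumed to be prepared by the caller.
__device__ void my_issue_vec4x_mma(
  std::uint32_t d_tmem,        // tensor-memory address of the accumulator D
  std::uint64_t a_desc,        // matrix descriptor for A
  std::uint64_t b_desc,        // matrix descriptor for B
  std::uint32_t idesc,         // instruction descriptor
  std::uint32_t scale_A_tmem,  // tensor-memory address of A's scale factors
  std::uint32_t scale_B_tmem,  // tensor-memory address of B's scale factors
  bool accumulate)             // enable_input_d: read the existing D as input when true
{
#if __cccl_ptx_isa >= 860
  cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_discard(
    cuda::ptx::kind_mxf4nvf4,  // the only .kind accepted by this overload
    cuda::ptx::cta_group_1,    // or cuda::ptx::cta_group_2
    d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem, accumulate);
#endif
}

If the translation unit is compiled for an architecture outside the SM_100a/SM_101a feature set, the call resolves to the extern "C" stub above and fails at link time, which is the library's intended diagnostic.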
+
+ /*
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc,
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
+ // .kind = { .kind::mxf8f6f4 }
+ // .cta_group = { .cta_group::1, .cta_group::2 }
+ template <cuda::ptx::dot_cta_group Cta_Group>
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard(
+ cuda::ptx::kind_mxf8f6f4_t,
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
+ uint32_t d_tmem,
+ uint64_t a_desc,
+ uint64_t b_desc,
+ uint32_t idesc,
+ uint32_t scale_A_tmem,
+ uint32_t scale_B_tmem,
+ bool enable_input_d);
+ */
+ #if __cccl_ptx_isa >= 860
+ extern "C" _CCCL_DEVICE void
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__();
+ template <dot_cta_group _Cta_Group>
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard(
+ kind_mxf8f6f4_t,
+ cta_group_t<_Cta_Group> __cta_group,
+ _CUDA_VSTD::uint32_t __d_tmem,
+ _CUDA_VSTD::uint64_t __a_desc,
+ _CUDA_VSTD::uint64_t __b_desc,
+ _CUDA_VSTD::uint32_t __idesc,
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
+ bool __enable_input_d)
+ {
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+ if constexpr (__cta_group == cta_group_1)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], "
+ "[%5], PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__scale_A_tmem),
+ "r"(__scale_B_tmem),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__cta_group == cta_group_2)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], "
+ "[%5], PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__scale_A_tmem),
+ "r"(__scale_B_tmem),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ # else
+ // Unsupported architectures will have a linker error with a semi-decent error message
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__();
+ # endif
+ }
+ #endif // __cccl_ptx_isa >= 860
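
The .kind::mxf8f6f4 / scale_vec::1X overload above is invoked the same way. A brief hedged sketch follows, assuming the tag value cuda::ptx::kind_mxf8f6f4 corresponding to kind_mxf8f6f4_t; my_-prefixed names and all argument values are hypothetical placeholders prepared elsewhere.

#include <cstdint>
#include <cuda/ptx>

// Hypothetical call site for the .kind::mxf8f6f4, scale_vec::1X overload,
// here using cta_group::2; descriptors and tensor-memory addresses come from the caller.
__device__ void my_issue_vec1x_mma(
  std::uint32_t d_tmem, std::uint64_t a_desc, std::uint64_t b_desc,
  std::uint32_t idesc, std::uint32_t scale_A_tmem, std::uint32_t scale_B_tmem,
  bool enable_input_d)
{
#if __cccl_ptx_isa >= 860
  cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard(
    cuda::ptx::kind_mxf8f6f4, cuda::ptx::cta_group_2,
    d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem, enable_input_d);
#endif
}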
+
+ /*
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc,
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
+ // .cta_group = { .cta_group::1, .cta_group::2 }
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
+ __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard(
+ cuda::ptx::kind_t<Kind> kind,
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
+ uint32_t d_tmem,
+ uint64_t a_desc,
+ uint64_t b_desc,
+ uint32_t idesc,
+ uint32_t scale_A_tmem,
+ uint32_t scale_B_tmem,
+ bool enable_input_d);
+ */
+ #if __cccl_ptx_isa >= 860
+ extern "C" _CCCL_DEVICE void
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__();
+ template <dot_kind _Kind, dot_cta_group _Cta_Group>
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard(
+ kind_t<_Kind> __kind,
+ cta_group_t<_Cta_Group> __cta_group,
+ _CUDA_VSTD::uint32_t __d_tmem,
+ _CUDA_VSTD::uint64_t __a_desc,
+ _CUDA_VSTD::uint64_t __b_desc,
+ _CUDA_VSTD::uint32_t __idesc,
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
+ bool __enable_input_d)
+ {
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], "
+ "[%5], PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__scale_A_tmem),
+ "r"(__scale_B_tmem),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], "
+ "[%5], PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__scale_A_tmem),
+ "r"(__scale_B_tmem),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], "
+ "[%5], PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__scale_A_tmem),
+ "r"(__scale_B_tmem),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
+ {
+ asm volatile(
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], "
+ "[%5], PRED_enable_input_d;\n\t"
+ "}"
+ :
+ : "r"(__d_tmem),
+ "l"(__a_desc),
+ "l"(__b_desc),
+ "r"(__idesc),
+ "r"(__scale_A_tmem),
+ "r"(__scale_B_tmem),
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+ : "memory");
+ }
+ # else
+ // Unsupported architectures will have a linker error with a semi-decent error message
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__();
+ # endif
+ }
+ #endif // __cccl_ptx_isa >= 860
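
Unlike the single-kind overloads, the scale_vec::2X form above is parameterized on the kind tag, which selects the PTX .kind:: qualifier at compile time via if constexpr; only kind_mxf4 and kind_mxf4nvf4 pass the static_assert. A minimal hedged sketch follows, assuming the tag values cuda::ptx::kind_mxf4 and cuda::ptx::cta_group_1; my_-prefixed names and the argument values are hypothetical.

#include <cstdint>
#include <cuda/ptx>

// Hypothetical call site: the first argument picks .kind::mxf4 here;
// passing cuda::ptx::kind_mxf4nvf4 instead would select the mxf4nvf4 branch.
__device__ void my_issue_vec2x_mma(
  std::uint32_t d_tmem, std::uint64_t a_desc, std::uint64_t b_desc,
  std::uint32_t idesc, std::uint32_t scale_A_tmem, std::uint32_t scale_B_tmem)
{
#if __cccl_ptx_isa >= 860
  cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard(
    cuda::ptx::kind_mxf4, cuda::ptx::cta_group_1,
    d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem,
    false);  // enable_input_d = false: D is written without reading its prior contents
#endif
}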
3982
+
3983
+ /*
3984
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc,
3985
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a
3986
+ // .kind = { .kind::mxf4nvf4 }
3987
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3988
+ template <cuda::ptx::dot_cta_group Cta_Group>
3989
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard(
3990
+ cuda::ptx::kind_mxf4nvf4_t,
3991
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3992
+ uint32_t d_tmem,
3993
+ uint64_t a_desc,
3994
+ uint64_t b_desc,
3995
+ uint32_t idesc,
3996
+ uint32_t scale_A_tmem,
3997
+ uint32_t scale_B_tmem,
3998
+ bool enable_input_d);
3999
+ */
4000
+ #if __cccl_ptx_isa >= 860
4001
+ extern "C" _CCCL_DEVICE void
4002
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__();
4003
+ template <dot_cta_group _Cta_Group>
4004
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard(
4005
+ kind_mxf4nvf4_t,
4006
+ cta_group_t<_Cta_Group> __cta_group,
4007
+ _CUDA_VSTD::uint32_t __d_tmem,
4008
+ _CUDA_VSTD::uint64_t __a_desc,
4009
+ _CUDA_VSTD::uint64_t __b_desc,
4010
+ _CUDA_VSTD::uint32_t __idesc,
4011
+ _CUDA_VSTD::uint32_t __scale_A_tmem,
4012
+ _CUDA_VSTD::uint32_t __scale_B_tmem,
4013
+ bool __enable_input_d)
4014
+ {
4015
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
4016
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
4017
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
4018
+ if constexpr (__cta_group == cta_group_1)
4019
+ {
4020
+ asm volatile(
4021
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
4022
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
4023
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], "
4024
+ "[%5], PRED_enable_input_d;\n\t"
4025
+ "}"
4026
+ :
4027
+ : "r"(__d_tmem),
4028
+ "l"(__a_desc),
4029
+ "l"(__b_desc),
4030
+ "r"(__idesc),
4031
+ "r"(__scale_A_tmem),
4032
+ "r"(__scale_B_tmem),
4033
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
4034
+ : "memory");
4035
+ }
4036
+ else if constexpr (__cta_group == cta_group_2)
4037
+ {
4038
+ asm volatile(
4039
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
4040
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
4041
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], "
4042
+ "[%5], PRED_enable_input_d;\n\t"
4043
+ "}"
4044
+ :
4045
+ : "r"(__d_tmem),
4046
+ "l"(__a_desc),
4047
+ "l"(__b_desc),
4048
+ "r"(__idesc),
4049
+ "r"(__scale_A_tmem),
4050
+ "r"(__scale_B_tmem),
4051
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
4052
+ : "memory");
4053
+ }
4054
+ # else
4055
+ // Unsupported architectures will have a linker error with a semi-decent error message
4056
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__();
4057
+ # endif
4058
+ }
4059
+ #endif // __cccl_ptx_isa >= 860
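
Because cta_group is a template-tag parameter of the overload above, a caller can also forward it generically instead of hard-coding cta_group::1 or cta_group::2. A minimal hedged sketch under that assumption; the my_-prefixed forwarder and its arguments are hypothetical.

#include <cstdint>
#include <cuda/ptx>

// Hypothetical generic forwarder over the cta_group tag; Cta_Group is the same
// non-type template parameter (cuda::ptx::dot_cta_group) used by the wrapper above.
template <cuda::ptx::dot_cta_group Cta_Group>
__device__ void my_issue_vec4x_tmem_a_mma(
  cuda::ptx::cta_group_t<Cta_Group> group,
  std::uint32_t d_tmem, std::uint64_t a_desc, std::uint64_t b_desc,
  std::uint32_t idesc, std::uint32_t scale_A_tmem, std::uint32_t scale_B_tmem,
  bool enable_input_d)
{
#if __cccl_ptx_isa >= 860
  cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard(
    cuda::ptx::kind_mxf4nvf4, group,
    d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem, enable_input_d);
#endif
}

A kernel would then pass cuda::ptx::cta_group_1 or cuda::ptx::cta_group_2 as the first argument and the static_assert in the wrapper rejects anything else at compile time.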
4060
+
4061
+ #endif // _CUDA_PTX_GENERATED_TCGEN05_MMA_H_