cuda_cccl-0.3.3-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic.
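The file manifest below can also be reproduced locally from an installed copy of the wheel. A minimal sketch, assuming the cuda-cccl distribution is installed in the current Python environment (the header-path filter is illustrative only):

    from importlib import metadata

    # List every file the installed cuda-cccl distribution ships, then filter
    # down to the bundled headers under cuda/cccl/headers/include/.
    files = metadata.files("cuda-cccl") or []
    headers = [f for f in files if str(f).startswith("cuda/cccl/headers/include/")]
    print(f"{len(files)} files total, {len(headers)} bundled headers")
    for path in headers[:10]:  # show the first few header paths
        print(path)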

Files changed (1968)
  1. cuda/cccl/__init__.py +27 -0
  2. cuda/cccl/_cuda_version_utils.py +24 -0
  3. cuda/cccl/cooperative/__init__.py +9 -0
  4. cuda/cccl/cooperative/experimental/__init__.py +24 -0
  5. cuda/cccl/headers/__init__.py +7 -0
  6. cuda/cccl/headers/include/__init__.py +1 -0
  7. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +259 -0
  8. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1182 -0
  9. cuda/cccl/headers/include/cub/agent/agent_for.cuh +81 -0
  10. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +709 -0
  11. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
  12. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +748 -0
  13. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +786 -0
  14. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  15. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +703 -0
  16. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +555 -0
  17. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
  18. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
  19. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
  20. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +589 -0
  21. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +474 -0
  22. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +289 -0
  23. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1117 -0
  24. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
  25. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +606 -0
  26. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
  27. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +631 -0
  28. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
  29. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +963 -0
  30. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1227 -0
  31. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1313 -0
  32. cuda/cccl/headers/include/cub/block/block_histogram.cuh +424 -0
  33. cuda/cccl/headers/include/cub/block/block_load.cuh +1264 -0
  34. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  35. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
  36. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1225 -0
  37. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2196 -0
  38. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  39. cuda/cccl/headers/include/cub/block/block_reduce.cuh +667 -0
  40. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
  41. cuda/cccl/headers/include/cub/block/block_scan.cuh +2315 -0
  42. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  43. cuda/cccl/headers/include/cub/block/block_store.cuh +1247 -0
  44. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
  45. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  46. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  47. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  48. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  49. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
  50. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  51. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  52. cuda/cccl/headers/include/cub/config.cuh +53 -0
  53. cuda/cccl/headers/include/cub/cub.cuh +120 -0
  54. cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
  55. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
  56. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
  57. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  58. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
  59. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
  60. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
  61. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
  62. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
  63. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +114 -0
  64. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  65. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  66. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
  67. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  68. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
  69. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
  70. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  71. cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
  72. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  73. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  74. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  75. cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
  76. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
  77. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  78. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  79. cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
  80. cuda/cccl/headers/include/cub/device/device_for.cuh +1063 -0
  81. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
  82. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  83. cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
  84. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  85. cuda/cccl/headers/include/cub/device/device_partition.cuh +668 -0
  86. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
  87. cuda/cccl/headers/include/cub/device/device_reduce.cuh +2518 -0
  88. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
  89. cuda/cccl/headers/include/cub/device/device_scan.cuh +2212 -0
  90. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  91. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1430 -0
  92. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  93. cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
  94. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  95. cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1744 -0
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1310 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +531 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +517 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +975 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +440 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +627 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +569 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +261 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +583 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +189 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +321 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +522 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1028 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +67 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +118 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +60 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +275 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +76 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +126 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1065 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +942 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +673 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +618 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1010 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +398 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +440 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +481 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +884 -0
  149. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
  150. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  151. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  152. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
  153. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
  154. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
  155. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
  156. cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
  157. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
  158. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +548 -0
  159. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
  160. cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
  161. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
  162. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
  163. cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
  164. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  165. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  166. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  167. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  168. cuda/cccl/headers/include/cub/util_device.cuh +800 -0
  169. cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
  170. cuda/cccl/headers/include/cub/util_math.cuh +118 -0
  171. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  172. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  173. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  174. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  175. cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
  176. cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
  177. cuda/cccl/headers/include/cub/version.cuh +89 -0
  178. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  179. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  180. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +737 -0
  181. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +408 -0
  182. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
  184. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  185. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  186. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  187. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +829 -0
  188. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1890 -0
  189. cuda/cccl/headers/include/cub/warp/warp_store.cuh +521 -0
  190. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
  191. cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
  192. cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
  193. cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
  194. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
  201. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  202. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  203. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  204. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +487 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  209. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  210. cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
  211. cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
  212. cuda/cccl/headers/include/cuda/__cccl_config +37 -0
  213. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  214. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  217. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  218. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  225. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
  226. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  227. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  228. cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
  229. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  230. cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
  231. cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
  232. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  233. cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
  234. cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
  235. cuda/cccl/headers/include/cuda/__driver/driver_api.h +835 -0
  236. cuda/cccl/headers/include/cuda/__event/event.h +171 -0
  237. cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
  238. cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
  239. cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
  240. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
  241. cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
  242. cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
  243. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  244. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
  245. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  246. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  247. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  248. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  249. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  250. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  251. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  252. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  253. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  254. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
  255. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
  256. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
  257. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
  258. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
  259. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
  260. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
  261. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
  262. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
  263. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
  264. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
  265. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  266. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
  267. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
  268. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  269. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  270. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +533 -0
  271. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
  272. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
  273. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
  274. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  275. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  276. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  277. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  278. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
  279. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
  280. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
  281. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  282. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  283. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
  284. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  285. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  286. cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
  287. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  288. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  289. cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
  290. cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
  291. cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
  292. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
  293. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  294. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  295. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  296. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
  297. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
  298. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
  299. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
  300. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +652 -0
  301. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
  302. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  303. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
  304. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  305. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
  306. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2983 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  383. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  384. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  385. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  386. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  387. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  388. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  389. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  390. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  391. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  392. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  393. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  394. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  395. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  396. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  397. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  398. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  399. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  400. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  401. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  402. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  403. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  404. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  405. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  406. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  407. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  408. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  409. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  410. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  411. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  412. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  413. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  414. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  415. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
  416. cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
  417. cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
  418. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
  419. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  420. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  421. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  422. cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
  423. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
  424. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  425. cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
  426. cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
  427. cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
  428. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
  429. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
  430. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
  431. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
  432. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
  433. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
  434. cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
  435. cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
  436. cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
  437. cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
  438. cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
  439. cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
  440. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
  441. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
  442. cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
  443. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
  444. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
  445. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
  446. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
  447. cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
  448. cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
  449. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  450. cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
  451. cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
  452. cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
  453. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  454. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  455. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
  456. cuda/cccl/headers/include/cuda/access_property +26 -0
  457. cuda/cccl/headers/include/cuda/algorithm +27 -0
  458. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  459. cuda/cccl/headers/include/cuda/atomic +27 -0
  460. cuda/cccl/headers/include/cuda/barrier +267 -0
  461. cuda/cccl/headers/include/cuda/bit +29 -0
  462. cuda/cccl/headers/include/cuda/cmath +37 -0
  463. cuda/cccl/headers/include/cuda/devices +33 -0
  464. cuda/cccl/headers/include/cuda/discard_memory +32 -0
  465. cuda/cccl/headers/include/cuda/functional +32 -0
  466. cuda/cccl/headers/include/cuda/iterator +39 -0
  467. cuda/cccl/headers/include/cuda/latch +27 -0
  468. cuda/cccl/headers/include/cuda/mdspan +28 -0
  469. cuda/cccl/headers/include/cuda/memory +35 -0
  470. cuda/cccl/headers/include/cuda/memory_resource +35 -0
  471. cuda/cccl/headers/include/cuda/numeric +29 -0
  472. cuda/cccl/headers/include/cuda/pipeline +579 -0
  473. cuda/cccl/headers/include/cuda/ptx +129 -0
  474. cuda/cccl/headers/include/cuda/semaphore +31 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
  503. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
  504. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  505. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  506. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  507. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  508. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  509. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  510. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  511. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  512. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  513. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  514. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  515. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
  516. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  517. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
  518. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  519. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  520. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  521. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  522. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
  523. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  524. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
  525. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  526. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  527. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  528. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  529. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  530. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  531. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  532. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  533. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
  534. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  535. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
  536. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
  537. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
  538. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  539. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  540. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  541. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  542. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  543. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  544. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  545. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  546. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  547. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  548. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  549. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  550. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  551. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  552. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  553. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  554. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  555. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  556. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  557. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  558. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  559. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  560. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  561. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  562. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  563. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  564. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  565. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  566. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  567. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  568. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  569. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  570. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  571. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  572. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
  573. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  574. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  575. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
  576. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  577. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  578. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  579. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  580. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  581. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  582. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  583. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  584. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  585. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
  586. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
  587. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  588. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
  589. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  590. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  591. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  592. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  593. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  594. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  595. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  596. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
  597. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
  598. cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
  599. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  600. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  601. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  602. cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
  603. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  604. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
  605. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  606. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  607. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  608. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
  609. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
  610. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +673 -0
  611. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  612. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
  613. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
  614. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
  615. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
  616. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
  617. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
  618. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +91 -0
  619. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
  620. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
  621. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  622. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  623. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  624. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  625. cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
  626. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
  627. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
  628. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
  629. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  630. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  631. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  632. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  633. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  634. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
  635. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  636. cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
  637. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  638. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  639. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  640. cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
  641. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
  642. cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
  643. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
  644. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
  645. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
  646. cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
  647. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
  648. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
  649. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +259 -0
  650. cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
  651. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
  652. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
  653. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
  654. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
  655. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
  656. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  657. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
  658. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  659. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  660. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
  661. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  662. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  663. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  664. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  665. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +186 -0
  666. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  667. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
  668. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
  669. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
  670. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  671. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  672. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  673. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  674. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  675. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
  676. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
  677. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  678. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  679. cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
  680. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
  681. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  682. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
  683. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  684. cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
  685. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
  686. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  687. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  688. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  689. cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
  690. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  691. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  692. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
  693. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  694. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  695. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  696. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
  697. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  698. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  699. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
  700. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
  701. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
  702. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  703. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
  704. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  705. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  706. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  707. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  708. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  709. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  710. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  711. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  712. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
  713. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  714. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  715. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  716. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  717. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  718. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  719. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  720. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  721. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
  722. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
  723. cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
  724. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  725. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +139 -0
  726. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  727. cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
  728. cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
  729. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  730. cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
  731. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
  732. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  733. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +165 -0
  734. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  735. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
  736. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  737. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  738. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
  739. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
  740. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
  741. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  742. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  743. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
  744. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
  745. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  746. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  747. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  748. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  749. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  750. cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
  751. cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
  752. cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
  753. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
  754. cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
  755. cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
  756. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  757. cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
  758. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
  759. cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
  760. cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
  761. cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
  762. cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
  763. cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
  764. cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
  765. cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
  766. cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
  767. cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
  768. cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
  769. cuda/cccl/headers/include/cuda/std/__format_ +45 -0
  770. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  771. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  772. cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
  773. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  774. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  775. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  776. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  777. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  778. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  779. cuda/cccl/headers/include/cuda/std/__functional/function.h +1275 -0
  780. cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
  781. cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
  782. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
  783. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
  784. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
  785. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
  786. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  787. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  788. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
  789. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
  790. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
  791. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  792. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  793. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  794. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  795. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  796. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
  797. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
  798. cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
  799. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
  800. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
  801. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  802. cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
  803. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  804. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  805. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  806. cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
  807. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
  808. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  809. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  810. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
  811. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  812. cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
  813. cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
  814. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
  815. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  816. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  817. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  818. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  819. cuda/cccl/headers/include/cuda/std/__internal/features.h +72 -0
  820. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
  821. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  822. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  823. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
  824. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
  825. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  826. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  827. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  828. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  829. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  830. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  831. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  832. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
  833. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
  834. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  835. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
  836. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
  837. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
  838. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  839. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  840. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  841. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
  842. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  843. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
  844. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  845. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  846. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
  847. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  848. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  849. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
  850. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  851. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
  852. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  853. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
  854. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  855. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  856. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  857. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
  858. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  859. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  860. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
  861. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  862. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  863. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  864. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  865. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
  866. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
  867. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  868. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  869. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
  870. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +759 -0
  871. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
  872. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
  873. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  874. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
  875. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  876. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  877. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  878. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
  879. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  880. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  881. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  882. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  883. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  884. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  885. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +525 -0
  886. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  887. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  888. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
  889. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +246 -0
  890. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  891. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
  892. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
  893. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  894. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  895. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
  896. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
  897. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
  898. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  899. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  900. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  901. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  902. cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
  903. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  904. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  905. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  906. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  907. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
  908. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  909. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  910. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  911. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
  912. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  913. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  914. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  915. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  916. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  917. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
  918. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  919. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  920. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  921. cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
  922. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
  923. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  924. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  925. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  926. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  927. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
  928. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  929. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  930. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  931. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  932. cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
  933. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  934. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
  935. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  936. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  937. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  938. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  939. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  940. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  941. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
  942. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  943. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  944. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  945. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
  946. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
  947. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
  948. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  949. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
  950. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  951. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  952. cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
  953. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  954. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
  955. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  956. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  957. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  958. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
  959. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  960. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
  961. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  962. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  963. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  964. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  965. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  966. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  967. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  968. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  969. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
  970. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  971. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  972. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
  973. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  974. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  975. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
  976. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
  977. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
  978. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  979. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  980. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +84 -0
  981. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
  982. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  983. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  984. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  1016. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  1017. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
  1018. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  1019. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  1020. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  1021. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
  1022. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
  1023. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  1024. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  1025. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  1026. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
  1027. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  1028. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  1029. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  1030. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  1031. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  1032. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  1033. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
  1034. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
  1035. cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
  1036. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  1037. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
  1038. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
  1039. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  1040. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  1041. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  1042. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  1043. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  1044. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  1045. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  1046. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
  1047. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  1048. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  1049. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  1050. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  1051. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  1052. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  1053. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  1054. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  1055. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  1056. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  1057. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  1058. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  1059. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  1060. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  1061. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  1062. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  1063. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  1064. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
  1065. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  1066. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
  1067. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  1068. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
  1069. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
  1070. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  1071. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  1072. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  1073. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
  1074. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  1075. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  1076. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  1077. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  1078. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
  1079. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
  1080. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
  1081. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
  1082. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
  1083. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
  1084. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
  1085. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
  1086. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
  1087. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
  1088. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
  1089. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
  1090. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  1091. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
  1092. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  1093. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  1094. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  1095. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  1096. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  1097. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  1098. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  1099. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  1100. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  1101. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  1102. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  1103. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  1104. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  1105. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
  1106. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  1107. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  1108. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1109. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1110. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1111. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1112. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1113. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1114. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1115. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1116. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1117. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1118. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1120. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1121. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1122. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
  1123. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
  1124. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1125. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1126. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
  1127. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1128. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1129. cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
  1130. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1131. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1132. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1133. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1134. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
  1135. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1136. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1137. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1138. cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
  1139. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1140. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
  1141. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1142. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1143. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1144. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1145. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1146. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1147. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1148. cuda/cccl/headers/include/cuda/std/array +518 -0
  1149. cuda/cccl/headers/include/cuda/std/atomic +810 -0
  1150. cuda/cccl/headers/include/cuda/std/barrier +42 -0
  1151. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1152. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1153. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1154. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1155. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1156. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1157. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1158. cuda/cccl/headers/include/cuda/std/cmath +87 -0
  1159. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1160. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1161. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1162. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1163. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1164. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1165. cuda/cccl/headers/include/cuda/std/ctime +154 -0
  1166. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1167. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
  1168. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
  1169. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
  1170. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1171. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1172. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
  1173. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2144 -0
  1174. cuda/cccl/headers/include/cuda/std/execution +29 -0
  1175. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1176. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1177. cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
  1178. cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
  1179. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1180. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1181. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1182. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1183. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1184. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1185. cuda/cccl/headers/include/cuda/std/numbers +346 -0
  1186. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1187. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1188. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1189. cuda/cccl/headers/include/cuda/std/ratio +416 -0
  1190. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1191. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1192. cuda/cccl/headers/include/cuda/std/span +628 -0
  1193. cuda/cccl/headers/include/cuda/std/string_view +925 -0
  1194. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1195. cuda/cccl/headers/include/cuda/std/type_traits +177 -0
  1196. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1197. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1198. cuda/cccl/headers/include/cuda/std/version +240 -0
  1199. cuda/cccl/headers/include/cuda/stream +31 -0
  1200. cuda/cccl/headers/include/cuda/stream_ref +59 -0
  1201. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1202. cuda/cccl/headers/include/cuda/utility +28 -0
  1203. cuda/cccl/headers/include/cuda/version +16 -0
  1204. cuda/cccl/headers/include/cuda/warp +28 -0
  1205. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1206. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1207. cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
  1208. cuda/cccl/headers/include/nv/target +240 -0
  1209. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1210. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1211. cuda/cccl/headers/include/thrust/advance.h +57 -0
  1212. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1213. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1214. cuda/cccl/headers/include/thrust/complex.h +858 -0
  1215. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1216. cuda/cccl/headers/include/thrust/count.h +245 -0
  1217. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1218. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1219. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +626 -0
  1220. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +192 -0
  1221. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +96 -0
  1222. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +81 -0
  1223. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +78 -0
  1224. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1225. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +115 -0
  1226. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +116 -0
  1227. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
  1228. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1229. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1230. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1231. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1232. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1233. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1234. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1235. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1236. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1237. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1238. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1239. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1240. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1241. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1242. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1243. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1244. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1245. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1246. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1247. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1248. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1249. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1250. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1251. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1252. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1253. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1254. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1255. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1256. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1257. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1258. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1259. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1260. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1261. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1262. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1263. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1264. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1265. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1266. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1267. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1268. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1269. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1270. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1271. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1272. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +81 -0
  1273. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1274. cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
  1275. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1276. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1277. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1278. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1279. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1280. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1281. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1282. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1283. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1284. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1285. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1286. cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
  1287. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1288. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1289. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1290. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1291. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1292. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1293. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1294. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1295. cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
  1296. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1297. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1298. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1299. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1300. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1301. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
  1302. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1303. cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
  1304. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1305. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1306. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1307. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1308. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1309. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1310. cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
  1311. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1312. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1313. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1314. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1315. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1316. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1317. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1318. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1319. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1320. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1321. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1322. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1323. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1324. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1325. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1326. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1327. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1328. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1329. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1330. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1331. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1332. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1333. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1334. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1335. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1336. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1337. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1338. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1339. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1340. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1341. cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
  1342. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
  1343. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1344. cuda/cccl/headers/include/thrust/device_delete.h +74 -0
  1345. cuda/cccl/headers/include/thrust/device_free.h +85 -0
  1346. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1347. cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
  1348. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1349. cuda/cccl/headers/include/thrust/device_new.h +112 -0
  1350. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1351. cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
  1352. cuda/cccl/headers/include/thrust/device_reference.h +983 -0
  1353. cuda/cccl/headers/include/thrust/device_vector.h +576 -0
  1354. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1355. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1356. cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
  1357. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1358. cuda/cccl/headers/include/thrust/fill.h +200 -0
  1359. cuda/cccl/headers/include/thrust/find.h +382 -0
  1360. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1361. cuda/cccl/headers/include/thrust/functional.h +395 -0
  1362. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1363. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1364. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1365. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1366. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
  1367. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1368. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1369. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1370. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1371. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1372. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1373. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
  1374. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
  1375. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1376. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
  1377. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
  1378. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1379. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1380. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1381. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1382. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1383. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1384. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
  1385. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
  1386. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
  1387. cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
  1388. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
  1389. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1390. cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
  1391. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
  1392. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
  1393. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1394. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
  1395. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1396. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1397. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1398. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
  1399. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1400. cuda/cccl/headers/include/thrust/memory.h +299 -0
  1401. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1402. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1403. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1404. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1405. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +528 -0
  1406. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1407. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
  1408. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1409. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1410. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1411. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1412. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1413. cuda/cccl/headers/include/thrust/mr/pool.h +528 -0
  1414. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1415. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1416. cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
  1417. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1418. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1419. cuda/cccl/headers/include/thrust/pair.h +99 -0
  1420. cuda/cccl/headers/include/thrust/partition.h +1391 -0
  1421. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1422. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1423. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1424. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1425. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1426. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1427. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1428. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1429. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1430. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1431. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1432. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1433. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
  1434. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1435. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
  1436. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1437. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1438. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1439. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1440. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1441. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1442. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1443. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1444. cuda/cccl/headers/include/thrust/random.h +120 -0
  1445. cuda/cccl/headers/include/thrust/reduce.h +1113 -0
  1446. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1447. cuda/cccl/headers/include/thrust/replace.h +826 -0
  1448. cuda/cccl/headers/include/thrust/reverse.h +215 -0
  1449. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1450. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1451. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1452. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1453. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1454. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1455. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1456. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1457. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1458. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1459. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1460. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1461. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1462. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1463. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
  1464. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1465. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1466. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1467. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1468. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1469. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1470. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1471. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1472. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1473. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1474. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1475. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1476. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1477. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1478. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1479. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1480. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1481. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1482. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1483. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1484. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1485. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1486. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1487. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1488. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1489. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1490. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1491. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1492. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1493. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1494. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1495. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1496. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1497. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1498. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1499. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1500. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1501. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
  1502. cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
  1503. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
  1504. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
  1505. cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
  1506. cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
  1507. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1508. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1509. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1510. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1511. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +273 -0
  1512. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1513. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1514. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1515. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
  1516. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1517. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1518. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +233 -0
  1519. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1520. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1521. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
  1522. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1523. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
  1524. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +170 -0
  1525. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1526. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1527. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
  1528. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1529. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1530. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
  1531. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1532. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1533. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1534. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1535. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1536. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +223 -0
  1537. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1538. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1539. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1540. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
  1541. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
  1542. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1543. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
  1544. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
  1545. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +341 -0
  1546. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
  1547. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
  1548. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1549. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
  1550. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +469 -0
  1551. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
  1552. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
  1553. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1554. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1555. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
  1556. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1557. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1558. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
  1559. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
  1560. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
  1561. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
  1562. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
  1563. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1564. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
  1565. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1566. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1567. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1568. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
  1675. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
  1676. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
  1677. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
  1678. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
  1679. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
  1680. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
  1681. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
  1682. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
  1683. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
  1684. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1685. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
  1686. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
  1687. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
  1688. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
  1689. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
  1690. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
  1691. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
  1692. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
  1693. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
  1694. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
  1695. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
  1696. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
  1697. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
  1698. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
  1699. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
  1700. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
  1701. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
  1702. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
  1703. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
  1704. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
  1705. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1706. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1707. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
  1708. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
  1709. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1710. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
  1711. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
  1712. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1713. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
  1714. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1715. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
  1716. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1717. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
  1718. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
  1719. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1720. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
  1721. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
  1722. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
  1723. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1724. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
  1725. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1726. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
  1727. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
  1728. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
  1729. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1730. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1731. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
  1732. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
  1733. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1734. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1735. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
  1736. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
  1737. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1738. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
  1739. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
  1740. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
  1741. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
  1742. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
  1743. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
  1744. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1745. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1746. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1747. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1748. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1749. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1750. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
  1751. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1752. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1753. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
  1754. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
  1755. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1756. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1757. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1758. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1759. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1760. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1761. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1762. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1763. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1764. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1765. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1766. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1767. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1768. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
  1769. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1770. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1771. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1772. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1773. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1774. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1775. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1776. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1777. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1778. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1779. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1780. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1782. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1783. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1784. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1785. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1786. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1787. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1788. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1789. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1790. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1791. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1792. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1793. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1794. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1795. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1796. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +73 -0
  1799. cuda/cccl/headers/include/thrust/system/omp/detail/scan.inl +172 -0
  1800. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +36 -0
  1801. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1802. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1803. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1804. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1805. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1806. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1807. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1808. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1809. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1810. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1811. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1812. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1813. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1814. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1815. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1816. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1817. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1818. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
  1819. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1820. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1821. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1822. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1823. cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
  1824. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1825. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1826. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1827. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1828. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1829. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1830. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1831. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1832. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1833. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
  1834. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1835. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1836. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1837. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1838. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1839. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1840. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1841. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1842. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1843. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1844. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1845. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1846. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1847. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1848. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1849. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1850. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1851. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1852. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1853. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1854. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1855. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1856. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1857. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1858. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1859. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1860. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1861. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1862. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1863. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1864. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +33 -0
  1865. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1866. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1867. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1868. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1869. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1870. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1871. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1872. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1873. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1874. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1875. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1876. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1877. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1878. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1879. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1880. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1881. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1882. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
  1883. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1884. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1885. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1886. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1887. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1888. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1889. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1890. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1891. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1892. cuda/cccl/headers/include/thrust/tuple.h +139 -0
  1893. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1894. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1895. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1896. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1897. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1898. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1899. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1900. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
  1901. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1902. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1903. cuda/cccl/headers/include/thrust/unique.h +1088 -0
  1904. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1905. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1906. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1907. cuda/cccl/headers/include/thrust/version.h +93 -0
  1908. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1909. cuda/cccl/headers/include_paths.py +51 -0
  1910. cuda/cccl/parallel/__init__.py +9 -0
  1911. cuda/cccl/parallel/experimental/__init__.py +24 -0
  1912. cuda/cccl/py.typed +0 -0
  1913. cuda/compute/__init__.py +79 -0
  1914. cuda/compute/_bindings.py +79 -0
  1915. cuda/compute/_bindings.pyi +475 -0
  1916. cuda/compute/_bindings_impl.pyx +2273 -0
  1917. cuda/compute/_caching.py +71 -0
  1918. cuda/compute/_cccl_interop.py +422 -0
  1919. cuda/compute/_utils/__init__.py +0 -0
  1920. cuda/compute/_utils/protocols.py +132 -0
  1921. cuda/compute/_utils/temp_storage_buffer.py +86 -0
  1922. cuda/compute/algorithms/__init__.py +54 -0
  1923. cuda/compute/algorithms/_histogram.py +243 -0
  1924. cuda/compute/algorithms/_merge_sort.py +225 -0
  1925. cuda/compute/algorithms/_radix_sort.py +312 -0
  1926. cuda/compute/algorithms/_reduce.py +182 -0
  1927. cuda/compute/algorithms/_scan.py +331 -0
  1928. cuda/compute/algorithms/_segmented_reduce.py +257 -0
  1929. cuda/compute/algorithms/_three_way_partition.py +261 -0
  1930. cuda/compute/algorithms/_transform.py +329 -0
  1931. cuda/compute/algorithms/_unique_by_key.py +252 -0
  1932. cuda/compute/cccl/.gitkeep +0 -0
  1933. cuda/compute/cu12/_bindings_impl.cp313-win_amd64.pyd +0 -0
  1934. cuda/compute/cu12/cccl/cccl.c.parallel.dll +0 -0
  1935. cuda/compute/cu12/cccl/cccl.c.parallel.lib +0 -0
  1936. cuda/compute/cu13/_bindings_impl.cp313-win_amd64.pyd +0 -0
  1937. cuda/compute/cu13/cccl/cccl.c.parallel.dll +0 -0
  1938. cuda/compute/cu13/cccl/cccl.c.parallel.lib +0 -0
  1939. cuda/compute/iterators/__init__.py +21 -0
  1940. cuda/compute/iterators/_factories.py +219 -0
  1941. cuda/compute/iterators/_iterators.py +817 -0
  1942. cuda/compute/iterators/_zip_iterator.py +199 -0
  1943. cuda/compute/numba_utils.py +53 -0
  1944. cuda/compute/op.py +3 -0
  1945. cuda/compute/struct.py +272 -0
  1946. cuda/compute/typing.py +37 -0
  1947. cuda/coop/__init__.py +8 -0
  1948. cuda/coop/_caching.py +48 -0
  1949. cuda/coop/_common.py +275 -0
  1950. cuda/coop/_nvrtc.py +92 -0
  1951. cuda/coop/_scan_op.py +181 -0
  1952. cuda/coop/_types.py +937 -0
  1953. cuda/coop/_typing.py +107 -0
  1954. cuda/coop/block/__init__.py +39 -0
  1955. cuda/coop/block/_block_exchange.py +251 -0
  1956. cuda/coop/block/_block_load_store.py +215 -0
  1957. cuda/coop/block/_block_merge_sort.py +125 -0
  1958. cuda/coop/block/_block_radix_sort.py +214 -0
  1959. cuda/coop/block/_block_reduce.py +294 -0
  1960. cuda/coop/block/_block_scan.py +983 -0
  1961. cuda/coop/warp/__init__.py +9 -0
  1962. cuda/coop/warp/_warp_merge_sort.py +92 -0
  1963. cuda/coop/warp/_warp_reduce.py +153 -0
  1964. cuda/coop/warp/_warp_scan.py +78 -0
  1965. cuda_cccl-0.3.3.dist-info/METADATA +41 -0
  1966. cuda_cccl-0.3.3.dist-info/RECORD +1968 -0
  1967. cuda_cccl-0.3.3.dist-info/WHEEL +5 -0
  1968. cuda_cccl-0.3.3.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,4291 @@
1
+ // This file was automatically generated. Do not edit.
2
+
3
+ #ifndef _CUDA_PTX_GENERATED_TCGEN05_MMA_H_
4
+ #define _CUDA_PTX_GENERATED_TCGEN05_MMA_H_
5
+
6
+ /*
7
+ // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; //
8
+ PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f
9
+ // .kind = { .kind::f16, .kind::tf32 }
10
+ // .cta_group = { .cta_group::1 }
11
+ template <int N32, cuda::ptx::dot_kind Kind>
12
+ __device__ static inline void tcgen05_mma(
13
+ cuda::ptx::kind_t<Kind> kind,
14
+ cuda::ptx::cta_group_1_t,
15
+ uint32_t d_tmem,
16
+ uint64_t a_desc,
17
+ uint64_t b_desc,
18
+ uint32_t idesc,
19
+ const uint32_t (&disable_output_lane)[4],
20
+ bool enable_input_d,
21
+ cuda::ptx::n32_t<N32> scale_input_d);
22
+ */
23
+ #if __cccl_ptx_isa >= 860
24
+ extern "C" _CCCL_DEVICE void
25
+ __cuda_ptx_tcgen05_mma_is_only_supported_on_SM_100a_100f_103a_103f_depending_on_the_variant__();
26
+ template <int _N32, ::cuda::ptx::dot_kind _Kind>
27
+ _CCCL_DEVICE static inline void tcgen05_mma(
28
+ ::cuda::ptx::kind_t<_Kind> __kind,
29
+ ::cuda::ptx::cta_group_1_t,
30
+ ::cuda::std::uint32_t __d_tmem,
31
+ ::cuda::std::uint64_t __a_desc,
32
+ ::cuda::std::uint64_t __b_desc,
33
+ ::cuda::std::uint32_t __idesc,
34
+ const ::cuda::std::uint32_t (&__disable_output_lane)[4],
35
+ bool __enable_input_d,
36
+ ::cuda::ptx::n32_t<_N32> __scale_input_d)
37
+ {
38
+ static_assert(__kind == kind_f16 || __kind == kind_tf32, "");
39
+ // __cta_group == cta_group_1 (due to parameter type constraint)
40
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
41
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
42
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
43
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1000)) \
44
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1030))
45
+ if constexpr (__kind == kind_f16)
46
+ {
47
+ asm volatile(
48
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
49
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
50
+ "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t"
51
+ "}"
52
+ :
53
+ : "r"(__d_tmem),
54
+ "l"(__a_desc),
55
+ "l"(__b_desc),
56
+ "r"(__idesc),
57
+ "r"(__disable_output_lane[0]),
58
+ "r"(__disable_output_lane[1]),
59
+ "r"(__disable_output_lane[2]),
60
+ "r"(__disable_output_lane[3]),
61
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d)),
62
+ "n"(__scale_input_d.value)
63
+ : "memory");
64
+ }
65
+ else if constexpr (__kind == kind_tf32)
66
+ {
67
+ asm volatile(
68
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
69
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
70
+ "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t"
71
+ "}"
72
+ :
73
+ : "r"(__d_tmem),
74
+ "l"(__a_desc),
75
+ "l"(__b_desc),
76
+ "r"(__idesc),
77
+ "r"(__disable_output_lane[0]),
78
+ "r"(__disable_output_lane[1]),
79
+ "r"(__disable_output_lane[2]),
80
+ "r"(__disable_output_lane[3]),
81
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d)),
82
+ "n"(__scale_input_d.value)
83
+ : "memory");
84
+ }
85
+
86
+ # else
87
+ // Unsupported architectures will have a linker error with a semi-decent error message
88
+ __cuda_ptx_tcgen05_mma_is_only_supported_on_SM_100a_100f_103a_103f_depending_on_the_variant__();
89
+ # endif
90
+ }
91
+ #endif // __cccl_ptx_isa >= 860
92
+
93
+ /*
94
+ // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; //
95
+ PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f
96
+ // .kind = { .kind::f16, .kind::tf32 }
97
+ // .cta_group = { .cta_group::2 }
98
+ template <int N32, cuda::ptx::dot_kind Kind>
99
+ __device__ static inline void tcgen05_mma(
100
+ cuda::ptx::kind_t<Kind> kind,
101
+ cuda::ptx::cta_group_2_t,
102
+ uint32_t d_tmem,
103
+ uint64_t a_desc,
104
+ uint64_t b_desc,
105
+ uint32_t idesc,
106
+ const uint32_t (&disable_output_lane)[8],
107
+ bool enable_input_d,
108
+ cuda::ptx::n32_t<N32> scale_input_d);
109
+ */
110
+ #if __cccl_ptx_isa >= 860
111
+ extern "C" _CCCL_DEVICE void
112
+ __cuda_ptx_tcgen05_mma_is_only_supported_on_SM_100a_100f_103a_103f_depending_on_the_variant__();
113
+ template <int _N32, ::cuda::ptx::dot_kind _Kind>
114
+ _CCCL_DEVICE static inline void tcgen05_mma(
115
+ ::cuda::ptx::kind_t<_Kind> __kind,
116
+ ::cuda::ptx::cta_group_2_t,
117
+ ::cuda::std::uint32_t __d_tmem,
118
+ ::cuda::std::uint64_t __a_desc,
119
+ ::cuda::std::uint64_t __b_desc,
120
+ ::cuda::std::uint32_t __idesc,
121
+ const ::cuda::std::uint32_t (&__disable_output_lane)[8],
122
+ bool __enable_input_d,
123
+ ::cuda::ptx::n32_t<_N32> __scale_input_d)
124
+ {
125
+ static_assert(__kind == kind_f16 || __kind == kind_tf32, "");
126
+ // __cta_group == cta_group_2 (due to parameter type constraint)
127
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
128
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
129
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
130
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1000)) \
131
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1030))
132
+ if constexpr (__kind == kind_f16)
133
+ {
134
+ asm volatile(
135
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
136
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
137
+ "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, PRED_enable_input_d, "
138
+ "%13;\n\t"
139
+ "}"
140
+ :
141
+ : "r"(__d_tmem),
142
+ "l"(__a_desc),
143
+ "l"(__b_desc),
144
+ "r"(__idesc),
145
+ "r"(__disable_output_lane[0]),
146
+ "r"(__disable_output_lane[1]),
147
+ "r"(__disable_output_lane[2]),
148
+ "r"(__disable_output_lane[3]),
149
+ "r"(__disable_output_lane[4]),
150
+ "r"(__disable_output_lane[5]),
151
+ "r"(__disable_output_lane[6]),
152
+ "r"(__disable_output_lane[7]),
153
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d)),
154
+ "n"(__scale_input_d.value)
155
+ : "memory");
156
+ }
157
+ else if constexpr (__kind == kind_tf32)
158
+ {
159
+ asm volatile(
160
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
161
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
162
+ "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, PRED_enable_input_d, "
163
+ "%13;\n\t"
164
+ "}"
165
+ :
166
+ : "r"(__d_tmem),
167
+ "l"(__a_desc),
168
+ "l"(__b_desc),
169
+ "r"(__idesc),
170
+ "r"(__disable_output_lane[0]),
171
+ "r"(__disable_output_lane[1]),
172
+ "r"(__disable_output_lane[2]),
173
+ "r"(__disable_output_lane[3]),
174
+ "r"(__disable_output_lane[4]),
175
+ "r"(__disable_output_lane[5]),
176
+ "r"(__disable_output_lane[6]),
177
+ "r"(__disable_output_lane[7]),
178
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d)),
179
+ "n"(__scale_input_d.value)
180
+ : "memory");
181
+ }
182
+
183
+ # else
184
+ // Unsupported architectures will have a linker error with a semi-decent error message
185
+ __cuda_ptx_tcgen05_mma_is_only_supported_on_SM_100a_100f_103a_103f_depending_on_the_variant__();
186
+ # endif
187
+ }
188
+ #endif // __cccl_ptx_isa >= 860
189
+
190
+ /*
191
+ // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86,
192
+ SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
193
+ // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
194
+ // .cta_group = { .cta_group::1 }
195
+ template <cuda::ptx::dot_kind Kind>
196
+ __device__ static inline void tcgen05_mma(
197
+ cuda::ptx::kind_t<Kind> kind,
198
+ cuda::ptx::cta_group_1_t,
199
+ uint32_t d_tmem,
200
+ uint64_t a_desc,
201
+ uint64_t b_desc,
202
+ uint32_t idesc,
203
+ const uint32_t (&disable_output_lane)[4],
204
+ bool enable_input_d);
205
+ */
206
+ #if __cccl_ptx_isa >= 860
207
+ extern "C" _CCCL_DEVICE void
208
+ __cuda_ptx_tcgen05_mma_is_only_supported_on_SM_100a_100f_103a_103f_110a_110f_depending_on_the_variant__();
209
+ template <::cuda::ptx::dot_kind _Kind>
210
+ _CCCL_DEVICE static inline void tcgen05_mma(
211
+ ::cuda::ptx::kind_t<_Kind> __kind,
212
+ ::cuda::ptx::cta_group_1_t,
213
+ ::cuda::std::uint32_t __d_tmem,
214
+ ::cuda::std::uint64_t __a_desc,
215
+ ::cuda::std::uint64_t __b_desc,
216
+ ::cuda::std::uint32_t __idesc,
217
+ const ::cuda::std::uint32_t (&__disable_output_lane)[4],
218
+ bool __enable_input_d)
219
+ {
220
+ static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, "");
221
+ // __cta_group == cta_group_1 (due to parameter type constraint)
222
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
223
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
224
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
225
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100))) \
226
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1000)) \
227
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1030)) \
228
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1100))
229
+ if constexpr (__kind == kind_f16)
230
+ {
231
+ asm volatile(
232
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
233
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
234
+ "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t"
235
+ "}"
236
+ :
237
+ : "r"(__d_tmem),
238
+ "l"(__a_desc),
239
+ "l"(__b_desc),
240
+ "r"(__idesc),
241
+ "r"(__disable_output_lane[0]),
242
+ "r"(__disable_output_lane[1]),
243
+ "r"(__disable_output_lane[2]),
244
+ "r"(__disable_output_lane[3]),
245
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
246
+ : "memory");
247
+ }
248
+ else if constexpr (__kind == kind_tf32)
249
+ {
250
+ asm volatile(
251
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
252
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
253
+ "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t"
254
+ "}"
255
+ :
256
+ : "r"(__d_tmem),
257
+ "l"(__a_desc),
258
+ "l"(__b_desc),
259
+ "r"(__idesc),
260
+ "r"(__disable_output_lane[0]),
261
+ "r"(__disable_output_lane[1]),
262
+ "r"(__disable_output_lane[2]),
263
+ "r"(__disable_output_lane[3]),
264
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
265
+ : "memory");
266
+ }
267
+ else if constexpr (__kind == kind_f8f6f4)
268
+ {
269
+ asm volatile(
270
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
271
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
272
+ "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t"
273
+ "}"
274
+ :
275
+ : "r"(__d_tmem),
276
+ "l"(__a_desc),
277
+ "l"(__b_desc),
278
+ "r"(__idesc),
279
+ "r"(__disable_output_lane[0]),
280
+ "r"(__disable_output_lane[1]),
281
+ "r"(__disable_output_lane[2]),
282
+ "r"(__disable_output_lane[3]),
283
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
284
+ : "memory");
285
+ }
286
+ # elif _CCCL_CUDA_COMPILER(NVHPC) \
287
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
288
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
289
+ if constexpr (__kind == kind_i8)
290
+ {
291
+ asm volatile(
292
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
293
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
294
+ "tcgen05.mma.cta_group::1.kind::i8 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t"
295
+ "}"
296
+ :
297
+ : "r"(__d_tmem),
298
+ "l"(__a_desc),
299
+ "l"(__b_desc),
300
+ "r"(__idesc),
301
+ "r"(__disable_output_lane[0]),
302
+ "r"(__disable_output_lane[1]),
303
+ "r"(__disable_output_lane[2]),
304
+ "r"(__disable_output_lane[3]),
305
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
306
+ : "memory");
307
+ }
308
+
309
+ # else
310
+ // Unsupported architectures will have a linker error with a semi-decent error message
311
+ __cuda_ptx_tcgen05_mma_is_only_supported_on_SM_100a_100f_103a_103f_110a_110f_depending_on_the_variant__();
312
+ # endif
313
+ }
314
+ #endif // __cccl_ptx_isa >= 860
315
+
316
+ /*
317
+ // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86,
318
+ SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
319
+ // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
320
+ // .cta_group = { .cta_group::2 }
321
+ template <cuda::ptx::dot_kind Kind>
322
+ __device__ static inline void tcgen05_mma(
323
+ cuda::ptx::kind_t<Kind> kind,
324
+ cuda::ptx::cta_group_2_t,
325
+ uint32_t d_tmem,
326
+ uint64_t a_desc,
327
+ uint64_t b_desc,
328
+ uint32_t idesc,
329
+ const uint32_t (&disable_output_lane)[8],
330
+ bool enable_input_d);
331
+ */
332
+ #if __cccl_ptx_isa >= 860
333
+ extern "C" _CCCL_DEVICE void
334
+ __cuda_ptx_tcgen05_mma_is_only_supported_on_SM_100a_100f_103a_103f_110a_110f_depending_on_the_variant__();
335
+ template <::cuda::ptx::dot_kind _Kind>
336
+ _CCCL_DEVICE static inline void tcgen05_mma(
337
+ ::cuda::ptx::kind_t<_Kind> __kind,
338
+ ::cuda::ptx::cta_group_2_t,
339
+ ::cuda::std::uint32_t __d_tmem,
340
+ ::cuda::std::uint64_t __a_desc,
341
+ ::cuda::std::uint64_t __b_desc,
342
+ ::cuda::std::uint32_t __idesc,
343
+ const ::cuda::std::uint32_t (&__disable_output_lane)[8],
344
+ bool __enable_input_d)
345
+ {
346
+ static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, "");
347
+ // __cta_group == cta_group_2 (due to parameter type constraint)
348
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
349
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
350
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
351
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100))) \
352
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1000)) \
353
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1030)) \
354
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1100))
355
+ if constexpr (__kind == kind_f16)
356
+ {
357
+ asm volatile(
358
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
359
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
360
+ "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, "
361
+ "PRED_enable_input_d;\n\t"
362
+ "}"
363
+ :
364
+ : "r"(__d_tmem),
365
+ "l"(__a_desc),
366
+ "l"(__b_desc),
367
+ "r"(__idesc),
368
+ "r"(__disable_output_lane[0]),
369
+ "r"(__disable_output_lane[1]),
370
+ "r"(__disable_output_lane[2]),
371
+ "r"(__disable_output_lane[3]),
372
+ "r"(__disable_output_lane[4]),
373
+ "r"(__disable_output_lane[5]),
374
+ "r"(__disable_output_lane[6]),
375
+ "r"(__disable_output_lane[7]),
376
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
377
+ : "memory");
378
+ }
379
+ else if constexpr (__kind == kind_tf32)
380
+ {
381
+ asm volatile(
382
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
383
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
384
+ "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, "
385
+ "PRED_enable_input_d;\n\t"
386
+ "}"
387
+ :
388
+ : "r"(__d_tmem),
389
+ "l"(__a_desc),
390
+ "l"(__b_desc),
391
+ "r"(__idesc),
392
+ "r"(__disable_output_lane[0]),
393
+ "r"(__disable_output_lane[1]),
394
+ "r"(__disable_output_lane[2]),
395
+ "r"(__disable_output_lane[3]),
396
+ "r"(__disable_output_lane[4]),
397
+ "r"(__disable_output_lane[5]),
398
+ "r"(__disable_output_lane[6]),
399
+ "r"(__disable_output_lane[7]),
400
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
401
+ : "memory");
402
+ }
403
+ else if constexpr (__kind == kind_f8f6f4)
404
+ {
405
+ asm volatile(
406
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
407
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
408
+ "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], %1, %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, "
409
+ "PRED_enable_input_d;\n\t"
410
+ "}"
411
+ :
412
+ : "r"(__d_tmem),
413
+ "l"(__a_desc),
414
+ "l"(__b_desc),
415
+ "r"(__idesc),
416
+ "r"(__disable_output_lane[0]),
417
+ "r"(__disable_output_lane[1]),
418
+ "r"(__disable_output_lane[2]),
419
+ "r"(__disable_output_lane[3]),
420
+ "r"(__disable_output_lane[4]),
421
+ "r"(__disable_output_lane[5]),
422
+ "r"(__disable_output_lane[6]),
423
+ "r"(__disable_output_lane[7]),
424
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
425
+ : "memory");
426
+ }
427
+ # elif _CCCL_CUDA_COMPILER(NVHPC) \
428
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
429
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
430
+ if constexpr (__kind == kind_i8)
431
+ {
432
+ asm volatile(
433
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
434
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
435
+ "tcgen05.mma.cta_group::2.kind::i8 [%0], %1, %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, PRED_enable_input_d;\n\t"
436
+ "}"
437
+ :
438
+ : "r"(__d_tmem),
439
+ "l"(__a_desc),
440
+ "l"(__b_desc),
441
+ "r"(__idesc),
442
+ "r"(__disable_output_lane[0]),
443
+ "r"(__disable_output_lane[1]),
444
+ "r"(__disable_output_lane[2]),
445
+ "r"(__disable_output_lane[3]),
446
+ "r"(__disable_output_lane[4]),
447
+ "r"(__disable_output_lane[5]),
448
+ "r"(__disable_output_lane[6]),
449
+ "r"(__disable_output_lane[7]),
450
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
451
+ : "memory");
452
+ }
453
+
454
+ # else
455
+ // Unsupported architectures will have a linker error with a semi-decent error message
456
+ __cuda_ptx_tcgen05_mma_is_only_supported_on_SM_100a_100f_103a_103f_110a_110f_depending_on_the_variant__();
457
+ # endif
458
+ }
459
+ #endif // __cccl_ptx_isa >= 860
460
+
461
+ /*
462
+ // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a,
463
+ SM_100f, SM_103a, SM_103f
464
+ // .kind = { .kind::f16, .kind::tf32 }
465
+ // .cta_group = { .cta_group::1, .cta_group::2 }
466
+ template <int N32, cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
467
+ __device__ static inline void tcgen05_mma(
468
+ cuda::ptx::kind_t<Kind> kind,
469
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
470
+ uint32_t d_tmem,
471
+ uint64_t a_desc,
472
+ uint64_t b_desc,
473
+ uint32_t idesc,
474
+ bool enable_input_d,
475
+ cuda::ptx::n32_t<N32> scale_input_d);
476
+ */
477
+ #if __cccl_ptx_isa >= 860
478
+ extern "C" _CCCL_DEVICE void
479
+ __cuda_ptx_tcgen05_mma_is_only_supported_on_SM_100a_100f_103a_103f_depending_on_the_variant__();
480
+ template <int _N32, ::cuda::ptx::dot_kind _Kind, ::cuda::ptx::dot_cta_group _Cta_Group>
481
+ _CCCL_DEVICE static inline void tcgen05_mma(
482
+ ::cuda::ptx::kind_t<_Kind> __kind,
483
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
484
+ ::cuda::std::uint32_t __d_tmem,
485
+ ::cuda::std::uint64_t __a_desc,
486
+ ::cuda::std::uint64_t __b_desc,
487
+ ::cuda::std::uint32_t __idesc,
488
+ bool __enable_input_d,
489
+ ::cuda::ptx::n32_t<_N32> __scale_input_d)
490
+ {
491
+ static_assert(__kind == kind_f16 || __kind == kind_tf32, "");
492
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
493
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
494
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
495
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
496
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1000)) \
497
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1030))
498
+ if constexpr (__kind == kind_f16 && __cta_group == cta_group_1)
499
+ {
500
+ asm volatile(
501
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
502
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
503
+ "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t"
504
+ "}"
505
+ :
506
+ : "r"(__d_tmem),
507
+ "l"(__a_desc),
508
+ "l"(__b_desc),
509
+ "r"(__idesc),
510
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d)),
511
+ "n"(__scale_input_d.value)
512
+ : "memory");
513
+ }
514
+ else if constexpr (__kind == kind_f16 && __cta_group == cta_group_2)
515
+ {
516
+ asm volatile(
517
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
518
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
519
+ "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t"
520
+ "}"
521
+ :
522
+ : "r"(__d_tmem),
523
+ "l"(__a_desc),
524
+ "l"(__b_desc),
525
+ "r"(__idesc),
526
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d)),
527
+ "n"(__scale_input_d.value)
528
+ : "memory");
529
+ }
530
+ else if constexpr (__kind == kind_tf32 && __cta_group == cta_group_1)
531
+ {
532
+ asm volatile(
533
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
534
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
535
+ "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t"
536
+ "}"
537
+ :
538
+ : "r"(__d_tmem),
539
+ "l"(__a_desc),
540
+ "l"(__b_desc),
541
+ "r"(__idesc),
542
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d)),
543
+ "n"(__scale_input_d.value)
544
+ : "memory");
545
+ }
546
+ else if constexpr (__kind == kind_tf32 && __cta_group == cta_group_2)
547
+ {
548
+ asm volatile(
549
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
550
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
551
+ "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t"
552
+ "}"
553
+ :
554
+ : "r"(__d_tmem),
555
+ "l"(__a_desc),
556
+ "l"(__b_desc),
557
+ "r"(__idesc),
558
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d)),
559
+ "n"(__scale_input_d.value)
560
+ : "memory");
561
+ }
562
+
563
+ # else
564
+ // Unsupported architectures will have a linker error with a semi-decent error message
565
+ __cuda_ptx_tcgen05_mma_is_only_supported_on_SM_100a_100f_103a_103f_depending_on_the_variant__();
566
+ # endif
567
+ }
568
+ #endif // __cccl_ptx_isa >= 860
569
+
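Editor's note (not part of the shipped header): a minimal usage sketch for the overload declared above. It assumes the `cuda::ptx` tag constants referenced by the static_asserts (`kind_f16`, `cta_group_1`) are the public spellings, and that a valid tensor-memory address (`d_tmem`), shared-memory matrix descriptors (`a_desc`, `b_desc`) and instruction descriptor (`idesc`) are produced elsewhere; the values here are placeholders. Build for a supported target (e.g. `nvcc -arch=sm_100a`), otherwise the call resolves to the linker-error stub above.

#include <cuda/ptx>
#include <cstdint>

__global__ void mma_f16_scaled_sketch(std::uint32_t d_tmem,
                                      std::uint64_t a_desc,
                                      std::uint64_t b_desc,
                                      std::uint32_t idesc)
{
  if (threadIdx.x == 0) // issued from a single thread
  {
    cuda::ptx::tcgen05_mma(
      cuda::ptx::kind_f16,
      cuda::ptx::cta_group_1,
      d_tmem,
      a_desc,
      b_desc,
      idesc,
      /* enable_input_d */ true,      // true: D = A*B + D
      cuda::ptx::n32_t<0>{});         // scale_input_d: compile-time immediate (placeholder value)
  }
}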
570
+ /*
571
+ // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_100f, SM_103a,
572
+ SM_103f, SM_110a, SM_110f
573
+ // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
574
+ // .cta_group = { .cta_group::1, .cta_group::2 }
575
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
576
+ __device__ static inline void tcgen05_mma(
577
+ cuda::ptx::kind_t<Kind> kind,
578
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
579
+ uint32_t d_tmem,
580
+ uint64_t a_desc,
581
+ uint64_t b_desc,
582
+ uint32_t idesc,
583
+ bool enable_input_d);
584
+ */
585
+ #if __cccl_ptx_isa >= 860
586
+ extern "C" _CCCL_DEVICE void
587
+ __cuda_ptx_tcgen05_mma_is_only_supported_on_SM_100a_100f_103a_103f_110a_110f_depending_on_the_variant__();
588
+ template <::cuda::ptx::dot_kind _Kind, ::cuda::ptx::dot_cta_group _Cta_Group>
589
+ _CCCL_DEVICE static inline void tcgen05_mma(
590
+ ::cuda::ptx::kind_t<_Kind> __kind,
591
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
592
+ ::cuda::std::uint32_t __d_tmem,
593
+ ::cuda::std::uint64_t __a_desc,
594
+ ::cuda::std::uint64_t __b_desc,
595
+ ::cuda::std::uint32_t __idesc,
596
+ bool __enable_input_d)
597
+ {
598
+ static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, "");
599
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
600
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
601
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
602
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
603
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100))) \
604
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1000)) \
605
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1030)) \
606
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1100))
607
+ if constexpr (__kind == kind_f16 && __cta_group == cta_group_1)
608
+ {
609
+ asm volatile(
610
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
611
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
612
+ "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d;\n\t"
613
+ "}"
614
+ :
615
+ : "r"(__d_tmem),
616
+ "l"(__a_desc),
617
+ "l"(__b_desc),
618
+ "r"(__idesc),
619
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
620
+ : "memory");
621
+ }
622
+ else if constexpr (__kind == kind_f16 && __cta_group == cta_group_2)
623
+ {
624
+ asm volatile(
625
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
626
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
627
+ "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d;\n\t"
628
+ "}"
629
+ :
630
+ : "r"(__d_tmem),
631
+ "l"(__a_desc),
632
+ "l"(__b_desc),
633
+ "r"(__idesc),
634
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
635
+ : "memory");
636
+ }
637
+ else if constexpr (__kind == kind_tf32 && __cta_group == cta_group_1)
638
+ {
639
+ asm volatile(
640
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
641
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
642
+ "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d;\n\t"
643
+ "}"
644
+ :
645
+ : "r"(__d_tmem),
646
+ "l"(__a_desc),
647
+ "l"(__b_desc),
648
+ "r"(__idesc),
649
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
650
+ : "memory");
651
+ }
652
+ else if constexpr (__kind == kind_tf32 && __cta_group == cta_group_2)
653
+ {
654
+ asm volatile(
655
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
656
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
657
+ "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d;\n\t"
658
+ "}"
659
+ :
660
+ : "r"(__d_tmem),
661
+ "l"(__a_desc),
662
+ "l"(__b_desc),
663
+ "r"(__idesc),
664
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
665
+ : "memory");
666
+ }
667
+ else if constexpr (__kind == kind_f8f6f4 && __cta_group == cta_group_1)
668
+ {
669
+ asm volatile(
670
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
671
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
672
+ "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], %1, %2, %3, PRED_enable_input_d;\n\t"
673
+ "}"
674
+ :
675
+ : "r"(__d_tmem),
676
+ "l"(__a_desc),
677
+ "l"(__b_desc),
678
+ "r"(__idesc),
679
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
680
+ : "memory");
681
+ }
682
+ else if constexpr (__kind == kind_f8f6f4 && __cta_group == cta_group_2)
683
+ {
684
+ asm volatile(
685
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
686
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
687
+ "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], %1, %2, %3, PRED_enable_input_d;\n\t"
688
+ "}"
689
+ :
690
+ : "r"(__d_tmem),
691
+ "l"(__a_desc),
692
+ "l"(__b_desc),
693
+ "r"(__idesc),
694
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
695
+ : "memory");
696
+ }
697
+ # elif _CCCL_CUDA_COMPILER(NVHPC) \
698
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
699
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
700
+ if constexpr (__kind == kind_i8 && __cta_group == cta_group_1)
701
+ {
702
+ asm volatile(
703
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
704
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
705
+ "tcgen05.mma.cta_group::1.kind::i8 [%0], %1, %2, %3, PRED_enable_input_d;\n\t"
706
+ "}"
707
+ :
708
+ : "r"(__d_tmem),
709
+ "l"(__a_desc),
710
+ "l"(__b_desc),
711
+ "r"(__idesc),
712
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
713
+ : "memory");
714
+ }
715
+ else if constexpr (__kind == kind_i8 && __cta_group == cta_group_2)
716
+ {
717
+ asm volatile(
718
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
719
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
720
+ "tcgen05.mma.cta_group::2.kind::i8 [%0], %1, %2, %3, PRED_enable_input_d;\n\t"
721
+ "}"
722
+ :
723
+ : "r"(__d_tmem),
724
+ "l"(__a_desc),
725
+ "l"(__b_desc),
726
+ "r"(__idesc),
727
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
728
+ : "memory");
729
+ }
730
+
731
+ # else
732
+ // Unsupported architectures will have a linker error with a semi-decent error message
733
+ __cuda_ptx_tcgen05_mma_is_only_supported_on_SM_100a_100f_103a_103f_110a_110f_depending_on_the_variant__();
734
+ # endif
735
+ }
736
+ #endif // __cccl_ptx_isa >= 860
737
+
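Editor's note (not part of the shipped header): a sketch for the overload above, which drops `scale_input_d` and additionally accepts `.kind::f8f6f4` and `.kind::i8`. It assumes the `cuda::ptx::kind_f8f6f4` tag constant named in the static_assert and placeholder descriptor arguments supplied by the caller.

#include <cuda/ptx>
#include <cstdint>

__global__ void mma_f8f6f4_sketch(std::uint32_t d_tmem,
                                  std::uint64_t a_desc,
                                  std::uint64_t b_desc,
                                  std::uint32_t idesc,
                                  bool accumulate)
{
  if (threadIdx.x == 0)
  {
    cuda::ptx::tcgen05_mma(
      cuda::ptx::kind_f8f6f4,   // also: kind_f16, kind_tf32, kind_i8 (i8 needs the narrower guard above)
      cuda::ptx::cta_group_1,
      d_tmem, a_desc, b_desc, idesc,
      accumulate);              // enable_input_d: true -> D = A*B + D, false -> D = A*B
  }
}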
738
+ /*
739
+ // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; //
740
+ PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f
741
+ // .kind = { .kind::f16, .kind::tf32 }
742
+ // .cta_group = { .cta_group::1 }
743
+ template <int N32, cuda::ptx::dot_kind Kind>
744
+ __device__ static inline void tcgen05_mma_tmem_a(
745
+ cuda::ptx::kind_t<Kind> kind,
746
+ cuda::ptx::cta_group_1_t,
747
+ uint32_t d_tmem,
748
+ uint32_t a_tmem,
749
+ uint64_t b_desc,
750
+ uint32_t idesc,
751
+ const uint32_t (&disable_output_lane)[4],
752
+ bool enable_input_d,
753
+ cuda::ptx::n32_t<N32> scale_input_d);
754
+ */
755
+ #if __cccl_ptx_isa >= 860
756
+ extern "C" _CCCL_DEVICE void
757
+ __cuda_ptx_tcgen05_mma_tmem_a_is_only_supported_on_SM_100a_100f_103a_103f_depending_on_the_variant__();
758
+ template <int _N32, ::cuda::ptx::dot_kind _Kind>
759
+ _CCCL_DEVICE static inline void tcgen05_mma_tmem_a(
760
+ ::cuda::ptx::kind_t<_Kind> __kind,
761
+ ::cuda::ptx::cta_group_1_t,
762
+ ::cuda::std::uint32_t __d_tmem,
763
+ ::cuda::std::uint32_t __a_tmem,
764
+ ::cuda::std::uint64_t __b_desc,
765
+ ::cuda::std::uint32_t __idesc,
766
+ const ::cuda::std::uint32_t (&__disable_output_lane)[4],
767
+ bool __enable_input_d,
768
+ ::cuda::ptx::n32_t<_N32> __scale_input_d)
769
+ {
770
+ static_assert(__kind == kind_f16 || __kind == kind_tf32, "");
771
+ // __cta_group == cta_group_1 (due to parameter type constraint)
772
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
773
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
774
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
775
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1000)) \
776
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1030))
777
+ if constexpr (__kind == kind_f16)
778
+ {
779
+ asm volatile(
780
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
781
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
782
+ "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t"
783
+ "}"
784
+ :
785
+ : "r"(__d_tmem),
786
+ "r"(__a_tmem),
787
+ "l"(__b_desc),
788
+ "r"(__idesc),
789
+ "r"(__disable_output_lane[0]),
790
+ "r"(__disable_output_lane[1]),
791
+ "r"(__disable_output_lane[2]),
792
+ "r"(__disable_output_lane[3]),
793
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d)),
794
+ "n"(__scale_input_d.value)
795
+ : "memory");
796
+ }
797
+ else if constexpr (__kind == kind_tf32)
798
+ {
799
+ asm volatile(
800
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
801
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
802
+ "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t"
803
+ "}"
804
+ :
805
+ : "r"(__d_tmem),
806
+ "r"(__a_tmem),
807
+ "l"(__b_desc),
808
+ "r"(__idesc),
809
+ "r"(__disable_output_lane[0]),
810
+ "r"(__disable_output_lane[1]),
811
+ "r"(__disable_output_lane[2]),
812
+ "r"(__disable_output_lane[3]),
813
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d)),
814
+ "n"(__scale_input_d.value)
815
+ : "memory");
816
+ }
817
+
818
+ # else
819
+ // Unsupported architectures will have a linker error with a semi-decent error message
820
+ __cuda_ptx_tcgen05_mma_tmem_a_is_only_supported_on_SM_100a_100f_103a_103f_depending_on_the_variant__();
821
+ # endif
822
+ }
823
+ #endif // __cccl_ptx_isa >= 860
824
+
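Editor's note (not part of the shipped header): a sketch for the `tcgen05_mma_tmem_a` overload above, where operand A lives in tensor memory and the fixed `cta_group_1_t` parameter selects the 4-word `disable_output_lane` mask. The all-zero mask is a placeholder assumed to leave every output lane enabled; `a_tmem`, `b_desc`, and `idesc` must be valid addresses/descriptors built elsewhere.

#include <cuda/ptx>
#include <cstdint>

__global__ void mma_tmem_a_masked_sketch(std::uint32_t d_tmem,
                                         std::uint32_t a_tmem,
                                         std::uint64_t b_desc,
                                         std::uint32_t idesc)
{
  if (threadIdx.x == 0)
  {
    const std::uint32_t disable_output_lane[4] = {0, 0, 0, 0}; // placeholder: no lanes masked off
    cuda::ptx::tcgen05_mma_tmem_a(
      cuda::ptx::kind_tf32,
      cuda::ptx::cta_group_1,     // this overload is fixed to cta_group::1 (4-word lane mask)
      d_tmem, a_tmem, b_desc, idesc,
      disable_output_lane,
      /* enable_input_d */ true,
      cuda::ptx::n32_t<0>{});     // scale_input_d immediate (placeholder value)
  }
}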
825
+ /*
826
+ // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; //
827
+ PTX ISA 86, SM_100a, SM_100f, SM_103a, SM_103f
828
+ // .kind = { .kind::f16, .kind::tf32 }
829
+ // .cta_group = { .cta_group::2 }
830
+ template <int N32, cuda::ptx::dot_kind Kind>
831
+ __device__ static inline void tcgen05_mma_tmem_a(
832
+ cuda::ptx::kind_t<Kind> kind,
833
+ cuda::ptx::cta_group_2_t,
834
+ uint32_t d_tmem,
835
+ uint32_t a_tmem,
836
+ uint64_t b_desc,
837
+ uint32_t idesc,
838
+ const uint32_t (&disable_output_lane)[8],
839
+ bool enable_input_d,
840
+ cuda::ptx::n32_t<N32> scale_input_d);
841
+ */
842
+ #if __cccl_ptx_isa >= 860
843
+ extern "C" _CCCL_DEVICE void
844
+ __cuda_ptx_tcgen05_mma_tmem_a_is_only_supported_on_SM_100a_100f_103a_103f_depending_on_the_variant__();
845
+ template <int _N32, ::cuda::ptx::dot_kind _Kind>
846
+ _CCCL_DEVICE static inline void tcgen05_mma_tmem_a(
847
+ ::cuda::ptx::kind_t<_Kind> __kind,
848
+ ::cuda::ptx::cta_group_2_t,
849
+ ::cuda::std::uint32_t __d_tmem,
850
+ ::cuda::std::uint32_t __a_tmem,
851
+ ::cuda::std::uint64_t __b_desc,
852
+ ::cuda::std::uint32_t __idesc,
853
+ const ::cuda::std::uint32_t (&__disable_output_lane)[8],
854
+ bool __enable_input_d,
855
+ ::cuda::ptx::n32_t<_N32> __scale_input_d)
856
+ {
857
+ static_assert(__kind == kind_f16 || __kind == kind_tf32, "");
858
+ // __cta_group == cta_group_2 (due to parameter type constraint)
859
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
860
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
861
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
862
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1000)) \
863
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1030))
864
+ if constexpr (__kind == kind_f16)
865
+ {
866
+ asm volatile(
867
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
868
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
869
+ "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, PRED_enable_input_d, "
870
+ "%13;\n\t"
871
+ "}"
872
+ :
873
+ : "r"(__d_tmem),
874
+ "r"(__a_tmem),
875
+ "l"(__b_desc),
876
+ "r"(__idesc),
877
+ "r"(__disable_output_lane[0]),
878
+ "r"(__disable_output_lane[1]),
879
+ "r"(__disable_output_lane[2]),
880
+ "r"(__disable_output_lane[3]),
881
+ "r"(__disable_output_lane[4]),
882
+ "r"(__disable_output_lane[5]),
883
+ "r"(__disable_output_lane[6]),
884
+ "r"(__disable_output_lane[7]),
885
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d)),
886
+ "n"(__scale_input_d.value)
887
+ : "memory");
888
+ }
889
+ else if constexpr (__kind == kind_tf32)
890
+ {
891
+ asm volatile(
892
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
893
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
894
+ "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, "
895
+ "PRED_enable_input_d, %13;\n\t"
896
+ "}"
897
+ :
898
+ : "r"(__d_tmem),
899
+ "r"(__a_tmem),
900
+ "l"(__b_desc),
901
+ "r"(__idesc),
902
+ "r"(__disable_output_lane[0]),
903
+ "r"(__disable_output_lane[1]),
904
+ "r"(__disable_output_lane[2]),
905
+ "r"(__disable_output_lane[3]),
906
+ "r"(__disable_output_lane[4]),
907
+ "r"(__disable_output_lane[5]),
908
+ "r"(__disable_output_lane[6]),
909
+ "r"(__disable_output_lane[7]),
910
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d)),
911
+ "n"(__scale_input_d.value)
912
+ : "memory");
913
+ }
914
+
915
+ # else
916
+ // Unsupported architectures will have a linker error with a semi-decent error message
917
+ __cuda_ptx_tcgen05_mma_tmem_a_is_only_supported_on_SM_100a_100f_103a_103f_depending_on_the_variant__();
918
+ # endif
919
+ }
920
+ #endif // __cccl_ptx_isa >= 860
921
+
922
+ /*
923
+ // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86,
924
+ SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
925
+ // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
926
+ // .cta_group = { .cta_group::1 }
927
+ template <cuda::ptx::dot_kind Kind>
928
+ __device__ static inline void tcgen05_mma_tmem_a(
929
+ cuda::ptx::kind_t<Kind> kind,
930
+ cuda::ptx::cta_group_1_t,
931
+ uint32_t d_tmem,
932
+ uint32_t a_tmem,
933
+ uint64_t b_desc,
934
+ uint32_t idesc,
935
+ const uint32_t (&disable_output_lane)[4],
936
+ bool enable_input_d);
937
+ */
938
+ #if __cccl_ptx_isa >= 860
939
+ extern "C" _CCCL_DEVICE void
940
+ __cuda_ptx_tcgen05_mma_tmem_a_is_only_supported_on_SM_100a_100f_103a_103f_110a_110f_depending_on_the_variant__();
941
+ template <::cuda::ptx::dot_kind _Kind>
942
+ _CCCL_DEVICE static inline void tcgen05_mma_tmem_a(
943
+ ::cuda::ptx::kind_t<_Kind> __kind,
944
+ ::cuda::ptx::cta_group_1_t,
945
+ ::cuda::std::uint32_t __d_tmem,
946
+ ::cuda::std::uint32_t __a_tmem,
947
+ ::cuda::std::uint64_t __b_desc,
948
+ ::cuda::std::uint32_t __idesc,
949
+ const ::cuda::std::uint32_t (&__disable_output_lane)[4],
950
+ bool __enable_input_d)
951
+ {
952
+ static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, "");
953
+ // __cta_group == cta_group_1 (due to parameter type constraint)
954
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
955
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
956
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
957
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100))) \
958
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1000)) \
959
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1030)) \
960
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1100))
961
+ if constexpr (__kind == kind_f16)
962
+ {
963
+ asm volatile(
964
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
965
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
966
+ "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t"
967
+ "}"
968
+ :
969
+ : "r"(__d_tmem),
970
+ "r"(__a_tmem),
971
+ "l"(__b_desc),
972
+ "r"(__idesc),
973
+ "r"(__disable_output_lane[0]),
974
+ "r"(__disable_output_lane[1]),
975
+ "r"(__disable_output_lane[2]),
976
+ "r"(__disable_output_lane[3]),
977
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
978
+ : "memory");
979
+ }
980
+ else if constexpr (__kind == kind_tf32)
981
+ {
982
+ asm volatile(
983
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
984
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
985
+ "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t"
986
+ "}"
987
+ :
988
+ : "r"(__d_tmem),
989
+ "r"(__a_tmem),
990
+ "l"(__b_desc),
991
+ "r"(__idesc),
992
+ "r"(__disable_output_lane[0]),
993
+ "r"(__disable_output_lane[1]),
994
+ "r"(__disable_output_lane[2]),
995
+ "r"(__disable_output_lane[3]),
996
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
997
+ : "memory");
998
+ }
999
+ else if constexpr (__kind == kind_f8f6f4)
1000
+ {
1001
+ asm volatile(
1002
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1003
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
1004
+ "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t"
1005
+ "}"
1006
+ :
1007
+ : "r"(__d_tmem),
1008
+ "r"(__a_tmem),
1009
+ "l"(__b_desc),
1010
+ "r"(__idesc),
1011
+ "r"(__disable_output_lane[0]),
1012
+ "r"(__disable_output_lane[1]),
1013
+ "r"(__disable_output_lane[2]),
1014
+ "r"(__disable_output_lane[3]),
1015
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1016
+ : "memory");
1017
+ }
1018
+ # elif _CCCL_CUDA_COMPILER(NVHPC) \
1019
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
1020
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
1021
+ if constexpr (__kind == kind_i8)
1022
+ {
1023
+ asm volatile(
1024
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1025
+ "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t"
1026
+ "tcgen05.mma.cta_group::1.kind::i8 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t"
1027
+ "}"
1028
+ :
1029
+ : "r"(__d_tmem),
1030
+ "r"(__a_tmem),
1031
+ "l"(__b_desc),
1032
+ "r"(__idesc),
1033
+ "r"(__disable_output_lane[0]),
1034
+ "r"(__disable_output_lane[1]),
1035
+ "r"(__disable_output_lane[2]),
1036
+ "r"(__disable_output_lane[3]),
1037
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1038
+ : "memory");
1039
+ }
1040
+
1041
+ # else
1042
+ // Unsupported architectures will have a linker error with a semi-decent error message
1043
+ __cuda_ptx_tcgen05_mma_tmem_a_is_only_supported_on_SM_100a_100f_103a_103f_110a_110f_depending_on_the_variant__();
1044
+ # endif
1045
+ }
1046
+ #endif // __cccl_ptx_isa >= 860
1047
+
1048
+ /*
1049
+ // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86,
1050
+ SM_100a, SM_100f, SM_103a, SM_103f, SM_110a, SM_110f
1051
+ // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
1052
+ // .cta_group = { .cta_group::2 }
1053
+ template <cuda::ptx::dot_kind Kind>
1054
+ __device__ static inline void tcgen05_mma_tmem_a(
1055
+ cuda::ptx::kind_t<Kind> kind,
1056
+ cuda::ptx::cta_group_2_t,
1057
+ uint32_t d_tmem,
1058
+ uint32_t a_tmem,
1059
+ uint64_t b_desc,
1060
+ uint32_t idesc,
1061
+ const uint32_t (&disable_output_lane)[8],
1062
+ bool enable_input_d);
1063
+ */
1064
+ #if __cccl_ptx_isa >= 860
1065
+ extern "C" _CCCL_DEVICE void
1066
+ __cuda_ptx_tcgen05_mma_tmem_a_is_only_supported_on_SM_100a_100f_103a_103f_110a_110f_depending_on_the_variant__();
1067
+ template <::cuda::ptx::dot_kind _Kind>
1068
+ _CCCL_DEVICE static inline void tcgen05_mma_tmem_a(
1069
+ ::cuda::ptx::kind_t<_Kind> __kind,
1070
+ ::cuda::ptx::cta_group_2_t,
1071
+ ::cuda::std::uint32_t __d_tmem,
1072
+ ::cuda::std::uint32_t __a_tmem,
1073
+ ::cuda::std::uint64_t __b_desc,
1074
+ ::cuda::std::uint32_t __idesc,
1075
+ const ::cuda::std::uint32_t (&__disable_output_lane)[8],
1076
+ bool __enable_input_d)
1077
+ {
1078
+ static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, "");
1079
+ // __cta_group == cta_group_2 (due to parameter type constraint)
1080
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
1081
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
1082
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
1083
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100))) \
1084
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1000)) \
1085
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1030)) \
1086
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1100))
1087
+ if constexpr (__kind == kind_f16)
1088
+ {
1089
+ asm volatile(
1090
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1091
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
1092
+ "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, "
1093
+ "PRED_enable_input_d;\n\t"
1094
+ "}"
1095
+ :
1096
+ : "r"(__d_tmem),
1097
+ "r"(__a_tmem),
1098
+ "l"(__b_desc),
1099
+ "r"(__idesc),
1100
+ "r"(__disable_output_lane[0]),
1101
+ "r"(__disable_output_lane[1]),
1102
+ "r"(__disable_output_lane[2]),
1103
+ "r"(__disable_output_lane[3]),
1104
+ "r"(__disable_output_lane[4]),
1105
+ "r"(__disable_output_lane[5]),
1106
+ "r"(__disable_output_lane[6]),
1107
+ "r"(__disable_output_lane[7]),
1108
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1109
+ : "memory");
1110
+ }
1111
+ else if constexpr (__kind == kind_tf32)
1112
+ {
1113
+ asm volatile(
1114
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1115
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
1116
+ "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, "
1117
+ "PRED_enable_input_d;\n\t"
1118
+ "}"
1119
+ :
1120
+ : "r"(__d_tmem),
1121
+ "r"(__a_tmem),
1122
+ "l"(__b_desc),
1123
+ "r"(__idesc),
1124
+ "r"(__disable_output_lane[0]),
1125
+ "r"(__disable_output_lane[1]),
1126
+ "r"(__disable_output_lane[2]),
1127
+ "r"(__disable_output_lane[3]),
1128
+ "r"(__disable_output_lane[4]),
1129
+ "r"(__disable_output_lane[5]),
1130
+ "r"(__disable_output_lane[6]),
1131
+ "r"(__disable_output_lane[7]),
1132
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1133
+ : "memory");
1134
+ }
1135
+ else if constexpr (__kind == kind_f8f6f4)
1136
+ {
1137
+ asm volatile(
1138
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1139
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
1140
+ "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], [%1], %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, "
1141
+ "PRED_enable_input_d;\n\t"
1142
+ "}"
1143
+ :
1144
+ : "r"(__d_tmem),
1145
+ "r"(__a_tmem),
1146
+ "l"(__b_desc),
1147
+ "r"(__idesc),
1148
+ "r"(__disable_output_lane[0]),
1149
+ "r"(__disable_output_lane[1]),
1150
+ "r"(__disable_output_lane[2]),
1151
+ "r"(__disable_output_lane[3]),
1152
+ "r"(__disable_output_lane[4]),
1153
+ "r"(__disable_output_lane[5]),
1154
+ "r"(__disable_output_lane[6]),
1155
+ "r"(__disable_output_lane[7]),
1156
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1157
+ : "memory");
1158
+ }
1159
+ # elif _CCCL_CUDA_COMPILER(NVHPC) \
1160
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
1161
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
1162
+ if constexpr (__kind == kind_i8)
1163
+ {
1164
+ asm volatile(
1165
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1166
+ "setp.ne.b32 PRED_enable_input_d, %12, 0;\n\t"
1167
+ "tcgen05.mma.cta_group::2.kind::i8 [%0], [%1], %2, %3, {%4, %5, %6, %7, %8, %9, %10, %11}, "
1168
+ "PRED_enable_input_d;\n\t"
1169
+ "}"
1170
+ :
1171
+ : "r"(__d_tmem),
1172
+ "r"(__a_tmem),
1173
+ "l"(__b_desc),
1174
+ "r"(__idesc),
1175
+ "r"(__disable_output_lane[0]),
1176
+ "r"(__disable_output_lane[1]),
1177
+ "r"(__disable_output_lane[2]),
1178
+ "r"(__disable_output_lane[3]),
1179
+ "r"(__disable_output_lane[4]),
1180
+ "r"(__disable_output_lane[5]),
1181
+ "r"(__disable_output_lane[6]),
1182
+ "r"(__disable_output_lane[7]),
1183
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1184
+ : "memory");
1185
+ }
1186
+
1187
+ # else
1188
+ // Unsupported architectures will have a linker error with a semi-decent error message
1189
+ __cuda_ptx_tcgen05_mma_tmem_a_is_only_supported_on_SM_100a_100f_103a_103f_110a_110f_depending_on_the_variant__();
1190
+ # endif
1191
+ }
1192
+ #endif // __cccl_ptx_isa >= 860
1193
+
1194
+ /*
1195
+ // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a,
1196
+ SM_100f, SM_103a, SM_103f
1197
+ // .kind = { .kind::f16, .kind::tf32 }
1198
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1199
+ template <int N32, cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
1200
+ __device__ static inline void tcgen05_mma_tmem_a(
1201
+ cuda::ptx::kind_t<Kind> kind,
1202
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1203
+ uint32_t d_tmem,
1204
+ uint32_t a_tmem,
1205
+ uint64_t b_desc,
1206
+ uint32_t idesc,
1207
+ bool enable_input_d,
1208
+ cuda::ptx::n32_t<N32> scale_input_d);
1209
+ */
1210
+ #if __cccl_ptx_isa >= 860
1211
+ extern "C" _CCCL_DEVICE void
1212
+ __cuda_ptx_tcgen05_mma_tmem_a_is_only_supported_on_SM_100a_100f_103a_103f_depending_on_the_variant__();
1213
+ template <int _N32, ::cuda::ptx::dot_kind _Kind, ::cuda::ptx::dot_cta_group _Cta_Group>
1214
+ _CCCL_DEVICE static inline void tcgen05_mma_tmem_a(
1215
+ ::cuda::ptx::kind_t<_Kind> __kind,
1216
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
1217
+ ::cuda::std::uint32_t __d_tmem,
1218
+ ::cuda::std::uint32_t __a_tmem,
1219
+ ::cuda::std::uint64_t __b_desc,
1220
+ ::cuda::std::uint32_t __idesc,
1221
+ bool __enable_input_d,
1222
+ ::cuda::ptx::n32_t<_N32> __scale_input_d)
1223
+ {
1224
+ static_assert(__kind == kind_f16 || __kind == kind_tf32, "");
1225
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1226
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
1227
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
1228
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
1229
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1000)) \
1230
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1030))
1231
+ if constexpr (__kind == kind_f16 && __cta_group == cta_group_1)
1232
+ {
1233
+ asm volatile(
1234
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1235
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1236
+ "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t"
1237
+ "}"
1238
+ :
1239
+ : "r"(__d_tmem),
1240
+ "r"(__a_tmem),
1241
+ "l"(__b_desc),
1242
+ "r"(__idesc),
1243
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d)),
1244
+ "n"(__scale_input_d.value)
1245
+ : "memory");
1246
+ }
1247
+ else if constexpr (__kind == kind_f16 && __cta_group == cta_group_2)
1248
+ {
1249
+ asm volatile(
1250
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1251
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1252
+ "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t"
1253
+ "}"
1254
+ :
1255
+ : "r"(__d_tmem),
1256
+ "r"(__a_tmem),
1257
+ "l"(__b_desc),
1258
+ "r"(__idesc),
1259
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d)),
1260
+ "n"(__scale_input_d.value)
1261
+ : "memory");
1262
+ }
1263
+ else if constexpr (__kind == kind_tf32 && __cta_group == cta_group_1)
1264
+ {
1265
+ asm volatile(
1266
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1267
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1268
+ "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t"
1269
+ "}"
1270
+ :
1271
+ : "r"(__d_tmem),
1272
+ "r"(__a_tmem),
1273
+ "l"(__b_desc),
1274
+ "r"(__idesc),
1275
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d)),
1276
+ "n"(__scale_input_d.value)
1277
+ : "memory");
1278
+ }
1279
+ else if constexpr (__kind == kind_tf32 && __cta_group == cta_group_2)
1280
+ {
1281
+ asm volatile(
1282
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1283
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1284
+ "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t"
1285
+ "}"
1286
+ :
1287
+ : "r"(__d_tmem),
1288
+ "r"(__a_tmem),
1289
+ "l"(__b_desc),
1290
+ "r"(__idesc),
1291
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d)),
1292
+ "n"(__scale_input_d.value)
1293
+ : "memory");
1294
+ }
1295
+
1296
+ # else
1297
+ // Unsupported architectures will have a linker error with a semi-decent error message
1298
+ __cuda_ptx_tcgen05_mma_tmem_a_is_only_supported_on_SM_100a_100f_103a_103f_depending_on_the_variant__();
1299
+ # endif
1300
+ }
1301
+ #endif // __cccl_ptx_isa >= 860
1302
+
1303
+ /*
1304
+ // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_100f,
1305
+ SM_103a, SM_103f, SM_110a, SM_110f
1306
+ // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
1307
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1308
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
1309
+ __device__ static inline void tcgen05_mma_tmem_a(
1310
+ cuda::ptx::kind_t<Kind> kind,
1311
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1312
+ uint32_t d_tmem,
1313
+ uint32_t a_tmem,
1314
+ uint64_t b_desc,
1315
+ uint32_t idesc,
1316
+ bool enable_input_d);
1317
+ */
1318
+ #if __cccl_ptx_isa >= 860
1319
+ extern "C" _CCCL_DEVICE void
1320
+ __cuda_ptx_tcgen05_mma_tmem_a_is_only_supported_on_SM_100a_100f_103a_103f_110a_110f_depending_on_the_variant__();
1321
+ template <::cuda::ptx::dot_kind _Kind, ::cuda::ptx::dot_cta_group _Cta_Group>
1322
+ _CCCL_DEVICE static inline void tcgen05_mma_tmem_a(
1323
+ ::cuda::ptx::kind_t<_Kind> __kind,
1324
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
1325
+ ::cuda::std::uint32_t __d_tmem,
1326
+ ::cuda::std::uint32_t __a_tmem,
1327
+ ::cuda::std::uint64_t __b_desc,
1328
+ ::cuda::std::uint32_t __idesc,
1329
+ bool __enable_input_d)
1330
+ {
1331
+ static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, "");
1332
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1333
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
1334
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
1335
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
1336
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100))) \
1337
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1000)) \
1338
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1030)) \
1339
+ || (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ == 1100))
1340
+ if constexpr (__kind == kind_f16 && __cta_group == cta_group_1)
1341
+ {
1342
+ asm volatile(
1343
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1344
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1345
+ "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t"
1346
+ "}"
1347
+ :
1348
+ : "r"(__d_tmem),
1349
+ "r"(__a_tmem),
1350
+ "l"(__b_desc),
1351
+ "r"(__idesc),
1352
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1353
+ : "memory");
1354
+ }
1355
+ else if constexpr (__kind == kind_f16 && __cta_group == cta_group_2)
1356
+ {
1357
+ asm volatile(
1358
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1359
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1360
+ "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t"
1361
+ "}"
1362
+ :
1363
+ : "r"(__d_tmem),
1364
+ "r"(__a_tmem),
1365
+ "l"(__b_desc),
1366
+ "r"(__idesc),
1367
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1368
+ : "memory");
1369
+ }
1370
+ else if constexpr (__kind == kind_tf32 && __cta_group == cta_group_1)
1371
+ {
1372
+ asm volatile(
1373
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1374
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1375
+ "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t"
1376
+ "}"
1377
+ :
1378
+ : "r"(__d_tmem),
1379
+ "r"(__a_tmem),
1380
+ "l"(__b_desc),
1381
+ "r"(__idesc),
1382
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1383
+ : "memory");
1384
+ }
1385
+ else if constexpr (__kind == kind_tf32 && __cta_group == cta_group_2)
1386
+ {
1387
+ asm volatile(
1388
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1389
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1390
+ "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t"
1391
+ "}"
1392
+ :
1393
+ : "r"(__d_tmem),
1394
+ "r"(__a_tmem),
1395
+ "l"(__b_desc),
1396
+ "r"(__idesc),
1397
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1398
+ : "memory");
1399
+ }
1400
+ else if constexpr (__kind == kind_f8f6f4 && __cta_group == cta_group_1)
1401
+ {
1402
+ asm volatile(
1403
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1404
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1405
+ "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t"
1406
+ "}"
1407
+ :
1408
+ : "r"(__d_tmem),
1409
+ "r"(__a_tmem),
1410
+ "l"(__b_desc),
1411
+ "r"(__idesc),
1412
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1413
+ : "memory");
1414
+ }
1415
+ else if constexpr (__kind == kind_f8f6f4 && __cta_group == cta_group_2)
1416
+ {
1417
+ asm volatile(
1418
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1419
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1420
+ "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t"
1421
+ "}"
1422
+ :
1423
+ : "r"(__d_tmem),
1424
+ "r"(__a_tmem),
1425
+ "l"(__b_desc),
1426
+ "r"(__idesc),
1427
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1428
+ : "memory");
1429
+ }
1430
+ # elif _CCCL_CUDA_COMPILER(NVHPC) \
1431
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
1432
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
1433
+ if constexpr (__kind == kind_i8 && __cta_group == cta_group_1)
1434
+ {
1435
+ asm volatile(
1436
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1437
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1438
+ "tcgen05.mma.cta_group::1.kind::i8 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t"
1439
+ "}"
1440
+ :
1441
+ : "r"(__d_tmem),
1442
+ "r"(__a_tmem),
1443
+ "l"(__b_desc),
1444
+ "r"(__idesc),
1445
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1446
+ : "memory");
1447
+ }
1448
+ else if constexpr (__kind == kind_i8 && __cta_group == cta_group_2)
1449
+ {
1450
+ asm volatile(
1451
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1452
+ "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
1453
+ "tcgen05.mma.cta_group::2.kind::i8 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t"
1454
+ "}"
1455
+ :
1456
+ : "r"(__d_tmem),
1457
+ "r"(__a_tmem),
1458
+ "l"(__b_desc),
1459
+ "r"(__idesc),
1460
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1461
+ : "memory");
1462
+ }
1463
+
1464
+ # else
1465
+ // Unsupported architectures will have a linker error with a semi-decent error message
1466
+ __cuda_ptx_tcgen05_mma_tmem_a_is_only_supported_on_SM_100a_100f_103a_103f_110a_110f_depending_on_the_variant__();
1467
+ # endif
1468
+ }
1469
+ #endif // __cccl_ptx_isa >= 860
1470
+
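Editor's note (not part of the shipped header): a sketch for the generic `tcgen05_mma_tmem_a` overload above, which takes no lane mask and no `scale_input_d` and is dispatched on the `cta_group` tag. The `kind_i8`/`cta_group_2` tags are assumed from the static_asserts; as the guard above shows, the i8 kind requires an architecture-specific target (e.g. sm_100a or sm_110a).

#include <cuda/ptx>
#include <cstdint>

__global__ void mma_tmem_a_plain_sketch(std::uint32_t d_tmem,
                                        std::uint32_t a_tmem,
                                        std::uint64_t b_desc,
                                        std::uint32_t idesc)
{
  if (threadIdx.x == 0)
  {
    cuda::ptx::tcgen05_mma_tmem_a(
      cuda::ptx::kind_i8,
      cuda::ptx::cta_group_2,
      d_tmem, a_tmem, b_desc, idesc,
      /* enable_input_d */ false); // D = A*B, no accumulation of prior D
  }
}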
1471
+ /*
1472
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem],
1473
+ enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
1474
+ // .kind = { .kind::mxf8f6f4 }
1475
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1476
+ template <cuda::ptx::dot_cta_group Cta_Group>
1477
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x(
1478
+ cuda::ptx::kind_mxf8f6f4_t,
1479
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1480
+ uint32_t d_tmem,
1481
+ uint64_t a_desc,
1482
+ uint64_t b_desc,
1483
+ uint32_t idesc,
1484
+ uint32_t scale_A_tmem,
1485
+ uint32_t scale_B_tmem,
1486
+ bool enable_input_d);
1487
+ */
1488
+ #if __cccl_ptx_isa >= 860
1489
+ extern "C" _CCCL_DEVICE void
1490
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
1491
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
1492
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x(
1493
+ ::cuda::ptx::kind_mxf8f6f4_t,
1494
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
1495
+ ::cuda::std::uint32_t __d_tmem,
1496
+ ::cuda::std::uint64_t __a_desc,
1497
+ ::cuda::std::uint64_t __b_desc,
1498
+ ::cuda::std::uint32_t __idesc,
1499
+ ::cuda::std::uint32_t __scale_A_tmem,
1500
+ ::cuda::std::uint32_t __scale_B_tmem,
1501
+ bool __enable_input_d)
1502
+ {
1503
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
1504
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1505
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
1506
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
1507
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
1508
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
1509
+ if constexpr (__cta_group == cta_group_1)
1510
+ {
1511
+ asm volatile(
1512
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1513
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1514
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], "
1515
+ "PRED_enable_input_d;\n\t"
1516
+ "}"
1517
+ :
1518
+ : "r"(__d_tmem),
1519
+ "l"(__a_desc),
1520
+ "l"(__b_desc),
1521
+ "r"(__idesc),
1522
+ "r"(__scale_A_tmem),
1523
+ "r"(__scale_B_tmem),
1524
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1525
+ : "memory");
1526
+ }
1527
+ else if constexpr (__cta_group == cta_group_2)
1528
+ {
1529
+ asm volatile(
1530
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1531
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1532
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], "
1533
+ "PRED_enable_input_d;\n\t"
1534
+ "}"
1535
+ :
1536
+ : "r"(__d_tmem),
1537
+ "l"(__a_desc),
1538
+ "l"(__b_desc),
1539
+ "r"(__idesc),
1540
+ "r"(__scale_A_tmem),
1541
+ "r"(__scale_B_tmem),
1542
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1543
+ : "memory");
1544
+ }
1545
+
1546
+ # else
1547
+ // Unsupported architectures will have a linker error with a semi-decent error message
1548
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
1549
+ # endif
1550
+ }
1551
+ #endif // __cccl_ptx_isa >= 860
1552
+
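Editor's note (not part of the shipped header): a sketch for the `.block_scale.scale_vec::1X` variant above. The tag type `cuda::ptx::kind_mxf8f6f4_t` is taken verbatim from the declaration; `scale_A_tmem` and `scale_B_tmem` are assumed to be tensor-memory addresses of the per-block scale factors, produced elsewhere. Note the guard above only accepts architecture-specific targets (sm_100a, sm_103a, sm_110a), not the family-specific "f" variants.

#include <cuda/ptx>
#include <cstdint>

__global__ void mma_mxf8f6f4_block_scale_sketch(std::uint32_t d_tmem,
                                                std::uint64_t a_desc,
                                                std::uint64_t b_desc,
                                                std::uint32_t idesc,
                                                std::uint32_t scale_A_tmem,
                                                std::uint32_t scale_B_tmem)
{
  if (threadIdx.x == 0)
  {
    cuda::ptx::tcgen05_mma_block_scale_vec_1x(
      cuda::ptx::kind_mxf8f6f4_t{}, // tag type spelled exactly as in the declaration above
      cuda::ptx::cta_group_1,
      d_tmem, a_desc, b_desc, idesc,
      scale_A_tmem, scale_B_tmem,
      /* enable_input_d */ true);
  }
}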
1553
+ /*
1554
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem],
1555
+ enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
1556
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
1557
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1558
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
1559
+ __device__ static inline void tcgen05_mma_block_scale_vec_2x(
1560
+ cuda::ptx::kind_t<Kind> kind,
1561
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1562
+ uint32_t d_tmem,
1563
+ uint64_t a_desc,
1564
+ uint64_t b_desc,
1565
+ uint32_t idesc,
1566
+ uint32_t scale_A_tmem,
1567
+ uint32_t scale_B_tmem,
1568
+ bool enable_input_d);
1569
+ */
1570
+ #if __cccl_ptx_isa >= 860
1571
+ extern "C" _CCCL_DEVICE void
1572
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
1573
+ template <::cuda::ptx::dot_kind _Kind, ::cuda::ptx::dot_cta_group _Cta_Group>
1574
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x(
1575
+ ::cuda::ptx::kind_t<_Kind> __kind,
1576
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
1577
+ ::cuda::std::uint32_t __d_tmem,
1578
+ ::cuda::std::uint64_t __a_desc,
1579
+ ::cuda::std::uint64_t __b_desc,
1580
+ ::cuda::std::uint32_t __idesc,
1581
+ ::cuda::std::uint32_t __scale_A_tmem,
1582
+ ::cuda::std::uint32_t __scale_B_tmem,
1583
+ bool __enable_input_d)
1584
+ {
1585
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
1586
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1587
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
1588
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
1589
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
1590
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
1591
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
1592
+ {
1593
+ asm volatile(
1594
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1595
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1596
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], "
1597
+ "PRED_enable_input_d;\n\t"
1598
+ "}"
1599
+ :
1600
+ : "r"(__d_tmem),
1601
+ "l"(__a_desc),
1602
+ "l"(__b_desc),
1603
+ "r"(__idesc),
1604
+ "r"(__scale_A_tmem),
1605
+ "r"(__scale_B_tmem),
1606
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1607
+ : "memory");
1608
+ }
1609
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
1610
+ {
1611
+ asm volatile(
1612
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1613
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1614
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], "
1615
+ "PRED_enable_input_d;\n\t"
1616
+ "}"
1617
+ :
1618
+ : "r"(__d_tmem),
1619
+ "l"(__a_desc),
1620
+ "l"(__b_desc),
1621
+ "r"(__idesc),
1622
+ "r"(__scale_A_tmem),
1623
+ "r"(__scale_B_tmem),
1624
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1625
+ : "memory");
1626
+ }
1627
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
1628
+ {
1629
+ asm volatile(
1630
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1631
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1632
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], "
1633
+ "PRED_enable_input_d;\n\t"
1634
+ "}"
1635
+ :
1636
+ : "r"(__d_tmem),
1637
+ "l"(__a_desc),
1638
+ "l"(__b_desc),
1639
+ "r"(__idesc),
1640
+ "r"(__scale_A_tmem),
1641
+ "r"(__scale_B_tmem),
1642
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1643
+ : "memory");
1644
+ }
1645
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
1646
+ {
1647
+ asm volatile(
1648
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1649
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1650
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], "
1651
+ "PRED_enable_input_d;\n\t"
1652
+ "}"
1653
+ :
1654
+ : "r"(__d_tmem),
1655
+ "l"(__a_desc),
1656
+ "l"(__b_desc),
1657
+ "r"(__idesc),
1658
+ "r"(__scale_A_tmem),
1659
+ "r"(__scale_B_tmem),
1660
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1661
+ : "memory");
1662
+ }
1663
+
1664
+ # else
1665
+ // Unsupported architectures will have a linker error with a semi-decent error message
1666
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
1667
+ # endif
1668
+ }
1669
+ #endif // __cccl_ptx_isa >= 860
1670
+
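Editor's note (not part of the shipped header): a sketch for the `.scale_vec::2X` variant above, which accepts `.kind::mxf4` or `.kind::mxf4nvf4`. The `cuda::ptx::kind_mxf4` tag constant is assumed from the static_assert; descriptors and scale-factor tensor-memory addresses are placeholders supplied by the caller.

#include <cuda/ptx>
#include <cstdint>

__global__ void mma_mxf4_block_scale_sketch(std::uint32_t d_tmem,
                                            std::uint64_t a_desc,
                                            std::uint64_t b_desc,
                                            std::uint32_t idesc,
                                            std::uint32_t scale_A_tmem,
                                            std::uint32_t scale_B_tmem)
{
  if (threadIdx.x == 0)
  {
    cuda::ptx::tcgen05_mma_block_scale_vec_2x(
      cuda::ptx::kind_mxf4,     // or kind_mxf4nvf4, per the static_assert above
      cuda::ptx::cta_group_1,
      d_tmem, a_desc, b_desc, idesc,
      scale_A_tmem, scale_B_tmem,
      /* enable_input_d */ true);
  }
}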
1671
+ /*
1672
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem],
1673
+ enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
1674
+ // .kind = { .kind::mxf4nvf4 }
1675
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1676
+ template <cuda::ptx::dot_cta_group Cta_Group>
1677
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x(
1678
+ cuda::ptx::kind_mxf4nvf4_t,
1679
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1680
+ uint32_t d_tmem,
1681
+ uint64_t a_desc,
1682
+ uint64_t b_desc,
1683
+ uint32_t idesc,
1684
+ uint32_t scale_A_tmem,
1685
+ uint32_t scale_B_tmem,
1686
+ bool enable_input_d);
1687
+ */
1688
+ #if __cccl_ptx_isa >= 860
1689
+ extern "C" _CCCL_DEVICE void
1690
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
1691
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
1692
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x(
1693
+ ::cuda::ptx::kind_mxf4nvf4_t,
1694
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
1695
+ ::cuda::std::uint32_t __d_tmem,
1696
+ ::cuda::std::uint64_t __a_desc,
1697
+ ::cuda::std::uint64_t __b_desc,
1698
+ ::cuda::std::uint32_t __idesc,
1699
+ ::cuda::std::uint32_t __scale_A_tmem,
1700
+ ::cuda::std::uint32_t __scale_B_tmem,
1701
+ bool __enable_input_d)
1702
+ {
1703
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
1704
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1705
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
1706
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
1707
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
1708
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
1709
+ if constexpr (__cta_group == cta_group_1)
1710
+ {
1711
+ asm volatile(
1712
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1713
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1714
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], "
1715
+ "PRED_enable_input_d;\n\t"
1716
+ "}"
1717
+ :
1718
+ : "r"(__d_tmem),
1719
+ "l"(__a_desc),
1720
+ "l"(__b_desc),
1721
+ "r"(__idesc),
1722
+ "r"(__scale_A_tmem),
1723
+ "r"(__scale_B_tmem),
1724
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1725
+ : "memory");
1726
+ }
1727
+ else if constexpr (__cta_group == cta_group_2)
1728
+ {
1729
+ asm volatile(
1730
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1731
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1732
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], "
1733
+ "PRED_enable_input_d;\n\t"
1734
+ "}"
1735
+ :
1736
+ : "r"(__d_tmem),
1737
+ "l"(__a_desc),
1738
+ "l"(__b_desc),
1739
+ "r"(__idesc),
1740
+ "r"(__scale_A_tmem),
1741
+ "r"(__scale_B_tmem),
1742
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1743
+ : "memory");
1744
+ }
1745
+
1746
+ # else
1747
+ // Unsupported architectures will have a linker error with a semi-decent error message
1748
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
1749
+ # endif
1750
+ }
1751
+ #endif // __cccl_ptx_isa >= 860
1752
+
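Editor's note (not part of the shipped header): a sketch for the `.scale_vec::4X` variant above, which is restricted to `.kind::mxf4nvf4`. The tag type `cuda::ptx::kind_mxf4nvf4_t` is taken verbatim from the declaration; all descriptor and scale arguments are placeholders built elsewhere by the caller.

#include <cuda/ptx>
#include <cstdint>

__global__ void mma_mxf4nvf4_block_scale_4x_sketch(std::uint32_t d_tmem,
                                                   std::uint64_t a_desc,
                                                   std::uint64_t b_desc,
                                                   std::uint32_t idesc,
                                                   std::uint32_t scale_A_tmem,
                                                   std::uint32_t scale_B_tmem)
{
  if (threadIdx.x == 0)
  {
    cuda::ptx::tcgen05_mma_block_scale_vec_4x(
      cuda::ptx::kind_mxf4nvf4_t{}, // tag type spelled exactly as in the declaration above
      cuda::ptx::cta_group_2,
      d_tmem, a_desc, b_desc, idesc,
      scale_A_tmem, scale_B_tmem,
      /* enable_input_d */ false);  // D = A*B, no accumulation of prior D
  }
}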
1753
+ /*
1754
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem],
1755
+ enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
1756
+ // .kind = { .kind::mxf8f6f4 }
1757
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1758
+ template <cuda::ptx::dot_cta_group Cta_Group>
1759
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a(
1760
+ cuda::ptx::kind_mxf8f6f4_t,
1761
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1762
+ uint32_t d_tmem,
1763
+ uint64_t a_desc,
1764
+ uint64_t b_desc,
1765
+ uint32_t idesc,
1766
+ uint32_t scale_A_tmem,
1767
+ uint32_t scale_B_tmem,
1768
+ bool enable_input_d);
1769
+ */
1770
+ #if __cccl_ptx_isa >= 860
1771
+ extern "C" _CCCL_DEVICE void
1772
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
1773
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
1774
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a(
1775
+ ::cuda::ptx::kind_mxf8f6f4_t,
1776
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
1777
+ ::cuda::std::uint32_t __d_tmem,
1778
+ ::cuda::std::uint64_t __a_desc,
1779
+ ::cuda::std::uint64_t __b_desc,
1780
+ ::cuda::std::uint32_t __idesc,
1781
+ ::cuda::std::uint32_t __scale_A_tmem,
1782
+ ::cuda::std::uint32_t __scale_B_tmem,
1783
+ bool __enable_input_d)
1784
+ {
1785
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
1786
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1787
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
1788
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
1789
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
1790
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
1791
+ if constexpr (__cta_group == cta_group_1)
1792
+ {
1793
+ asm volatile(
1794
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1795
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1796
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], "
1797
+ "PRED_enable_input_d;\n\t"
1798
+ "}"
1799
+ :
1800
+ : "r"(__d_tmem),
1801
+ "l"(__a_desc),
1802
+ "l"(__b_desc),
1803
+ "r"(__idesc),
1804
+ "r"(__scale_A_tmem),
1805
+ "r"(__scale_B_tmem),
1806
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1807
+ : "memory");
1808
+ }
1809
+ else if constexpr (__cta_group == cta_group_2)
1810
+ {
1811
+ asm volatile(
1812
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1813
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1814
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], "
1815
+ "PRED_enable_input_d;\n\t"
1816
+ "}"
1817
+ :
1818
+ : "r"(__d_tmem),
1819
+ "l"(__a_desc),
1820
+ "l"(__b_desc),
1821
+ "r"(__idesc),
1822
+ "r"(__scale_A_tmem),
1823
+ "r"(__scale_B_tmem),
1824
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1825
+ : "memory");
1826
+ }
1827
+
1828
+ # else
1829
+ // Unsupported architectures will have a linker error with a semi-decent error message
1830
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
1831
+ # endif
1832
+ }
1833
+ #endif // __cccl_ptx_isa >= 860
1834
+
1835
+ /*
1836
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem],
1837
+ enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
1838
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
1839
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1840
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
1841
+ __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a(
1842
+ cuda::ptx::kind_t<Kind> kind,
1843
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1844
+ uint32_t d_tmem,
1845
+ uint64_t a_desc,
1846
+ uint64_t b_desc,
1847
+ uint32_t idesc,
1848
+ uint32_t scale_A_tmem,
1849
+ uint32_t scale_B_tmem,
1850
+ bool enable_input_d);
1851
+ */
1852
+ #if __cccl_ptx_isa >= 860
1853
+ extern "C" _CCCL_DEVICE void
1854
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
1855
+ template <::cuda::ptx::dot_kind _Kind, ::cuda::ptx::dot_cta_group _Cta_Group>
1856
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a(
1857
+ ::cuda::ptx::kind_t<_Kind> __kind,
1858
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
1859
+ ::cuda::std::uint32_t __d_tmem,
1860
+ ::cuda::std::uint64_t __a_desc,
1861
+ ::cuda::std::uint64_t __b_desc,
1862
+ ::cuda::std::uint32_t __idesc,
1863
+ ::cuda::std::uint32_t __scale_A_tmem,
1864
+ ::cuda::std::uint32_t __scale_B_tmem,
1865
+ bool __enable_input_d)
1866
+ {
1867
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
1868
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1869
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
1870
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
1871
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
1872
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
1873
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
1874
+ {
1875
+ asm volatile(
1876
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1877
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1878
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], "
1879
+ "PRED_enable_input_d;\n\t"
1880
+ "}"
1881
+ :
1882
+ : "r"(__d_tmem),
1883
+ "l"(__a_desc),
1884
+ "l"(__b_desc),
1885
+ "r"(__idesc),
1886
+ "r"(__scale_A_tmem),
1887
+ "r"(__scale_B_tmem),
1888
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1889
+ : "memory");
1890
+ }
1891
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
1892
+ {
1893
+ asm volatile(
1894
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1895
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1896
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], "
1897
+ "PRED_enable_input_d;\n\t"
1898
+ "}"
1899
+ :
1900
+ : "r"(__d_tmem),
1901
+ "l"(__a_desc),
1902
+ "l"(__b_desc),
1903
+ "r"(__idesc),
1904
+ "r"(__scale_A_tmem),
1905
+ "r"(__scale_B_tmem),
1906
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1907
+ : "memory");
1908
+ }
1909
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
1910
+ {
1911
+ asm volatile(
1912
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1913
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1914
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], "
1915
+ "PRED_enable_input_d;\n\t"
1916
+ "}"
1917
+ :
1918
+ : "r"(__d_tmem),
1919
+ "l"(__a_desc),
1920
+ "l"(__b_desc),
1921
+ "r"(__idesc),
1922
+ "r"(__scale_A_tmem),
1923
+ "r"(__scale_B_tmem),
1924
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1925
+ : "memory");
1926
+ }
1927
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
1928
+ {
1929
+ asm volatile(
1930
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1931
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1932
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], "
1933
+ "PRED_enable_input_d;\n\t"
1934
+ "}"
1935
+ :
1936
+ : "r"(__d_tmem),
1937
+ "l"(__a_desc),
1938
+ "l"(__b_desc),
1939
+ "r"(__idesc),
1940
+ "r"(__scale_A_tmem),
1941
+ "r"(__scale_B_tmem),
1942
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
1943
+ : "memory");
1944
+ }
1945
+
1946
+ # else
1947
+ // Unsupported architectures will have a linker error with a semi-decent error message
1948
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
1949
+ # endif
1950
+ }
1951
+ #endif // __cccl_ptx_isa >= 860
1952
+
1953
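For orientation, here is a minimal caller-side sketch of the wrapper defined above. It is not part of the package: the operand values (matrix descriptors, instruction descriptor, tensor-memory addresses) are placeholders the caller must build elsewhere, the tag constants cuda::ptx::kind_mxf4 and cuda::ptx::cta_group_1 are the ones the static_asserts above refer to, and the preprocessor guard is a simplified version of the header's own architecture check.

#include <cuda/ptx>
#include <cstdint>

__device__ void mma_mxf4_scale_vec_2x_sketch(
    std::uint32_t d_tmem,        // accumulator D in tensor memory
    std::uint64_t a_desc,        // matrix descriptor for A
    std::uint64_t b_desc,        // matrix descriptor for B
    std::uint32_t idesc,         // instruction descriptor
    std::uint32_t scale_A_tmem,  // scale factors for A in tensor memory
    std::uint32_t scale_B_tmem)  // scale factors for B in tensor memory
{
#if defined(__CUDA_ARCH_FEAT_SM100_ALL) || defined(__CUDA_ARCH_FEAT_SM103_ALL) \
  || defined(__CUDA_ARCH_FEAT_SM110_ALL)
  // Dispatches at compile time to the .kind::mxf4 / .cta_group::1 asm branch;
  // enable_input_d = true makes the MMA accumulate into the existing D.
  ::cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a(
      ::cuda::ptx::kind_mxf4,
      ::cuda::ptx::cta_group_1,
      d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem,
      /*enable_input_d=*/true);
#endif
}

A different tag pair (for example kind_mxf4nvf4 with cta_group_2) selects the corresponding if-constexpr branch; on targets outside the guard the call resolves to the undefined extern "C" symbol and the build fails at link time, as the comment in the header notes.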
+ /*
1954
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem],
1955
+ enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
1956
+ // .kind = { .kind::mxf4nvf4 }
1957
+ // .cta_group = { .cta_group::1, .cta_group::2 }
1958
+ template <cuda::ptx::dot_cta_group Cta_Group>
1959
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a(
1960
+ cuda::ptx::kind_mxf4nvf4_t,
1961
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
1962
+ uint32_t d_tmem,
1963
+ uint64_t a_desc,
1964
+ uint64_t b_desc,
1965
+ uint32_t idesc,
1966
+ uint32_t scale_A_tmem,
1967
+ uint32_t scale_B_tmem,
1968
+ bool enable_input_d);
1969
+ */
1970
+ #if __cccl_ptx_isa >= 860
1971
+ extern "C" _CCCL_DEVICE void
1972
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
1973
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
1974
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a(
1975
+ ::cuda::ptx::kind_mxf4nvf4_t,
1976
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
1977
+ ::cuda::std::uint32_t __d_tmem,
1978
+ ::cuda::std::uint64_t __a_desc,
1979
+ ::cuda::std::uint64_t __b_desc,
1980
+ ::cuda::std::uint32_t __idesc,
1981
+ ::cuda::std::uint32_t __scale_A_tmem,
1982
+ ::cuda::std::uint32_t __scale_B_tmem,
1983
+ bool __enable_input_d)
1984
+ {
1985
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
1986
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
1987
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
1988
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
1989
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
1990
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
1991
+ if constexpr (__cta_group == cta_group_1)
1992
+ {
1993
+ asm volatile(
1994
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
1995
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
1996
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], "
1997
+ "PRED_enable_input_d;\n\t"
1998
+ "}"
1999
+ :
2000
+ : "r"(__d_tmem),
2001
+ "l"(__a_desc),
2002
+ "l"(__b_desc),
2003
+ "r"(__idesc),
2004
+ "r"(__scale_A_tmem),
2005
+ "r"(__scale_B_tmem),
2006
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2007
+ : "memory");
2008
+ }
2009
+ else if constexpr (__cta_group == cta_group_2)
2010
+ {
2011
+ asm volatile(
2012
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2013
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2014
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], "
2015
+ "PRED_enable_input_d;\n\t"
2016
+ "}"
2017
+ :
2018
+ : "r"(__d_tmem),
2019
+ "l"(__a_desc),
2020
+ "l"(__b_desc),
2021
+ "r"(__idesc),
2022
+ "r"(__scale_A_tmem),
2023
+ "r"(__scale_B_tmem),
2024
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2025
+ : "memory");
2026
+ }
2027
+
2028
+ # else
2029
+ // Unsupported architectures will have a linker error with a semi-decent error message
2030
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2031
+ # endif
2032
+ }
2033
+ #endif // __cccl_ptx_isa >= 860
2034
+
2035
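The scale_vec::4X wrapper above illustrates the other overload style used throughout this header: when only one .kind is legal, the kind is fixed by the type of the first (tag) parameter, kind_mxf4nvf4_t, rather than by a template argument. A hedged call sketch, reusing the includes of the earlier sketch and assuming a kind_mxf4nvf4 tag constant of that type exists alongside the type alias:

__device__ void mma_mxf4nvf4_scale_vec_4x_sketch(
    std::uint32_t d_tmem, std::uint64_t a_desc, std::uint64_t b_desc,
    std::uint32_t idesc, std::uint32_t scale_A_tmem, std::uint32_t scale_B_tmem)
{
#if defined(__CUDA_ARCH_FEAT_SM100_ALL) || defined(__CUDA_ARCH_FEAT_SM103_ALL) \
  || defined(__CUDA_ARCH_FEAT_SM110_ALL)
  // Only the CTA group remains a compile-time choice; the tag's type pins .kind::mxf4nvf4.
  ::cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a(
      ::cuda::ptx::kind_mxf4nvf4,   // assumed tag constant
      ::cuda::ptx::cta_group_2,
      d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem,
      /*enable_input_d=*/false);    // false: overwrite D instead of accumulating
#endif
}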
+ /*
2036
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc,
2037
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
2038
+ // .kind = { .kind::mxf8f6f4 }
2039
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2040
+ template <cuda::ptx::dot_cta_group Cta_Group>
2041
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_fill(
2042
+ cuda::ptx::kind_mxf8f6f4_t,
2043
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2044
+ uint32_t d_tmem,
2045
+ uint64_t a_desc,
2046
+ uint64_t b_desc,
2047
+ uint32_t idesc,
2048
+ uint32_t scale_A_tmem,
2049
+ uint32_t scale_B_tmem,
2050
+ bool enable_input_d);
2051
+ */
2052
+ #if __cccl_ptx_isa >= 860
2053
+ extern "C" _CCCL_DEVICE void
2054
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_fill_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2055
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
2056
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_fill(
2057
+ ::cuda::ptx::kind_mxf8f6f4_t,
2058
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
2059
+ ::cuda::std::uint32_t __d_tmem,
2060
+ ::cuda::std::uint64_t __a_desc,
2061
+ ::cuda::std::uint64_t __b_desc,
2062
+ ::cuda::std::uint32_t __idesc,
2063
+ ::cuda::std::uint32_t __scale_A_tmem,
2064
+ ::cuda::std::uint32_t __scale_B_tmem,
2065
+ bool __enable_input_d)
2066
+ {
2067
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
2068
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2069
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
2070
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
2071
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
2072
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
2073
+ if constexpr (__cta_group == cta_group_1)
2074
+ {
2075
+ asm volatile(
2076
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2077
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2078
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], "
2079
+ "[%5], PRED_enable_input_d;\n\t"
2080
+ "}"
2081
+ :
2082
+ : "r"(__d_tmem),
2083
+ "l"(__a_desc),
2084
+ "l"(__b_desc),
2085
+ "r"(__idesc),
2086
+ "r"(__scale_A_tmem),
2087
+ "r"(__scale_B_tmem),
2088
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2089
+ : "memory");
2090
+ }
2091
+ else if constexpr (__cta_group == cta_group_2)
2092
+ {
2093
+ asm volatile(
2094
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2095
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2096
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], "
2097
+ "[%5], PRED_enable_input_d;\n\t"
2098
+ "}"
2099
+ :
2100
+ : "r"(__d_tmem),
2101
+ "l"(__a_desc),
2102
+ "l"(__b_desc),
2103
+ "r"(__idesc),
2104
+ "r"(__scale_A_tmem),
2105
+ "r"(__scale_B_tmem),
2106
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2107
+ : "memory");
2108
+ }
2109
+
2110
+ # else
2111
+ // Unsupported architectures will have a linker error with a semi-decent error message
2112
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_fill_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2113
+ # endif
2114
+ }
2115
+ #endif // __cccl_ptx_isa >= 860
2116
+
2117
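The .collector::a qualifiers that appear from here on (::fill, ::use, ::lastuse) refer to the PTX ISA's A-operand collector buffer; the description below is taken from that specification rather than from this header, so treat it as an assumption: ::fill loads A into the collector buffer, ::use re-reads it on a later MMA, and ::lastuse marks the final re-read. A hedged sketch of the usual fill-then-use pairing follows, also showing the common enable_input_d pattern (false to initialize D, true to accumulate); the names, the loop structure, and the kind_mxf8f6f4 tag constant are illustrative assumptions only.

__device__ void mma_reuse_a_sketch(
    std::uint32_t d_tmem,
    std::uint64_t a_desc,                 // A stays the same across the loop
    const std::uint64_t* b_desc,          // one descriptor per B tile
    std::uint32_t idesc,
    std::uint32_t scale_A_tmem,
    const std::uint32_t* scale_B_tmem,    // one scale-factor address per B tile
    int num_b_tiles)
{
#if defined(__CUDA_ARCH_FEAT_SM100_ALL) || defined(__CUDA_ARCH_FEAT_SM103_ALL) \
  || defined(__CUDA_ARCH_FEAT_SM110_ALL)
  // First MMA: load the A collector buffer and overwrite D.
  ::cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_fill(
      ::cuda::ptx::kind_mxf8f6f4, ::cuda::ptx::cta_group_1,
      d_tmem, a_desc, b_desc[0], idesc, scale_A_tmem, scale_B_tmem[0],
      /*enable_input_d=*/false);
  // Later MMAs: re-read A from the collector buffer and accumulate into D
  // (the ::use variant is defined further down in this header).
  for (int i = 1; i < num_b_tiles; ++i)
  {
    ::cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_use(
        ::cuda::ptx::kind_mxf8f6f4, ::cuda::ptx::cta_group_1,
        d_tmem, a_desc, b_desc[i], idesc, scale_A_tmem, scale_B_tmem[i],
        /*enable_input_d=*/true);
  }
#endif
}

Whether A may actually be re-read for a given tile shape, and when the buffer must be refilled, is governed by the PTX ISA rules for the collector buffer, not by these wrappers.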
+ /*
2118
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc,
2119
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
2120
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
2121
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2122
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
2123
+ __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill(
2124
+ cuda::ptx::kind_t<Kind> kind,
2125
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2126
+ uint32_t d_tmem,
2127
+ uint64_t a_desc,
2128
+ uint64_t b_desc,
2129
+ uint32_t idesc,
2130
+ uint32_t scale_A_tmem,
2131
+ uint32_t scale_B_tmem,
2132
+ bool enable_input_d);
2133
+ */
2134
+ #if __cccl_ptx_isa >= 860
2135
+ extern "C" _CCCL_DEVICE void
2136
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_fill_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2137
+ template <::cuda::ptx::dot_kind _Kind, ::cuda::ptx::dot_cta_group _Cta_Group>
2138
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill(
2139
+ ::cuda::ptx::kind_t<_Kind> __kind,
2140
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
2141
+ ::cuda::std::uint32_t __d_tmem,
2142
+ ::cuda::std::uint64_t __a_desc,
2143
+ ::cuda::std::uint64_t __b_desc,
2144
+ ::cuda::std::uint32_t __idesc,
2145
+ ::cuda::std::uint32_t __scale_A_tmem,
2146
+ ::cuda::std::uint32_t __scale_B_tmem,
2147
+ bool __enable_input_d)
2148
+ {
2149
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
2150
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2151
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
2152
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
2153
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
2154
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
2155
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
2156
+ {
2157
+ asm volatile(
2158
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2159
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2160
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], "
2161
+ "PRED_enable_input_d;\n\t"
2162
+ "}"
2163
+ :
2164
+ : "r"(__d_tmem),
2165
+ "l"(__a_desc),
2166
+ "l"(__b_desc),
2167
+ "r"(__idesc),
2168
+ "r"(__scale_A_tmem),
2169
+ "r"(__scale_B_tmem),
2170
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2171
+ : "memory");
2172
+ }
2173
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
2174
+ {
2175
+ asm volatile(
2176
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2177
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2178
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], "
2179
+ "PRED_enable_input_d;\n\t"
2180
+ "}"
2181
+ :
2182
+ : "r"(__d_tmem),
2183
+ "l"(__a_desc),
2184
+ "l"(__b_desc),
2185
+ "r"(__idesc),
2186
+ "r"(__scale_A_tmem),
2187
+ "r"(__scale_B_tmem),
2188
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2189
+ : "memory");
2190
+ }
2191
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
2192
+ {
2193
+ asm volatile(
2194
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2195
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2196
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], "
2197
+ "[%5], PRED_enable_input_d;\n\t"
2198
+ "}"
2199
+ :
2200
+ : "r"(__d_tmem),
2201
+ "l"(__a_desc),
2202
+ "l"(__b_desc),
2203
+ "r"(__idesc),
2204
+ "r"(__scale_A_tmem),
2205
+ "r"(__scale_B_tmem),
2206
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2207
+ : "memory");
2208
+ }
2209
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
2210
+ {
2211
+ asm volatile(
2212
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2213
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2214
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], "
2215
+ "[%5], PRED_enable_input_d;\n\t"
2216
+ "}"
2217
+ :
2218
+ : "r"(__d_tmem),
2219
+ "l"(__a_desc),
2220
+ "l"(__b_desc),
2221
+ "r"(__idesc),
2222
+ "r"(__scale_A_tmem),
2223
+ "r"(__scale_B_tmem),
2224
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2225
+ : "memory");
2226
+ }
2227
+
2228
+ # else
2229
+ // Unsupported architectures will have a linker error with a semi-decent error message
2230
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_fill_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2231
+ # endif
2232
+ }
2233
+ #endif // __cccl_ptx_isa >= 860
2234
+
2235
+ /*
2236
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc,
2237
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
2238
+ // .kind = { .kind::mxf4nvf4 }
2239
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2240
+ template <cuda::ptx::dot_cta_group Cta_Group>
2241
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_fill(
2242
+ cuda::ptx::kind_mxf4nvf4_t,
2243
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2244
+ uint32_t d_tmem,
2245
+ uint64_t a_desc,
2246
+ uint64_t b_desc,
2247
+ uint32_t idesc,
2248
+ uint32_t scale_A_tmem,
2249
+ uint32_t scale_B_tmem,
2250
+ bool enable_input_d);
2251
+ */
2252
+ #if __cccl_ptx_isa >= 860
2253
+ extern "C" _CCCL_DEVICE void
2254
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_fill_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2255
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
2256
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_fill(
2257
+ ::cuda::ptx::kind_mxf4nvf4_t,
2258
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
2259
+ ::cuda::std::uint32_t __d_tmem,
2260
+ ::cuda::std::uint64_t __a_desc,
2261
+ ::cuda::std::uint64_t __b_desc,
2262
+ ::cuda::std::uint32_t __idesc,
2263
+ ::cuda::std::uint32_t __scale_A_tmem,
2264
+ ::cuda::std::uint32_t __scale_B_tmem,
2265
+ bool __enable_input_d)
2266
+ {
2267
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
2268
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2269
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
2270
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
2271
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
2272
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
2273
+ if constexpr (__cta_group == cta_group_1)
2274
+ {
2275
+ asm volatile(
2276
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2277
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2278
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], "
2279
+ "[%5], PRED_enable_input_d;\n\t"
2280
+ "}"
2281
+ :
2282
+ : "r"(__d_tmem),
2283
+ "l"(__a_desc),
2284
+ "l"(__b_desc),
2285
+ "r"(__idesc),
2286
+ "r"(__scale_A_tmem),
2287
+ "r"(__scale_B_tmem),
2288
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2289
+ : "memory");
2290
+ }
2291
+ else if constexpr (__cta_group == cta_group_2)
2292
+ {
2293
+ asm volatile(
2294
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2295
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2296
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], "
2297
+ "[%5], PRED_enable_input_d;\n\t"
2298
+ "}"
2299
+ :
2300
+ : "r"(__d_tmem),
2301
+ "l"(__a_desc),
2302
+ "l"(__b_desc),
2303
+ "r"(__idesc),
2304
+ "r"(__scale_A_tmem),
2305
+ "r"(__scale_B_tmem),
2306
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2307
+ : "memory");
2308
+ }
2309
+
2310
+ # else
2311
+ // Unsupported architectures will have a linker error with a semi-decent error message
2312
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_fill_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2313
+ # endif
2314
+ }
2315
+ #endif // __cccl_ptx_isa >= 860
2316
+
2317
+ /*
2318
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc,
2319
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
2320
+ // .kind = { .kind::mxf8f6f4 }
2321
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2322
+ template <cuda::ptx::dot_cta_group Cta_Group>
2323
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill(
2324
+ cuda::ptx::kind_mxf8f6f4_t,
2325
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2326
+ uint32_t d_tmem,
2327
+ uint64_t a_desc,
2328
+ uint64_t b_desc,
2329
+ uint32_t idesc,
2330
+ uint32_t scale_A_tmem,
2331
+ uint32_t scale_B_tmem,
2332
+ bool enable_input_d);
2333
+ */
2334
+ #if __cccl_ptx_isa >= 860
2335
+ extern "C" _CCCL_DEVICE void
2336
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2337
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
2338
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill(
2339
+ ::cuda::ptx::kind_mxf8f6f4_t,
2340
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
2341
+ ::cuda::std::uint32_t __d_tmem,
2342
+ ::cuda::std::uint64_t __a_desc,
2343
+ ::cuda::std::uint64_t __b_desc,
2344
+ ::cuda::std::uint32_t __idesc,
2345
+ ::cuda::std::uint32_t __scale_A_tmem,
2346
+ ::cuda::std::uint32_t __scale_B_tmem,
2347
+ bool __enable_input_d)
2348
+ {
2349
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
2350
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2351
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
2352
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
2353
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
2354
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
2355
+ if constexpr (__cta_group == cta_group_1)
2356
+ {
2357
+ asm volatile(
2358
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2359
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2360
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], "
2361
+ "[%5], PRED_enable_input_d;\n\t"
2362
+ "}"
2363
+ :
2364
+ : "r"(__d_tmem),
2365
+ "l"(__a_desc),
2366
+ "l"(__b_desc),
2367
+ "r"(__idesc),
2368
+ "r"(__scale_A_tmem),
2369
+ "r"(__scale_B_tmem),
2370
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2371
+ : "memory");
2372
+ }
2373
+ else if constexpr (__cta_group == cta_group_2)
2374
+ {
2375
+ asm volatile(
2376
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2377
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2378
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], "
2379
+ "[%5], PRED_enable_input_d;\n\t"
2380
+ "}"
2381
+ :
2382
+ : "r"(__d_tmem),
2383
+ "l"(__a_desc),
2384
+ "l"(__b_desc),
2385
+ "r"(__idesc),
2386
+ "r"(__scale_A_tmem),
2387
+ "r"(__scale_B_tmem),
2388
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2389
+ : "memory");
2390
+ }
2391
+
2392
+ # else
2393
+ // Unsupported architectures will have a linker error with a semi-decent error message
2394
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2395
+ # endif
2396
+ }
2397
+ #endif // __cccl_ptx_isa >= 860
2398
+
2399
+ /*
2400
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc,
2401
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
2402
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
2403
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2404
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
2405
+ __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill(
2406
+ cuda::ptx::kind_t<Kind> kind,
2407
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2408
+ uint32_t d_tmem,
2409
+ uint64_t a_desc,
2410
+ uint64_t b_desc,
2411
+ uint32_t idesc,
2412
+ uint32_t scale_A_tmem,
2413
+ uint32_t scale_B_tmem,
2414
+ bool enable_input_d);
2415
+ */
2416
+ #if __cccl_ptx_isa >= 860
2417
+ extern "C" _CCCL_DEVICE void
2418
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2419
+ template <::cuda::ptx::dot_kind _Kind, ::cuda::ptx::dot_cta_group _Cta_Group>
2420
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill(
2421
+ ::cuda::ptx::kind_t<_Kind> __kind,
2422
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
2423
+ ::cuda::std::uint32_t __d_tmem,
2424
+ ::cuda::std::uint64_t __a_desc,
2425
+ ::cuda::std::uint64_t __b_desc,
2426
+ ::cuda::std::uint32_t __idesc,
2427
+ ::cuda::std::uint32_t __scale_A_tmem,
2428
+ ::cuda::std::uint32_t __scale_B_tmem,
2429
+ bool __enable_input_d)
2430
+ {
2431
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
2432
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2433
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
2434
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
2435
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
2436
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
2437
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
2438
+ {
2439
+ asm volatile(
2440
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2441
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2442
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], "
2443
+ "PRED_enable_input_d;\n\t"
2444
+ "}"
2445
+ :
2446
+ : "r"(__d_tmem),
2447
+ "l"(__a_desc),
2448
+ "l"(__b_desc),
2449
+ "r"(__idesc),
2450
+ "r"(__scale_A_tmem),
2451
+ "r"(__scale_B_tmem),
2452
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2453
+ : "memory");
2454
+ }
2455
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
2456
+ {
2457
+ asm volatile(
2458
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2459
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2460
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], "
2461
+ "PRED_enable_input_d;\n\t"
2462
+ "}"
2463
+ :
2464
+ : "r"(__d_tmem),
2465
+ "l"(__a_desc),
2466
+ "l"(__b_desc),
2467
+ "r"(__idesc),
2468
+ "r"(__scale_A_tmem),
2469
+ "r"(__scale_B_tmem),
2470
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2471
+ : "memory");
2472
+ }
2473
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
2474
+ {
2475
+ asm volatile(
2476
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2477
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2478
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], "
2479
+ "[%5], PRED_enable_input_d;\n\t"
2480
+ "}"
2481
+ :
2482
+ : "r"(__d_tmem),
2483
+ "l"(__a_desc),
2484
+ "l"(__b_desc),
2485
+ "r"(__idesc),
2486
+ "r"(__scale_A_tmem),
2487
+ "r"(__scale_B_tmem),
2488
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2489
+ : "memory");
2490
+ }
2491
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
2492
+ {
2493
+ asm volatile(
2494
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2495
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2496
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], "
2497
+ "[%5], PRED_enable_input_d;\n\t"
2498
+ "}"
2499
+ :
2500
+ : "r"(__d_tmem),
2501
+ "l"(__a_desc),
2502
+ "l"(__b_desc),
2503
+ "r"(__idesc),
2504
+ "r"(__scale_A_tmem),
2505
+ "r"(__scale_B_tmem),
2506
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2507
+ : "memory");
2508
+ }
2509
+
2510
+ # else
2511
+ // Unsupported architectures will have a linker error with a semi-decent error message
2512
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2513
+ # endif
2514
+ }
2515
+ #endif // __cccl_ptx_isa >= 860
2516
+
2517
+ /*
2518
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc,
2519
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
2520
+ // .kind = { .kind::mxf4nvf4 }
2521
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2522
+ template <cuda::ptx::dot_cta_group Cta_Group>
2523
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill(
2524
+ cuda::ptx::kind_mxf4nvf4_t,
2525
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2526
+ uint32_t d_tmem,
2527
+ uint64_t a_desc,
2528
+ uint64_t b_desc,
2529
+ uint32_t idesc,
2530
+ uint32_t scale_A_tmem,
2531
+ uint32_t scale_B_tmem,
2532
+ bool enable_input_d);
2533
+ */
2534
+ #if __cccl_ptx_isa >= 860
2535
+ extern "C" _CCCL_DEVICE void
2536
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2537
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
2538
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill(
2539
+ ::cuda::ptx::kind_mxf4nvf4_t,
2540
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
2541
+ ::cuda::std::uint32_t __d_tmem,
2542
+ ::cuda::std::uint64_t __a_desc,
2543
+ ::cuda::std::uint64_t __b_desc,
2544
+ ::cuda::std::uint32_t __idesc,
2545
+ ::cuda::std::uint32_t __scale_A_tmem,
2546
+ ::cuda::std::uint32_t __scale_B_tmem,
2547
+ bool __enable_input_d)
2548
+ {
2549
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
2550
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2551
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
2552
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
2553
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
2554
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
2555
+ if constexpr (__cta_group == cta_group_1)
2556
+ {
2557
+ asm volatile(
2558
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2559
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2560
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], "
2561
+ "[%5], PRED_enable_input_d;\n\t"
2562
+ "}"
2563
+ :
2564
+ : "r"(__d_tmem),
2565
+ "l"(__a_desc),
2566
+ "l"(__b_desc),
2567
+ "r"(__idesc),
2568
+ "r"(__scale_A_tmem),
2569
+ "r"(__scale_B_tmem),
2570
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2571
+ : "memory");
2572
+ }
2573
+ else if constexpr (__cta_group == cta_group_2)
2574
+ {
2575
+ asm volatile(
2576
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2577
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2578
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], "
2579
+ "[%5], PRED_enable_input_d;\n\t"
2580
+ "}"
2581
+ :
2582
+ : "r"(__d_tmem),
2583
+ "l"(__a_desc),
2584
+ "l"(__b_desc),
2585
+ "r"(__idesc),
2586
+ "r"(__scale_A_tmem),
2587
+ "r"(__scale_B_tmem),
2588
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2589
+ : "memory");
2590
+ }
2591
+
2592
+ # else
2593
+ // Unsupported architectures will have a linker error with a semi-decent error message
2594
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2595
+ # endif
2596
+ }
2597
+ #endif // __cccl_ptx_isa >= 860
2598
+
2599
+ /*
2600
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc,
2601
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
2602
+ // .kind = { .kind::mxf8f6f4 }
2603
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2604
+ template <cuda::ptx::dot_cta_group Cta_Group>
2605
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_use(
2606
+ cuda::ptx::kind_mxf8f6f4_t,
2607
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2608
+ uint32_t d_tmem,
2609
+ uint64_t a_desc,
2610
+ uint64_t b_desc,
2611
+ uint32_t idesc,
2612
+ uint32_t scale_A_tmem,
2613
+ uint32_t scale_B_tmem,
2614
+ bool enable_input_d);
2615
+ */
2616
+ #if __cccl_ptx_isa >= 860
2617
+ extern "C" _CCCL_DEVICE void
2618
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_use_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2619
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
2620
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_use(
2621
+ ::cuda::ptx::kind_mxf8f6f4_t,
2622
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
2623
+ ::cuda::std::uint32_t __d_tmem,
2624
+ ::cuda::std::uint64_t __a_desc,
2625
+ ::cuda::std::uint64_t __b_desc,
2626
+ ::cuda::std::uint32_t __idesc,
2627
+ ::cuda::std::uint32_t __scale_A_tmem,
2628
+ ::cuda::std::uint32_t __scale_B_tmem,
2629
+ bool __enable_input_d)
2630
+ {
2631
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
2632
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2633
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
2634
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
2635
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
2636
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
2637
+ if constexpr (__cta_group == cta_group_1)
2638
+ {
2639
+ asm volatile(
2640
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2641
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2642
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], "
2643
+ "[%5], PRED_enable_input_d;\n\t"
2644
+ "}"
2645
+ :
2646
+ : "r"(__d_tmem),
2647
+ "l"(__a_desc),
2648
+ "l"(__b_desc),
2649
+ "r"(__idesc),
2650
+ "r"(__scale_A_tmem),
2651
+ "r"(__scale_B_tmem),
2652
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2653
+ : "memory");
2654
+ }
2655
+ else if constexpr (__cta_group == cta_group_2)
2656
+ {
2657
+ asm volatile(
2658
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2659
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2660
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], "
2661
+ "[%5], PRED_enable_input_d;\n\t"
2662
+ "}"
2663
+ :
2664
+ : "r"(__d_tmem),
2665
+ "l"(__a_desc),
2666
+ "l"(__b_desc),
2667
+ "r"(__idesc),
2668
+ "r"(__scale_A_tmem),
2669
+ "r"(__scale_B_tmem),
2670
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2671
+ : "memory");
2672
+ }
2673
+
2674
+ # else
2675
+ // Unsupported architectures will have a linker error with a semi-decent error message
2676
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_use_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2677
+ # endif
2678
+ }
2679
+ #endif // __cccl_ptx_isa >= 860
2680
+
2681
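One practical detail shared by every wrapper in this family: enable_input_d is an ordinary run-time bool. Each asm block above passes it as operand %6 and turns it into a predicate with setp.ne.b32, so the flag does not need to be a compile-time constant. A minimal sketch, under the same assumptions about includes and tag constants as the earlier sketches:

__device__ void mma_use_runtime_flag_sketch(
    std::uint32_t d_tmem, std::uint64_t a_desc, std::uint64_t b_desc,
    std::uint32_t idesc, std::uint32_t scale_A_tmem, std::uint32_t scale_B_tmem,
    int tile_idx)
{
#if defined(__CUDA_ARCH_FEAT_SM100_ALL) || defined(__CUDA_ARCH_FEAT_SM103_ALL) \
  || defined(__CUDA_ARCH_FEAT_SM110_ALL)
  // Accumulate into D on every tile except the first; the comparison is
  // evaluated at run time and becomes the PTX predicate inside the wrapper.
  ::cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_use(
      ::cuda::ptx::kind_mxf8f6f4, ::cuda::ptx::cta_group_1,
      d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem,
      /*enable_input_d=*/tile_idx != 0);
#endif
}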
+ /*
2682
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc,
2683
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
2684
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
2685
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2686
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
2687
+ __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use(
2688
+ cuda::ptx::kind_t<Kind> kind,
2689
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2690
+ uint32_t d_tmem,
2691
+ uint64_t a_desc,
2692
+ uint64_t b_desc,
2693
+ uint32_t idesc,
2694
+ uint32_t scale_A_tmem,
2695
+ uint32_t scale_B_tmem,
2696
+ bool enable_input_d);
2697
+ */
2698
+ #if __cccl_ptx_isa >= 860
2699
+ extern "C" _CCCL_DEVICE void
2700
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_use_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2701
+ template <::cuda::ptx::dot_kind _Kind, ::cuda::ptx::dot_cta_group _Cta_Group>
2702
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use(
2703
+ ::cuda::ptx::kind_t<_Kind> __kind,
2704
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
2705
+ ::cuda::std::uint32_t __d_tmem,
2706
+ ::cuda::std::uint64_t __a_desc,
2707
+ ::cuda::std::uint64_t __b_desc,
2708
+ ::cuda::std::uint32_t __idesc,
2709
+ ::cuda::std::uint32_t __scale_A_tmem,
2710
+ ::cuda::std::uint32_t __scale_B_tmem,
2711
+ bool __enable_input_d)
2712
+ {
2713
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
2714
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2715
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
2716
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
2717
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
2718
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
2719
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
2720
+ {
2721
+ asm volatile(
2722
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2723
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2724
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], "
2725
+ "PRED_enable_input_d;\n\t"
2726
+ "}"
2727
+ :
2728
+ : "r"(__d_tmem),
2729
+ "l"(__a_desc),
2730
+ "l"(__b_desc),
2731
+ "r"(__idesc),
2732
+ "r"(__scale_A_tmem),
2733
+ "r"(__scale_B_tmem),
2734
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2735
+ : "memory");
2736
+ }
2737
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
2738
+ {
2739
+ asm volatile(
2740
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2741
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2742
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], "
2743
+ "PRED_enable_input_d;\n\t"
2744
+ "}"
2745
+ :
2746
+ : "r"(__d_tmem),
2747
+ "l"(__a_desc),
2748
+ "l"(__b_desc),
2749
+ "r"(__idesc),
2750
+ "r"(__scale_A_tmem),
2751
+ "r"(__scale_B_tmem),
2752
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2753
+ : "memory");
2754
+ }
2755
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
2756
+ {
2757
+ asm volatile(
2758
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2759
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2760
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], "
2761
+ "[%5], PRED_enable_input_d;\n\t"
2762
+ "}"
2763
+ :
2764
+ : "r"(__d_tmem),
2765
+ "l"(__a_desc),
2766
+ "l"(__b_desc),
2767
+ "r"(__idesc),
2768
+ "r"(__scale_A_tmem),
2769
+ "r"(__scale_B_tmem),
2770
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2771
+ : "memory");
2772
+ }
2773
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
2774
+ {
2775
+ asm volatile(
2776
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2777
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2778
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], "
2779
+ "[%5], PRED_enable_input_d;\n\t"
2780
+ "}"
2781
+ :
2782
+ : "r"(__d_tmem),
2783
+ "l"(__a_desc),
2784
+ "l"(__b_desc),
2785
+ "r"(__idesc),
2786
+ "r"(__scale_A_tmem),
2787
+ "r"(__scale_B_tmem),
2788
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2789
+ : "memory");
2790
+ }
2791
+
2792
+ # else
2793
+ // Unsupported architectures will have a linker error with a semi-decent error message
2794
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_use_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2795
+ # endif
2796
+ }
2797
+ #endif // __cccl_ptx_isa >= 860
2798
+
2799
+ /*
2800
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc,
2801
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
2802
+ // .kind = { .kind::mxf4nvf4 }
2803
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2804
+ template <cuda::ptx::dot_cta_group Cta_Group>
2805
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_use(
2806
+ cuda::ptx::kind_mxf4nvf4_t,
2807
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2808
+ uint32_t d_tmem,
2809
+ uint64_t a_desc,
2810
+ uint64_t b_desc,
2811
+ uint32_t idesc,
2812
+ uint32_t scale_A_tmem,
2813
+ uint32_t scale_B_tmem,
2814
+ bool enable_input_d);
2815
+ */
2816
+ #if __cccl_ptx_isa >= 860
2817
+ extern "C" _CCCL_DEVICE void
2818
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_use_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2819
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
2820
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_use(
2821
+ ::cuda::ptx::kind_mxf4nvf4_t,
2822
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
2823
+ ::cuda::std::uint32_t __d_tmem,
2824
+ ::cuda::std::uint64_t __a_desc,
2825
+ ::cuda::std::uint64_t __b_desc,
2826
+ ::cuda::std::uint32_t __idesc,
2827
+ ::cuda::std::uint32_t __scale_A_tmem,
2828
+ ::cuda::std::uint32_t __scale_B_tmem,
2829
+ bool __enable_input_d)
2830
+ {
2831
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
2832
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2833
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
2834
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
2835
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
2836
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
2837
+ if constexpr (__cta_group == cta_group_1)
2838
+ {
2839
+ asm volatile(
2840
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2841
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2842
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], "
2843
+ "[%5], PRED_enable_input_d;\n\t"
2844
+ "}"
2845
+ :
2846
+ : "r"(__d_tmem),
2847
+ "l"(__a_desc),
2848
+ "l"(__b_desc),
2849
+ "r"(__idesc),
2850
+ "r"(__scale_A_tmem),
2851
+ "r"(__scale_B_tmem),
2852
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2853
+ : "memory");
2854
+ }
2855
+ else if constexpr (__cta_group == cta_group_2)
2856
+ {
2857
+ asm volatile(
2858
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2859
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2860
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], "
2861
+ "[%5], PRED_enable_input_d;\n\t"
2862
+ "}"
2863
+ :
2864
+ : "r"(__d_tmem),
2865
+ "l"(__a_desc),
2866
+ "l"(__b_desc),
2867
+ "r"(__idesc),
2868
+ "r"(__scale_A_tmem),
2869
+ "r"(__scale_B_tmem),
2870
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2871
+ : "memory");
2872
+ }
2873
+
2874
+ # else
2875
+ // Unsupported architectures will have a linker error with a semi-decent error message
2876
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_use_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2877
+ # endif
2878
+ }
2879
+ #endif // __cccl_ptx_isa >= 860
2880
+
2881
+ /*
2882
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc,
2883
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
2884
+ // .kind = { .kind::mxf8f6f4 }
2885
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2886
+ template <cuda::ptx::dot_cta_group Cta_Group>
2887
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use(
2888
+ cuda::ptx::kind_mxf8f6f4_t,
2889
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2890
+ uint32_t d_tmem,
2891
+ uint64_t a_desc,
2892
+ uint64_t b_desc,
2893
+ uint32_t idesc,
2894
+ uint32_t scale_A_tmem,
2895
+ uint32_t scale_B_tmem,
2896
+ bool enable_input_d);
2897
+ */
2898
+ #if __cccl_ptx_isa >= 860
2899
+ extern "C" _CCCL_DEVICE void
2900
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2901
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
2902
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use(
2903
+ ::cuda::ptx::kind_mxf8f6f4_t,
2904
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
2905
+ ::cuda::std::uint32_t __d_tmem,
2906
+ ::cuda::std::uint64_t __a_desc,
2907
+ ::cuda::std::uint64_t __b_desc,
2908
+ ::cuda::std::uint32_t __idesc,
2909
+ ::cuda::std::uint32_t __scale_A_tmem,
2910
+ ::cuda::std::uint32_t __scale_B_tmem,
2911
+ bool __enable_input_d)
2912
+ {
2913
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
2914
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2915
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
2916
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
2917
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
2918
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
2919
+ if constexpr (__cta_group == cta_group_1)
2920
+ {
2921
+ asm volatile(
2922
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2923
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2924
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], "
2925
+ "[%5], PRED_enable_input_d;\n\t"
2926
+ "}"
2927
+ :
2928
+ : "r"(__d_tmem),
2929
+ "l"(__a_desc),
2930
+ "l"(__b_desc),
2931
+ "r"(__idesc),
2932
+ "r"(__scale_A_tmem),
2933
+ "r"(__scale_B_tmem),
2934
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2935
+ : "memory");
2936
+ }
2937
+ else if constexpr (__cta_group == cta_group_2)
2938
+ {
2939
+ asm volatile(
2940
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
2941
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
2942
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], "
2943
+ "[%5], PRED_enable_input_d;\n\t"
2944
+ "}"
2945
+ :
2946
+ : "r"(__d_tmem),
2947
+ "l"(__a_desc),
2948
+ "l"(__b_desc),
2949
+ "r"(__idesc),
2950
+ "r"(__scale_A_tmem),
2951
+ "r"(__scale_B_tmem),
2952
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
2953
+ : "memory");
2954
+ }
2955
+
2956
+ # else
2957
+ // Unsupported architectures will have a linker error with a semi-decent error message
2958
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2959
+ # endif
2960
+ }
2961
+ #endif // __cccl_ptx_isa >= 860
2962
+
2963
+ /*
2964
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc,
2965
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
2966
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
2967
+ // .cta_group = { .cta_group::1, .cta_group::2 }
2968
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
2969
+ __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use(
2970
+ cuda::ptx::kind_t<Kind> kind,
2971
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
2972
+ uint32_t d_tmem,
2973
+ uint64_t a_desc,
2974
+ uint64_t b_desc,
2975
+ uint32_t idesc,
2976
+ uint32_t scale_A_tmem,
2977
+ uint32_t scale_B_tmem,
2978
+ bool enable_input_d);
2979
+ */
2980
+ #if __cccl_ptx_isa >= 860
2981
+ extern "C" _CCCL_DEVICE void
2982
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
2983
+ template <::cuda::ptx::dot_kind _Kind, ::cuda::ptx::dot_cta_group _Cta_Group>
2984
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use(
2985
+ ::cuda::ptx::kind_t<_Kind> __kind,
2986
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
2987
+ ::cuda::std::uint32_t __d_tmem,
2988
+ ::cuda::std::uint64_t __a_desc,
2989
+ ::cuda::std::uint64_t __b_desc,
2990
+ ::cuda::std::uint32_t __idesc,
2991
+ ::cuda::std::uint32_t __scale_A_tmem,
2992
+ ::cuda::std::uint32_t __scale_B_tmem,
2993
+ bool __enable_input_d)
2994
+ {
2995
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
2996
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
2997
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
2998
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
2999
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
3000
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
3001
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
3002
+ {
3003
+ asm volatile(
3004
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3005
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3006
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], "
3007
+ "PRED_enable_input_d;\n\t"
3008
+ "}"
3009
+ :
3010
+ : "r"(__d_tmem),
3011
+ "l"(__a_desc),
3012
+ "l"(__b_desc),
3013
+ "r"(__idesc),
3014
+ "r"(__scale_A_tmem),
3015
+ "r"(__scale_B_tmem),
3016
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3017
+ : "memory");
3018
+ }
3019
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
3020
+ {
3021
+ asm volatile(
3022
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3023
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3024
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], "
3025
+ "PRED_enable_input_d;\n\t"
3026
+ "}"
3027
+ :
3028
+ : "r"(__d_tmem),
3029
+ "l"(__a_desc),
3030
+ "l"(__b_desc),
3031
+ "r"(__idesc),
3032
+ "r"(__scale_A_tmem),
3033
+ "r"(__scale_B_tmem),
3034
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3035
+ : "memory");
3036
+ }
3037
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
3038
+ {
3039
+ asm volatile(
3040
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3041
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3042
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], "
3043
+ "[%5], PRED_enable_input_d;\n\t"
3044
+ "}"
3045
+ :
3046
+ : "r"(__d_tmem),
3047
+ "l"(__a_desc),
3048
+ "l"(__b_desc),
3049
+ "r"(__idesc),
3050
+ "r"(__scale_A_tmem),
3051
+ "r"(__scale_B_tmem),
3052
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3053
+ : "memory");
3054
+ }
3055
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
3056
+ {
3057
+ asm volatile(
3058
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3059
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3060
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], "
3061
+ "[%5], PRED_enable_input_d;\n\t"
3062
+ "}"
3063
+ :
3064
+ : "r"(__d_tmem),
3065
+ "l"(__a_desc),
3066
+ "l"(__b_desc),
3067
+ "r"(__idesc),
3068
+ "r"(__scale_A_tmem),
3069
+ "r"(__scale_B_tmem),
3070
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3071
+ : "memory");
3072
+ }
3073
+
3074
+ # else
3075
+ // Unsupported architectures will have a linker error with a semi-decent error message
3076
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3077
+ # endif
3078
+ }
3079
+ #endif // __cccl_ptx_isa >= 860
3080
+
3081
+ /*
3082
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc,
3083
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
3084
+ // .kind = { .kind::mxf4nvf4 }
3085
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3086
+ template <cuda::ptx::dot_cta_group Cta_Group>
3087
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use(
3088
+ cuda::ptx::kind_mxf4nvf4_t,
3089
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3090
+ uint32_t d_tmem,
3091
+ uint64_t a_desc,
3092
+ uint64_t b_desc,
3093
+ uint32_t idesc,
3094
+ uint32_t scale_A_tmem,
3095
+ uint32_t scale_B_tmem,
3096
+ bool enable_input_d);
3097
+ */
3098
+ #if __cccl_ptx_isa >= 860
3099
+ extern "C" _CCCL_DEVICE void
3100
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3101
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
3102
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use(
3103
+ ::cuda::ptx::kind_mxf4nvf4_t,
3104
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
3105
+ ::cuda::std::uint32_t __d_tmem,
3106
+ ::cuda::std::uint64_t __a_desc,
3107
+ ::cuda::std::uint64_t __b_desc,
3108
+ ::cuda::std::uint32_t __idesc,
3109
+ ::cuda::std::uint32_t __scale_A_tmem,
3110
+ ::cuda::std::uint32_t __scale_B_tmem,
3111
+ bool __enable_input_d)
3112
+ {
3113
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
3114
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3115
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
3116
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
3117
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
3118
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
3119
+ if constexpr (__cta_group == cta_group_1)
3120
+ {
3121
+ asm volatile(
3122
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3123
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3124
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], "
3125
+ "[%5], PRED_enable_input_d;\n\t"
3126
+ "}"
3127
+ :
3128
+ : "r"(__d_tmem),
3129
+ "l"(__a_desc),
3130
+ "l"(__b_desc),
3131
+ "r"(__idesc),
3132
+ "r"(__scale_A_tmem),
3133
+ "r"(__scale_B_tmem),
3134
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3135
+ : "memory");
3136
+ }
3137
+ else if constexpr (__cta_group == cta_group_2)
3138
+ {
3139
+ asm volatile(
3140
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3141
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3142
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], "
3143
+ "[%5], PRED_enable_input_d;\n\t"
3144
+ "}"
3145
+ :
3146
+ : "r"(__d_tmem),
3147
+ "l"(__a_desc),
3148
+ "l"(__b_desc),
3149
+ "r"(__idesc),
3150
+ "r"(__scale_A_tmem),
3151
+ "r"(__scale_B_tmem),
3152
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3153
+ : "memory");
3154
+ }
3155
+
3156
+ # else
3157
+ // Unsupported architectures will have a linker error with a semi-decent error message
3158
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3159
+ # endif
3160
+ }
3161
+ #endif // __cccl_ptx_isa >= 860
3162
+
3163
+ /*
3164
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc,
3165
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
3166
+ // .kind = { .kind::mxf8f6f4 }
3167
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3168
+ template <cuda::ptx::dot_cta_group Cta_Group>
3169
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_lastuse(
3170
+ cuda::ptx::kind_mxf8f6f4_t,
3171
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3172
+ uint32_t d_tmem,
3173
+ uint64_t a_desc,
3174
+ uint64_t b_desc,
3175
+ uint32_t idesc,
3176
+ uint32_t scale_A_tmem,
3177
+ uint32_t scale_B_tmem,
3178
+ bool enable_input_d);
3179
+ */
3180
+ #if __cccl_ptx_isa >= 860
3181
+ extern "C" _CCCL_DEVICE void
3182
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_lastuse_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3183
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
3184
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_lastuse(
3185
+ ::cuda::ptx::kind_mxf8f6f4_t,
3186
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
3187
+ ::cuda::std::uint32_t __d_tmem,
3188
+ ::cuda::std::uint64_t __a_desc,
3189
+ ::cuda::std::uint64_t __b_desc,
3190
+ ::cuda::std::uint32_t __idesc,
3191
+ ::cuda::std::uint32_t __scale_A_tmem,
3192
+ ::cuda::std::uint32_t __scale_B_tmem,
3193
+ bool __enable_input_d)
3194
+ {
3195
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
3196
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3197
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
3198
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
3199
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
3200
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
3201
+ if constexpr (__cta_group == cta_group_1)
3202
+ {
3203
+ asm volatile(
3204
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3205
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3206
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3207
+ "[%5], PRED_enable_input_d;\n\t"
3208
+ "}"
3209
+ :
3210
+ : "r"(__d_tmem),
3211
+ "l"(__a_desc),
3212
+ "l"(__b_desc),
3213
+ "r"(__idesc),
3214
+ "r"(__scale_A_tmem),
3215
+ "r"(__scale_B_tmem),
3216
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3217
+ : "memory");
3218
+ }
3219
+ else if constexpr (__cta_group == cta_group_2)
3220
+ {
3221
+ asm volatile(
3222
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3223
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3224
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3225
+ "[%5], PRED_enable_input_d;\n\t"
3226
+ "}"
3227
+ :
3228
+ : "r"(__d_tmem),
3229
+ "l"(__a_desc),
3230
+ "l"(__b_desc),
3231
+ "r"(__idesc),
3232
+ "r"(__scale_A_tmem),
3233
+ "r"(__scale_B_tmem),
3234
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3235
+ : "memory");
3236
+ }
3237
+
3238
+ # else
3239
+ // Unsupported architectures will have a linker error with a semi-decent error message
3240
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_lastuse_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3241
+ # endif
3242
+ }
3243
+ #endif // __cccl_ptx_isa >= 860
3244
+
3245
+ /*
3246
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc,
3247
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
3248
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
3249
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3250
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
3251
+ __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse(
3252
+ cuda::ptx::kind_t<Kind> kind,
3253
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3254
+ uint32_t d_tmem,
3255
+ uint64_t a_desc,
3256
+ uint64_t b_desc,
3257
+ uint32_t idesc,
3258
+ uint32_t scale_A_tmem,
3259
+ uint32_t scale_B_tmem,
3260
+ bool enable_input_d);
3261
+ */
3262
+ #if __cccl_ptx_isa >= 860
3263
+ extern "C" _CCCL_DEVICE void
3264
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_lastuse_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3265
+ template <::cuda::ptx::dot_kind _Kind, ::cuda::ptx::dot_cta_group _Cta_Group>
3266
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse(
3267
+ ::cuda::ptx::kind_t<_Kind> __kind,
3268
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
3269
+ ::cuda::std::uint32_t __d_tmem,
3270
+ ::cuda::std::uint64_t __a_desc,
3271
+ ::cuda::std::uint64_t __b_desc,
3272
+ ::cuda::std::uint32_t __idesc,
3273
+ ::cuda::std::uint32_t __scale_A_tmem,
3274
+ ::cuda::std::uint32_t __scale_B_tmem,
3275
+ bool __enable_input_d)
3276
+ {
3277
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
3278
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3279
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
3280
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
3281
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
3282
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
3283
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
3284
+ {
3285
+ asm volatile(
3286
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3287
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3288
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3289
+ "[%5], PRED_enable_input_d;\n\t"
3290
+ "}"
3291
+ :
3292
+ : "r"(__d_tmem),
3293
+ "l"(__a_desc),
3294
+ "l"(__b_desc),
3295
+ "r"(__idesc),
3296
+ "r"(__scale_A_tmem),
3297
+ "r"(__scale_B_tmem),
3298
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3299
+ : "memory");
3300
+ }
3301
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
3302
+ {
3303
+ asm volatile(
3304
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3305
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3306
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3307
+ "[%5], PRED_enable_input_d;\n\t"
3308
+ "}"
3309
+ :
3310
+ : "r"(__d_tmem),
3311
+ "l"(__a_desc),
3312
+ "l"(__b_desc),
3313
+ "r"(__idesc),
3314
+ "r"(__scale_A_tmem),
3315
+ "r"(__scale_B_tmem),
3316
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3317
+ : "memory");
3318
+ }
3319
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
3320
+ {
3321
+ asm volatile(
3322
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3323
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3324
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3325
+ "[%5], PRED_enable_input_d;\n\t"
3326
+ "}"
3327
+ :
3328
+ : "r"(__d_tmem),
3329
+ "l"(__a_desc),
3330
+ "l"(__b_desc),
3331
+ "r"(__idesc),
3332
+ "r"(__scale_A_tmem),
3333
+ "r"(__scale_B_tmem),
3334
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3335
+ : "memory");
3336
+ }
3337
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
3338
+ {
3339
+ asm volatile(
3340
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3341
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3342
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3343
+ "[%5], PRED_enable_input_d;\n\t"
3344
+ "}"
3345
+ :
3346
+ : "r"(__d_tmem),
3347
+ "l"(__a_desc),
3348
+ "l"(__b_desc),
3349
+ "r"(__idesc),
3350
+ "r"(__scale_A_tmem),
3351
+ "r"(__scale_B_tmem),
3352
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3353
+ : "memory");
3354
+ }
3355
+
3356
+ # else
3357
+ // Unsupported architectures will have a linker error with a semi-decent error message
3358
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_lastuse_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3359
+ # endif
3360
+ }
3361
+ #endif // __cccl_ptx_isa >= 860
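In the same spirit, a hedged sketch of driving the kind-templated scale_vec::2X form: the kind tag (cuda::ptx::kind_mxf4 or cuda::ptx::kind_mxf4nvf4, the two values the static_assert above accepts) selects the corresponding asm variant at compile time, and all operand values remain placeholders assumed to be produced elsewhere in the kernel.

#include <cuda/ptx>
#include <cuda/std/cstdint>

template <class KindTag>
__device__ void issue_2x_lastuse_sketch(
  KindTag kind,                       // cuda::ptx::kind_mxf4 or cuda::ptx::kind_mxf4nvf4
  ::cuda::std::uint32_t d_tmem,
  ::cuda::std::uint64_t a_desc,
  ::cuda::std::uint64_t b_desc,
  ::cuda::std::uint32_t idesc,
  ::cuda::std::uint32_t scale_A_tmem,
  ::cuda::std::uint32_t scale_B_tmem)
{
  // cta_group_2 selects the two-CTA grouping; enable_input_d == false means D is not
  // read as input, so the accumulator is overwritten rather than accumulated into.
  cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse(
    kind, cuda::ptx::cta_group_2, d_tmem, a_desc, b_desc, idesc,
    scale_A_tmem, scale_B_tmem, /* enable_input_d */ false);
}

// Hypothetical call sites (all operand names assumed):
//   issue_2x_lastuse_sketch(cuda::ptx::kind_mxf4,     d_tmem, a_desc, b_desc, idesc, sfa, sfb);
//   issue_2x_lastuse_sketch(cuda::ptx::kind_mxf4nvf4, d_tmem, a_desc, b_desc, idesc, sfa, sfb);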
3362
+
3363
+ /*
3364
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc,
3365
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
3366
+ // .kind = { .kind::mxf4nvf4 }
3367
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3368
+ template <cuda::ptx::dot_cta_group Cta_Group>
3369
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_lastuse(
3370
+ cuda::ptx::kind_mxf4nvf4_t,
3371
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3372
+ uint32_t d_tmem,
3373
+ uint64_t a_desc,
3374
+ uint64_t b_desc,
3375
+ uint32_t idesc,
3376
+ uint32_t scale_A_tmem,
3377
+ uint32_t scale_B_tmem,
3378
+ bool enable_input_d);
3379
+ */
3380
+ #if __cccl_ptx_isa >= 860
3381
+ extern "C" _CCCL_DEVICE void
3382
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_lastuse_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3383
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
3384
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_lastuse(
3385
+ ::cuda::ptx::kind_mxf4nvf4_t,
3386
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
3387
+ ::cuda::std::uint32_t __d_tmem,
3388
+ ::cuda::std::uint64_t __a_desc,
3389
+ ::cuda::std::uint64_t __b_desc,
3390
+ ::cuda::std::uint32_t __idesc,
3391
+ ::cuda::std::uint32_t __scale_A_tmem,
3392
+ ::cuda::std::uint32_t __scale_B_tmem,
3393
+ bool __enable_input_d)
3394
+ {
3395
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
3396
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3397
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
3398
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
3399
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
3400
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
3401
+ if constexpr (__cta_group == cta_group_1)
3402
+ {
3403
+ asm volatile(
3404
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3405
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3406
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3407
+ "[%5], PRED_enable_input_d;\n\t"
3408
+ "}"
3409
+ :
3410
+ : "r"(__d_tmem),
3411
+ "l"(__a_desc),
3412
+ "l"(__b_desc),
3413
+ "r"(__idesc),
3414
+ "r"(__scale_A_tmem),
3415
+ "r"(__scale_B_tmem),
3416
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3417
+ : "memory");
3418
+ }
3419
+ else if constexpr (__cta_group == cta_group_2)
3420
+ {
3421
+ asm volatile(
3422
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3423
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3424
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3425
+ "[%5], PRED_enable_input_d;\n\t"
3426
+ "}"
3427
+ :
3428
+ : "r"(__d_tmem),
3429
+ "l"(__a_desc),
3430
+ "l"(__b_desc),
3431
+ "r"(__idesc),
3432
+ "r"(__scale_A_tmem),
3433
+ "r"(__scale_B_tmem),
3434
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3435
+ : "memory");
3436
+ }
3437
+
3438
+ # else
3439
+ // Unsupported architectures will have a linker error with a semi-decent error message
3440
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_lastuse_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3441
+ # endif
3442
+ }
3443
+ #endif // __cccl_ptx_isa >= 860
3444
+
3445
+ /*
3446
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc,
3447
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
3448
+ // .kind = { .kind::mxf8f6f4 }
3449
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3450
+ template <cuda::ptx::dot_cta_group Cta_Group>
3451
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse(
3452
+ cuda::ptx::kind_mxf8f6f4_t,
3453
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3454
+ uint32_t d_tmem,
3455
+ uint64_t a_desc,
3456
+ uint64_t b_desc,
3457
+ uint32_t idesc,
3458
+ uint32_t scale_A_tmem,
3459
+ uint32_t scale_B_tmem,
3460
+ bool enable_input_d);
3461
+ */
3462
+ #if __cccl_ptx_isa >= 860
3463
+ extern "C" _CCCL_DEVICE void
3464
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3465
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
3466
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse(
3467
+ ::cuda::ptx::kind_mxf8f6f4_t,
3468
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
3469
+ ::cuda::std::uint32_t __d_tmem,
3470
+ ::cuda::std::uint64_t __a_desc,
3471
+ ::cuda::std::uint64_t __b_desc,
3472
+ ::cuda::std::uint32_t __idesc,
3473
+ ::cuda::std::uint32_t __scale_A_tmem,
3474
+ ::cuda::std::uint32_t __scale_B_tmem,
3475
+ bool __enable_input_d)
3476
+ {
3477
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
3478
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3479
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
3480
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
3481
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
3482
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
3483
+ if constexpr (__cta_group == cta_group_1)
3484
+ {
3485
+ asm volatile(
3486
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3487
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3488
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3489
+ "[%5], PRED_enable_input_d;\n\t"
3490
+ "}"
3491
+ :
3492
+ : "r"(__d_tmem),
3493
+ "l"(__a_desc),
3494
+ "l"(__b_desc),
3495
+ "r"(__idesc),
3496
+ "r"(__scale_A_tmem),
3497
+ "r"(__scale_B_tmem),
3498
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3499
+ : "memory");
3500
+ }
3501
+ else if constexpr (__cta_group == cta_group_2)
3502
+ {
3503
+ asm volatile(
3504
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3505
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3506
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3507
+ "[%5], PRED_enable_input_d;\n\t"
3508
+ "}"
3509
+ :
3510
+ : "r"(__d_tmem),
3511
+ "l"(__a_desc),
3512
+ "l"(__b_desc),
3513
+ "r"(__idesc),
3514
+ "r"(__scale_A_tmem),
3515
+ "r"(__scale_B_tmem),
3516
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3517
+ : "memory");
3518
+ }
3519
+
3520
+ # else
3521
+ // Unsupported architectures will have a linker error with a semi-decent error message
3522
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3523
+ # endif
3524
+ }
3525
+ #endif // __cccl_ptx_isa >= 860
3526
+
3527
+ /*
3528
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc,
3529
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
3530
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
3531
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3532
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
3533
+ __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse(
3534
+ cuda::ptx::kind_t<Kind> kind,
3535
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3536
+ uint32_t d_tmem,
3537
+ uint64_t a_desc,
3538
+ uint64_t b_desc,
3539
+ uint32_t idesc,
3540
+ uint32_t scale_A_tmem,
3541
+ uint32_t scale_B_tmem,
3542
+ bool enable_input_d);
3543
+ */
3544
+ #if __cccl_ptx_isa >= 860
3545
+ extern "C" _CCCL_DEVICE void
3546
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3547
+ template <::cuda::ptx::dot_kind _Kind, ::cuda::ptx::dot_cta_group _Cta_Group>
3548
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse(
3549
+ ::cuda::ptx::kind_t<_Kind> __kind,
3550
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
3551
+ ::cuda::std::uint32_t __d_tmem,
3552
+ ::cuda::std::uint64_t __a_desc,
3553
+ ::cuda::std::uint64_t __b_desc,
3554
+ ::cuda::std::uint32_t __idesc,
3555
+ ::cuda::std::uint32_t __scale_A_tmem,
3556
+ ::cuda::std::uint32_t __scale_B_tmem,
3557
+ bool __enable_input_d)
3558
+ {
3559
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
3560
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3561
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
3562
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
3563
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
3564
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
3565
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
3566
+ {
3567
+ asm volatile(
3568
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3569
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3570
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3571
+ "[%5], PRED_enable_input_d;\n\t"
3572
+ "}"
3573
+ :
3574
+ : "r"(__d_tmem),
3575
+ "l"(__a_desc),
3576
+ "l"(__b_desc),
3577
+ "r"(__idesc),
3578
+ "r"(__scale_A_tmem),
3579
+ "r"(__scale_B_tmem),
3580
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3581
+ : "memory");
3582
+ }
3583
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
3584
+ {
3585
+ asm volatile(
3586
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3587
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3588
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3589
+ "[%5], PRED_enable_input_d;\n\t"
3590
+ "}"
3591
+ :
3592
+ : "r"(__d_tmem),
3593
+ "l"(__a_desc),
3594
+ "l"(__b_desc),
3595
+ "r"(__idesc),
3596
+ "r"(__scale_A_tmem),
3597
+ "r"(__scale_B_tmem),
3598
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3599
+ : "memory");
3600
+ }
3601
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
3602
+ {
3603
+ asm volatile(
3604
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3605
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3606
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3607
+ "[%5], PRED_enable_input_d;\n\t"
3608
+ "}"
3609
+ :
3610
+ : "r"(__d_tmem),
3611
+ "l"(__a_desc),
3612
+ "l"(__b_desc),
3613
+ "r"(__idesc),
3614
+ "r"(__scale_A_tmem),
3615
+ "r"(__scale_B_tmem),
3616
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3617
+ : "memory");
3618
+ }
3619
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
3620
+ {
3621
+ asm volatile(
3622
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3623
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3624
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3625
+ "[%5], PRED_enable_input_d;\n\t"
3626
+ "}"
3627
+ :
3628
+ : "r"(__d_tmem),
3629
+ "l"(__a_desc),
3630
+ "l"(__b_desc),
3631
+ "r"(__idesc),
3632
+ "r"(__scale_A_tmem),
3633
+ "r"(__scale_B_tmem),
3634
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3635
+ : "memory");
3636
+ }
3637
+
3638
+ # else
3639
+ // Unsupported architectures will have a linker error with a semi-decent error message
3640
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3641
+ # endif
3642
+ }
3643
+ #endif // __cccl_ptx_isa >= 860
3644
+
3645
+ /*
3646
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc,
3647
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
3648
+ // .kind = { .kind::mxf4nvf4 }
3649
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3650
+ template <cuda::ptx::dot_cta_group Cta_Group>
3651
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse(
3652
+ cuda::ptx::kind_mxf4nvf4_t,
3653
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3654
+ uint32_t d_tmem,
3655
+ uint64_t a_desc,
3656
+ uint64_t b_desc,
3657
+ uint32_t idesc,
3658
+ uint32_t scale_A_tmem,
3659
+ uint32_t scale_B_tmem,
3660
+ bool enable_input_d);
3661
+ */
3662
+ #if __cccl_ptx_isa >= 860
3663
+ extern "C" _CCCL_DEVICE void
3664
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3665
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
3666
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse(
3667
+ ::cuda::ptx::kind_mxf4nvf4_t,
3668
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
3669
+ ::cuda::std::uint32_t __d_tmem,
3670
+ ::cuda::std::uint64_t __a_desc,
3671
+ ::cuda::std::uint64_t __b_desc,
3672
+ ::cuda::std::uint32_t __idesc,
3673
+ ::cuda::std::uint32_t __scale_A_tmem,
3674
+ ::cuda::std::uint32_t __scale_B_tmem,
3675
+ bool __enable_input_d)
3676
+ {
3677
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
3678
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3679
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
3680
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
3681
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
3682
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
3683
+ if constexpr (__cta_group == cta_group_1)
3684
+ {
3685
+ asm volatile(
3686
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3687
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3688
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3689
+ "[%5], PRED_enable_input_d;\n\t"
3690
+ "}"
3691
+ :
3692
+ : "r"(__d_tmem),
3693
+ "l"(__a_desc),
3694
+ "l"(__b_desc),
3695
+ "r"(__idesc),
3696
+ "r"(__scale_A_tmem),
3697
+ "r"(__scale_B_tmem),
3698
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3699
+ : "memory");
3700
+ }
3701
+ else if constexpr (__cta_group == cta_group_2)
3702
+ {
3703
+ asm volatile(
3704
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3705
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3706
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], "
3707
+ "[%5], PRED_enable_input_d;\n\t"
3708
+ "}"
3709
+ :
3710
+ : "r"(__d_tmem),
3711
+ "l"(__a_desc),
3712
+ "l"(__b_desc),
3713
+ "r"(__idesc),
3714
+ "r"(__scale_A_tmem),
3715
+ "r"(__scale_B_tmem),
3716
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3717
+ : "memory");
3718
+ }
3719
+
3720
+ # else
3721
+ // Unsupported architectures will have a linker error with a semi-decent error message
3722
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3723
+ # endif
3724
+ }
3725
+ #endif // __cccl_ptx_isa >= 860
3726
+
3727
+ /*
3728
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc,
3729
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
3730
+ // .kind = { .kind::mxf8f6f4 }
3731
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3732
+ template <cuda::ptx::dot_cta_group Cta_Group>
3733
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_discard(
3734
+ cuda::ptx::kind_mxf8f6f4_t,
3735
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3736
+ uint32_t d_tmem,
3737
+ uint64_t a_desc,
3738
+ uint64_t b_desc,
3739
+ uint32_t idesc,
3740
+ uint32_t scale_A_tmem,
3741
+ uint32_t scale_B_tmem,
3742
+ bool enable_input_d);
3743
+ */
3744
+ #if __cccl_ptx_isa >= 860
3745
+ extern "C" _CCCL_DEVICE void
3746
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_discard_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3747
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
3748
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_discard(
3749
+ ::cuda::ptx::kind_mxf8f6f4_t,
3750
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
3751
+ ::cuda::std::uint32_t __d_tmem,
3752
+ ::cuda::std::uint64_t __a_desc,
3753
+ ::cuda::std::uint64_t __b_desc,
3754
+ ::cuda::std::uint32_t __idesc,
3755
+ ::cuda::std::uint32_t __scale_A_tmem,
3756
+ ::cuda::std::uint32_t __scale_B_tmem,
3757
+ bool __enable_input_d)
3758
+ {
3759
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
3760
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3761
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
3762
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
3763
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
3764
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
3765
+ if constexpr (__cta_group == cta_group_1)
3766
+ {
3767
+ asm volatile(
3768
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3769
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3770
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], "
3771
+ "[%5], PRED_enable_input_d;\n\t"
3772
+ "}"
3773
+ :
3774
+ : "r"(__d_tmem),
3775
+ "l"(__a_desc),
3776
+ "l"(__b_desc),
3777
+ "r"(__idesc),
3778
+ "r"(__scale_A_tmem),
3779
+ "r"(__scale_B_tmem),
3780
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3781
+ : "memory");
3782
+ }
3783
+ else if constexpr (__cta_group == cta_group_2)
3784
+ {
3785
+ asm volatile(
3786
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3787
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3788
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], "
3789
+ "[%5], PRED_enable_input_d;\n\t"
3790
+ "}"
3791
+ :
3792
+ : "r"(__d_tmem),
3793
+ "l"(__a_desc),
3794
+ "l"(__b_desc),
3795
+ "r"(__idesc),
3796
+ "r"(__scale_A_tmem),
3797
+ "r"(__scale_B_tmem),
3798
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3799
+ : "memory");
3800
+ }
3801
+
3802
+ # else
3803
+ // Unsupported architectures will have a linker error with a semi-decent error message
3804
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_discard_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3805
+ # endif
3806
+ }
3807
+ #endif // __cccl_ptx_isa >= 860
3808
+
3809
+ /*
3810
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc,
3811
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
3812
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
3813
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3814
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
3815
+ __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard(
3816
+ cuda::ptx::kind_t<Kind> kind,
3817
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3818
+ uint32_t d_tmem,
3819
+ uint64_t a_desc,
3820
+ uint64_t b_desc,
3821
+ uint32_t idesc,
3822
+ uint32_t scale_A_tmem,
3823
+ uint32_t scale_B_tmem,
3824
+ bool enable_input_d);
3825
+ */
3826
+ #if __cccl_ptx_isa >= 860
3827
+ extern "C" _CCCL_DEVICE void
3828
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_discard_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3829
+ template <::cuda::ptx::dot_kind _Kind, ::cuda::ptx::dot_cta_group _Cta_Group>
3830
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard(
3831
+ ::cuda::ptx::kind_t<_Kind> __kind,
3832
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
3833
+ ::cuda::std::uint32_t __d_tmem,
3834
+ ::cuda::std::uint64_t __a_desc,
3835
+ ::cuda::std::uint64_t __b_desc,
3836
+ ::cuda::std::uint32_t __idesc,
3837
+ ::cuda::std::uint32_t __scale_A_tmem,
3838
+ ::cuda::std::uint32_t __scale_B_tmem,
3839
+ bool __enable_input_d)
3840
+ {
3841
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
3842
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3843
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
3844
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
3845
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
3846
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
3847
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
3848
+ {
3849
+ asm volatile(
3850
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3851
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3852
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], "
3853
+ "[%5], PRED_enable_input_d;\n\t"
3854
+ "}"
3855
+ :
3856
+ : "r"(__d_tmem),
3857
+ "l"(__a_desc),
3858
+ "l"(__b_desc),
3859
+ "r"(__idesc),
3860
+ "r"(__scale_A_tmem),
3861
+ "r"(__scale_B_tmem),
3862
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3863
+ : "memory");
3864
+ }
3865
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
3866
+ {
3867
+ asm volatile(
3868
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3869
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3870
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], "
3871
+ "[%5], PRED_enable_input_d;\n\t"
3872
+ "}"
3873
+ :
3874
+ : "r"(__d_tmem),
3875
+ "l"(__a_desc),
3876
+ "l"(__b_desc),
3877
+ "r"(__idesc),
3878
+ "r"(__scale_A_tmem),
3879
+ "r"(__scale_B_tmem),
3880
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3881
+ : "memory");
3882
+ }
3883
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
3884
+ {
3885
+ asm volatile(
3886
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3887
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3888
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], "
3889
+ "[%5], PRED_enable_input_d;\n\t"
3890
+ "}"
3891
+ :
3892
+ : "r"(__d_tmem),
3893
+ "l"(__a_desc),
3894
+ "l"(__b_desc),
3895
+ "r"(__idesc),
3896
+ "r"(__scale_A_tmem),
3897
+ "r"(__scale_B_tmem),
3898
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3899
+ : "memory");
3900
+ }
3901
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
3902
+ {
3903
+ asm volatile(
3904
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3905
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3906
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], "
3907
+ "[%5], PRED_enable_input_d;\n\t"
3908
+ "}"
3909
+ :
3910
+ : "r"(__d_tmem),
3911
+ "l"(__a_desc),
3912
+ "l"(__b_desc),
3913
+ "r"(__idesc),
3914
+ "r"(__scale_A_tmem),
3915
+ "r"(__scale_B_tmem),
3916
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3917
+ : "memory");
3918
+ }
3919
+
3920
+ # else
3921
+ // Unsupported architectures will have a linker error with a semi-decent error message
3922
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_discard_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3923
+ # endif
3924
+ }
3925
+ #endif // __cccl_ptx_isa >= 860
3926
+
3927
+ /*
3928
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc,
3929
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
3930
+ // .kind = { .kind::mxf4nvf4 }
3931
+ // .cta_group = { .cta_group::1, .cta_group::2 }
3932
+ template <cuda::ptx::dot_cta_group Cta_Group>
3933
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_discard(
3934
+ cuda::ptx::kind_mxf4nvf4_t,
3935
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
3936
+ uint32_t d_tmem,
3937
+ uint64_t a_desc,
3938
+ uint64_t b_desc,
3939
+ uint32_t idesc,
3940
+ uint32_t scale_A_tmem,
3941
+ uint32_t scale_B_tmem,
3942
+ bool enable_input_d);
3943
+ */
3944
+ #if __cccl_ptx_isa >= 860
3945
+ extern "C" _CCCL_DEVICE void
3946
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_discard_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
3947
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
3948
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_discard(
3949
+ ::cuda::ptx::kind_mxf4nvf4_t,
3950
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
3951
+ ::cuda::std::uint32_t __d_tmem,
3952
+ ::cuda::std::uint64_t __a_desc,
3953
+ ::cuda::std::uint64_t __b_desc,
3954
+ ::cuda::std::uint32_t __idesc,
3955
+ ::cuda::std::uint32_t __scale_A_tmem,
3956
+ ::cuda::std::uint32_t __scale_B_tmem,
3957
+ bool __enable_input_d)
3958
+ {
3959
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
3960
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
3961
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
3962
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
3963
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
3964
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
3965
+ if constexpr (__cta_group == cta_group_1)
3966
+ {
3967
+ asm volatile(
3968
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3969
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3970
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], "
3971
+ "[%5], PRED_enable_input_d;\n\t"
3972
+ "}"
3973
+ :
3974
+ : "r"(__d_tmem),
3975
+ "l"(__a_desc),
3976
+ "l"(__b_desc),
3977
+ "r"(__idesc),
3978
+ "r"(__scale_A_tmem),
3979
+ "r"(__scale_B_tmem),
3980
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3981
+ : "memory");
3982
+ }
3983
+ else if constexpr (__cta_group == cta_group_2)
3984
+ {
3985
+ asm volatile(
3986
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
3987
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
3988
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], "
3989
+ "[%5], PRED_enable_input_d;\n\t"
3990
+ "}"
3991
+ :
3992
+ : "r"(__d_tmem),
3993
+ "l"(__a_desc),
3994
+ "l"(__b_desc),
3995
+ "r"(__idesc),
3996
+ "r"(__scale_A_tmem),
3997
+ "r"(__scale_B_tmem),
3998
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
3999
+ : "memory");
4000
+ }
4001
+
4002
+ # else
4003
+ // Unsupported architectures will have a linker error with a semi-decent error message
4004
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_discard_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
4005
+ # endif
4006
+ }
4007
+ #endif // __cccl_ptx_isa >= 860
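Finally, to show how the ::lastuse and ::discard entry points line up, a small compile-time dispatcher over the scale_vec::4X / kind::mxf4nvf4 pair; the semantics of the two collector::a qualifiers are defined by the PTX ISA, and this helper (its LastUse flag and placeholder operands included) is purely an illustrative assumption, not part of this header.

#include <cuda/ptx>
#include <cuda/std/cstdint>

// Select between the generated ::lastuse and ::discard wrappers at compile time.
template <bool LastUse>
__device__ void mma_4x_nvf4_sketch(
  ::cuda::std::uint32_t d_tmem,
  ::cuda::std::uint64_t a_desc,
  ::cuda::std::uint64_t b_desc,
  ::cuda::std::uint32_t idesc,
  ::cuda::std::uint32_t scale_A_tmem,
  ::cuda::std::uint32_t scale_B_tmem,
  bool enable_input_d)
{
  if constexpr (LastUse)
  {
    cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_lastuse(
      cuda::ptx::kind_mxf4nvf4, cuda::ptx::cta_group_1,
      d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem, enable_input_d);
  }
  else
  {
    cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_discard(
      cuda::ptx::kind_mxf4nvf4, cuda::ptx::cta_group_1,
      d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem, enable_input_d);
  }
}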
4008
+
4009
+ /*
4010
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc,
4011
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
4012
+ // .kind = { .kind::mxf8f6f4 }
4013
+ // .cta_group = { .cta_group::1, .cta_group::2 }
4014
+ template <cuda::ptx::dot_cta_group Cta_Group>
4015
+ __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard(
4016
+ cuda::ptx::kind_mxf8f6f4_t,
4017
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
4018
+ uint32_t d_tmem,
4019
+ uint64_t a_desc,
4020
+ uint64_t b_desc,
4021
+ uint32_t idesc,
4022
+ uint32_t scale_A_tmem,
4023
+ uint32_t scale_B_tmem,
4024
+ bool enable_input_d);
4025
+ */
4026
+ #if __cccl_ptx_isa >= 860
4027
+ extern "C" _CCCL_DEVICE void
4028
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
4029
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
4030
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard(
4031
+ ::cuda::ptx::kind_mxf8f6f4_t,
4032
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
4033
+ ::cuda::std::uint32_t __d_tmem,
4034
+ ::cuda::std::uint64_t __a_desc,
4035
+ ::cuda::std::uint64_t __b_desc,
4036
+ ::cuda::std::uint32_t __idesc,
4037
+ ::cuda::std::uint32_t __scale_A_tmem,
4038
+ ::cuda::std::uint32_t __scale_B_tmem,
4039
+ bool __enable_input_d)
4040
+ {
4041
+ // __kind == kind_mxf8f6f4 (due to parameter type constraint)
4042
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
4043
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
4044
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
4045
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
4046
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
4047
+ if constexpr (__cta_group == cta_group_1)
4048
+ {
4049
+ asm volatile(
4050
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
4051
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
4052
+ "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], "
4053
+ "[%5], PRED_enable_input_d;\n\t"
4054
+ "}"
4055
+ :
4056
+ : "r"(__d_tmem),
4057
+ "l"(__a_desc),
4058
+ "l"(__b_desc),
4059
+ "r"(__idesc),
4060
+ "r"(__scale_A_tmem),
4061
+ "r"(__scale_B_tmem),
4062
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
4063
+ : "memory");
4064
+ }
4065
+ else if constexpr (__cta_group == cta_group_2)
4066
+ {
4067
+ asm volatile(
4068
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
4069
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
4070
+ "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], "
4071
+ "[%5], PRED_enable_input_d;\n\t"
4072
+ "}"
4073
+ :
4074
+ : "r"(__d_tmem),
4075
+ "l"(__a_desc),
4076
+ "l"(__b_desc),
4077
+ "r"(__idesc),
4078
+ "r"(__scale_A_tmem),
4079
+ "r"(__scale_B_tmem),
4080
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
4081
+ : "memory");
4082
+ }
4083
+
4084
+ # else
4085
+ // Unsupported architectures will have a linker error with a semi-decent error message
4086
+ __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
4087
+ # endif
4088
+ }
4089
+ #endif // __cccl_ptx_isa >= 860
4090
+
4091
+ /*
4092
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc,
4093
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
4094
+ // .kind = { .kind::mxf4, .kind::mxf4nvf4 }
4095
+ // .cta_group = { .cta_group::1, .cta_group::2 }
4096
+ template <cuda::ptx::dot_kind Kind, cuda::ptx::dot_cta_group Cta_Group>
4097
+ __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard(
4098
+ cuda::ptx::kind_t<Kind> kind,
4099
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
4100
+ uint32_t d_tmem,
4101
+ uint64_t a_desc,
4102
+ uint64_t b_desc,
4103
+ uint32_t idesc,
4104
+ uint32_t scale_A_tmem,
4105
+ uint32_t scale_B_tmem,
4106
+ bool enable_input_d);
4107
+ */
4108
+ #if __cccl_ptx_isa >= 860
4109
+ extern "C" _CCCL_DEVICE void
4110
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
4111
+ template <::cuda::ptx::dot_kind _Kind, ::cuda::ptx::dot_cta_group _Cta_Group>
4112
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard(
4113
+ ::cuda::ptx::kind_t<_Kind> __kind,
4114
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
4115
+ ::cuda::std::uint32_t __d_tmem,
4116
+ ::cuda::std::uint64_t __a_desc,
4117
+ ::cuda::std::uint64_t __b_desc,
4118
+ ::cuda::std::uint32_t __idesc,
4119
+ ::cuda::std::uint32_t __scale_A_tmem,
4120
+ ::cuda::std::uint32_t __scale_B_tmem,
4121
+ bool __enable_input_d)
4122
+ {
4123
+ static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, "");
4124
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
4125
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
4126
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
4127
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
4128
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
4129
+ if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_1)
4130
+ {
4131
+ asm volatile(
4132
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
4133
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
4134
+ "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], "
4135
+ "[%5], PRED_enable_input_d;\n\t"
4136
+ "}"
4137
+ :
4138
+ : "r"(__d_tmem),
4139
+ "l"(__a_desc),
4140
+ "l"(__b_desc),
4141
+ "r"(__idesc),
4142
+ "r"(__scale_A_tmem),
4143
+ "r"(__scale_B_tmem),
4144
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
4145
+ : "memory");
4146
+ }
4147
+ else if constexpr (__kind == kind_mxf4 && __cta_group == cta_group_2)
4148
+ {
4149
+ asm volatile(
4150
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
4151
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
4152
+ "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], "
4153
+ "[%5], PRED_enable_input_d;\n\t"
4154
+ "}"
4155
+ :
4156
+ : "r"(__d_tmem),
4157
+ "l"(__a_desc),
4158
+ "l"(__b_desc),
4159
+ "r"(__idesc),
4160
+ "r"(__scale_A_tmem),
4161
+ "r"(__scale_B_tmem),
4162
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
4163
+ : "memory");
4164
+ }
4165
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1)
4166
+ {
4167
+ asm volatile(
4168
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
4169
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
4170
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], "
4171
+ "[%5], PRED_enable_input_d;\n\t"
4172
+ "}"
4173
+ :
4174
+ : "r"(__d_tmem),
4175
+ "l"(__a_desc),
4176
+ "l"(__b_desc),
4177
+ "r"(__idesc),
4178
+ "r"(__scale_A_tmem),
4179
+ "r"(__scale_B_tmem),
4180
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
4181
+ : "memory");
4182
+ }
4183
+ else if constexpr (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2)
4184
+ {
4185
+ asm volatile(
4186
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
4187
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
4188
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], "
4189
+ "[%5], PRED_enable_input_d;\n\t"
4190
+ "}"
4191
+ :
4192
+ : "r"(__d_tmem),
4193
+ "l"(__a_desc),
4194
+ "l"(__b_desc),
4195
+ "r"(__idesc),
4196
+ "r"(__scale_A_tmem),
4197
+ "r"(__scale_B_tmem),
4198
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
4199
+ : "memory");
4200
+ }
4201
+
4202
+ # else
4203
+ // Unsupported architectures will have a linker error with a semi-decent error message
4204
+ __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
4205
+ # endif
4206
+ }
4207
+ #endif // __cccl_ptx_isa >= 860
4208
+
4209
+ /*
4210
+ // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc,
4211
+ [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_103a, SM_110a
4212
+ // .kind = { .kind::mxf4nvf4 }
4213
+ // .cta_group = { .cta_group::1, .cta_group::2 }
4214
+ template <cuda::ptx::dot_cta_group Cta_Group>
4215
+ __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard(
4216
+ cuda::ptx::kind_mxf4nvf4_t,
4217
+ cuda::ptx::cta_group_t<Cta_Group> cta_group,
4218
+ uint32_t d_tmem,
4219
+ uint64_t a_desc,
4220
+ uint64_t b_desc,
4221
+ uint32_t idesc,
4222
+ uint32_t scale_A_tmem,
4223
+ uint32_t scale_B_tmem,
4224
+ bool enable_input_d);
4225
+ */
4226
+ #if __cccl_ptx_isa >= 860
4227
+ extern "C" _CCCL_DEVICE void
4228
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
4229
+ template <::cuda::ptx::dot_cta_group _Cta_Group>
4230
+ _CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard(
4231
+ ::cuda::ptx::kind_mxf4nvf4_t,
4232
+ ::cuda::ptx::cta_group_t<_Cta_Group> __cta_group,
4233
+ ::cuda::std::uint32_t __d_tmem,
4234
+ ::cuda::std::uint64_t __a_desc,
4235
+ ::cuda::std::uint64_t __b_desc,
4236
+ ::cuda::std::uint32_t __idesc,
4237
+ ::cuda::std::uint32_t __scale_A_tmem,
4238
+ ::cuda::std::uint32_t __scale_B_tmem,
4239
+ bool __enable_input_d)
4240
+ {
4241
+ // __kind == kind_mxf4nvf4 (due to parameter type constraint)
4242
+ static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
4243
+ # if _CCCL_CUDA_COMPILER(NVHPC) \
4244
+ || (defined(__CUDA_ARCH_FEAT_SM100_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1000))) \
4245
+ || (defined(__CUDA_ARCH_FEAT_SM103_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1030))) \
4246
+ || (defined(__CUDA_ARCH_FEAT_SM110_ALL) || (defined(__CUDA_ARCH_SPECIFIC__) && (__CUDA_ARCH_SPECIFIC__ == 1100)))
4247
+ if constexpr (__cta_group == cta_group_1)
4248
+ {
4249
+ asm volatile(
4250
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
4251
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
4252
+ "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], "
4253
+ "[%5], PRED_enable_input_d;\n\t"
4254
+ "}"
4255
+ :
4256
+ : "r"(__d_tmem),
4257
+ "l"(__a_desc),
4258
+ "l"(__b_desc),
4259
+ "r"(__idesc),
4260
+ "r"(__scale_A_tmem),
4261
+ "r"(__scale_B_tmem),
4262
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
4263
+ : "memory");
4264
+ }
4265
+ else if constexpr (__cta_group == cta_group_2)
4266
+ {
4267
+ asm volatile(
4268
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t"
4269
+ "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
4270
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], "
4271
+ "[%5], PRED_enable_input_d;\n\t"
4272
+ "}"
4273
+ :
4274
+ : "r"(__d_tmem),
4275
+ "l"(__a_desc),
4276
+ "l"(__b_desc),
4277
+ "r"(__idesc),
4278
+ "r"(__scale_A_tmem),
4279
+ "r"(__scale_B_tmem),
4280
+ "r"(static_cast<::cuda::std::uint32_t>(__enable_input_d))
4281
+ : "memory");
4282
+ }
4283
+
4284
+ # else
4285
+ // Unsupported architectures will have a linker error with a semi-decent error message
4286
+ __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard_is_only_supported_on_SM_100a_103a_110a_depending_on_the_variant__();
4287
+ # endif
4288
+ }
4289
+ #endif // __cccl_ptx_isa >= 860
4290
+
4291
+ #endif // _CUDA_PTX_GENERATED_TCGEN05_MMA_H_