cuda-cccl 0.1.3.1.0.dev1678__cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (1860)
  1. cuda/cccl/__init__.py +14 -0
  2. cuda/cccl/cooperative/__init__.py +3 -0
  3. cuda/cccl/cooperative/experimental/__init__.py +8 -0
  4. cuda/cccl/cooperative/experimental/_caching.py +48 -0
  5. cuda/cccl/cooperative/experimental/_common.py +273 -0
  6. cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
  7. cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
  8. cuda/cccl/cooperative/experimental/_types.py +935 -0
  9. cuda/cccl/cooperative/experimental/_typing.py +107 -0
  10. cuda/cccl/cooperative/experimental/block/__init__.py +33 -0
  11. cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
  12. cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
  13. cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
  14. cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
  15. cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
  16. cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
  17. cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +98 -0
  18. cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
  19. cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
  20. cuda/cccl/headers/__init__.py +7 -0
  21. cuda/cccl/headers/include/__init__.py +1 -0
  22. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +261 -0
  23. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
  24. cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
  25. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +925 -0
  26. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +227 -0
  27. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +753 -0
  28. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
  29. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
  30. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
  31. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
  32. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +678 -0
  33. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
  34. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +997 -0
  35. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
  36. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
  37. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
  38. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1032 -0
  39. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +342 -0
  40. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
  41. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
  42. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1346 -0
  43. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
  44. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
  45. cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
  46. cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
  47. cuda/cccl/headers/include/cub/block/block_load.cuh +1259 -0
  48. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
  49. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
  50. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
  51. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
  52. cuda/cccl/headers/include/cub/block/block_reduce.cuh +629 -0
  53. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
  54. cuda/cccl/headers/include/cub/block/block_scan.cuh +2583 -0
  55. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
  56. cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
  57. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
  58. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
  59. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
  60. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
  61. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
  62. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +259 -0
  63. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
  64. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
  65. cuda/cccl/headers/include/cub/config.cuh +60 -0
  66. cuda/cccl/headers/include/cub/cub.cuh +112 -0
  67. cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
  68. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
  69. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
  70. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
  71. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
  72. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +120 -0
  73. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +74 -0
  74. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
  75. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
  76. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
  77. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
  78. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
  79. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
  80. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
  81. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
  82. cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
  83. cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
  84. cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
  85. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
  86. cuda/cccl/headers/include/cub/detail/type_traits.cuh +206 -0
  87. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
  88. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
  89. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
  90. cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
  91. cuda/cccl/headers/include/cub/device/device_for.cuh +990 -0
  92. cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
  93. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
  94. cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
  95. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
  96. cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
  97. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3435 -0
  98. cuda/cccl/headers/include/cub/device/device_reduce.cuh +1815 -0
  99. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +368 -0
  100. cuda/cccl/headers/include/cub/device/device_scan.cuh +1901 -0
  101. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
  102. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
  103. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
  104. cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
  105. cuda/cccl/headers/include/cub/device/device_transform.cuh +313 -0
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
  107. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
  108. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
  109. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
  110. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
  111. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1051 -0
  112. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
  113. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
  114. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1748 -0
  115. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
  116. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +625 -0
  117. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +497 -0
  118. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +548 -0
  119. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
  120. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
  121. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +1374 -0
  122. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
  123. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +439 -0
  125. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +552 -0
  126. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +467 -0
  127. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
  128. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
  129. cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
  130. cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +338 -0
  131. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
  132. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +525 -0
  133. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
  134. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
  135. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +936 -0
  136. cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
  137. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
  138. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
  139. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
  140. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
  141. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
  142. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
  143. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
  144. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +397 -0
  145. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
  146. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
  147. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +555 -0
  148. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
  149. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
  150. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
  151. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
  152. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +353 -0
  153. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
  154. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +215 -0
  155. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
  156. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
  157. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
  158. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +238 -0
  159. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
  160. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
  161. cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
  162. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +629 -0
  163. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +504 -0
  164. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +340 -0
  165. cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
  166. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +406 -0
  167. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
  168. cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
  169. cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
  170. cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
  171. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
  172. cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
  173. cuda/cccl/headers/include/cub/util_device.cuh +779 -0
  174. cuda/cccl/headers/include/cub/util_macro.cuh +91 -0
  175. cuda/cccl/headers/include/cub/util_math.cuh +115 -0
  176. cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
  177. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
  178. cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
  179. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
  180. cuda/cccl/headers/include/cub/util_type.cuh +1136 -0
  181. cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
  182. cuda/cccl/headers/include/cub/version.cuh +89 -0
  183. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
  184. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
  185. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
  186. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
  187. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +688 -0
  188. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +437 -0
  189. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
  190. cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
  191. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
  192. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
  193. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1156 -0
  194. cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
  195. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
  196. cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
  197. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +210 -0
  198. cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
  199. cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +84 -0
  200. cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +127 -0
  201. cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +209 -0
  202. cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
  203. cuda/cccl/headers/include/cuda/__barrier/aligned_size.h +61 -0
  204. cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
  205. cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
  206. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +100 -0
  207. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +454 -0
  208. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +72 -0
  209. cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
  210. cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
  211. cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
  212. cuda/cccl/headers/include/cuda/__bit/bitfield.h +121 -0
  213. cuda/cccl/headers/include/cuda/__bit/bitmask.h +88 -0
  214. cuda/cccl/headers/include/cuda/__cccl_config +36 -0
  215. cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
  216. cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
  217. cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
  218. cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
  219. cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
  220. cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
  221. cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
  222. cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
  223. cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
  224. cuda/cccl/headers/include/cuda/__execution/determinism.h +90 -0
  225. cuda/cccl/headers/include/cuda/__execution/require.h +74 -0
  226. cuda/cccl/headers/include/cuda/__execution/tune.h +69 -0
  227. cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
  228. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +276 -0
  229. cuda/cccl/headers/include/cuda/__functional/get_device_address.h +58 -0
  230. cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
  231. cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
  232. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
  233. cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
  234. cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
  235. cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
  236. cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
  237. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +257 -0
  238. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +460 -0
  239. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
  240. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +421 -0
  241. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +321 -0
  242. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +333 -0
  243. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +465 -0
  244. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +456 -0
  245. cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
  246. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +462 -0
  247. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
  248. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +122 -0
  249. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
  250. cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
  251. cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
  252. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
  253. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
  254. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +98 -0
  255. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +162 -0
  256. cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +49 -0
  257. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
  258. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
  259. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +99 -0
  260. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
  261. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
  262. cuda/cccl/headers/include/cuda/__memory/address_space.h +86 -0
  263. cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
  264. cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
  265. cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
  266. cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
  267. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +94 -0
  268. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +157 -0
  269. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +73 -0
  270. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +129 -0
  271. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +653 -0
  272. cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
  273. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +57 -0
  274. cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
  275. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +101 -0
  276. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
  277. cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
  278. cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
  279. cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
  280. cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
  281. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
  282. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
  283. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
  284. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
  285. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
  286. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
  287. cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
  288. cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
  289. cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
  290. cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
  291. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
  292. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
  293. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
  294. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
  295. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +193 -0
  296. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
  297. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
  298. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +957 -0
  299. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +288 -0
  300. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +596 -0
  301. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
  302. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
  303. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
  304. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1445 -0
  305. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
  306. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +117 -0
  307. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
  308. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
  309. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
  310. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
  311. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
  312. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
  313. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
  314. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +62 -0
  315. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +101 -0
  316. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +62 -0
  317. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
  318. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
  319. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +15074 -0
  320. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +385 -0
  321. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +176 -0
  322. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
  323. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +94 -0
  324. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
  325. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +137 -0
  326. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +138 -0
  327. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +280 -0
  328. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +282 -0
  329. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2148 -0
  330. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1272 -0
  331. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +228 -0
  332. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
  333. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +430 -0
  334. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
  335. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
  336. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1830 -0
  337. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
  338. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
  339. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +105 -0
  340. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +81 -0
  341. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +612 -0
  342. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +44 -0
  343. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4446 -0
  344. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4061 -0
  345. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6438 -0
  346. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +36 -0
  347. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4582 -0
  348. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +44 -0
  349. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +67 -0
  350. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +750 -0
  351. cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
  352. cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
  353. cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
  354. cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
  355. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
  356. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
  357. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
  358. cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
  359. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
  360. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
  361. cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
  362. cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
  363. cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
  364. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
  365. cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
  366. cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
  367. cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
  368. cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
  369. cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
  370. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
  371. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
  372. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
  373. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
  374. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
  375. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
  376. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
  377. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
  378. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
  379. cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
  380. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
  381. cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
  382. cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
  383. cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
  384. cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +151 -0
  385. cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
  386. cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
  387. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +163 -0
  388. cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
  389. cuda/cccl/headers/include/cuda/__utility/static_for.h +74 -0
  390. cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
  391. cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
  392. cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +249 -0
  393. cuda/cccl/headers/include/cuda/access_property +26 -0
  394. cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
  395. cuda/cccl/headers/include/cuda/atomic +27 -0
  396. cuda/cccl/headers/include/cuda/barrier +262 -0
  397. cuda/cccl/headers/include/cuda/bit +29 -0
  398. cuda/cccl/headers/include/cuda/cmath +35 -0
  399. cuda/cccl/headers/include/cuda/discard_memory +60 -0
  400. cuda/cccl/headers/include/cuda/functional +31 -0
  401. cuda/cccl/headers/include/cuda/iterator +34 -0
  402. cuda/cccl/headers/include/cuda/latch +27 -0
  403. cuda/cccl/headers/include/cuda/mdspan +28 -0
  404. cuda/cccl/headers/include/cuda/memory +32 -0
  405. cuda/cccl/headers/include/cuda/memory_resource +41 -0
  406. cuda/cccl/headers/include/cuda/numeric +28 -0
  407. cuda/cccl/headers/include/cuda/pipeline +577 -0
  408. cuda/cccl/headers/include/cuda/ptx +124 -0
  409. cuda/cccl/headers/include/cuda/semaphore +31 -0
  410. cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
  411. cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
  412. cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
  413. cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +52 -0
  414. cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
  415. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
  416. cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
  417. cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
  418. cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
  419. cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
  420. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
  421. cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
  422. cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
  423. cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
  424. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
  425. cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
  426. cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
  427. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
  428. cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
  429. cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
  430. cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
  431. cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
  432. cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
  433. cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
  434. cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
  435. cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
  436. cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
  437. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
  438. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
  439. cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
  440. cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
  441. cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
  442. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
  443. cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
  444. cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
  445. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
  446. cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
  447. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
  448. cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
  449. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
  450. cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
  451. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +68 -0
  452. cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
  453. cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
  454. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
  455. cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
  456. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +140 -0
  457. cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
  458. cuda/cccl/headers/include/cuda/std/__algorithm/move.h +87 -0
  459. cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
  460. cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
  461. cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
  462. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
  463. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
  464. cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
  465. cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
  466. cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
  467. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +94 -0
  468. cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
  469. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +101 -0
  470. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
  471. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
  472. cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
  473. cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
  474. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
  475. cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
  476. cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
  477. cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
  478. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
  479. cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
  480. cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
  481. cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
  482. cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
  483. cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
  484. cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
  485. cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
  486. cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
  487. cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
  488. cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
  489. cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
  490. cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
  491. cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
  492. cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
  493. cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
  494. cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
  495. cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
  496. cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
  497. cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
  498. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
  499. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
  500. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
  501. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
  502. cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
  503. cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
  504. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
  505. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
  506. cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
  507. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +218 -0
  508. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
  509. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
  510. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
  511. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
  512. cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
  513. cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
  514. cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
  515. cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
  516. cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
  517. cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +250 -0
  518. cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +105 -0
  519. cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
  520. cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +73 -0
  521. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
  522. cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
  523. cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
  524. cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
  525. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
  526. cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
  527. cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
  528. cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +77 -0
  529. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +183 -0
  530. cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
  531. cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
  532. cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
  533. cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
  534. cuda/cccl/headers/include/cuda/std/__bit/integral.h +124 -0
  535. cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
  536. cuda/cccl/headers/include/cuda/std/__bit/reference.h +1270 -0
  537. cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
  538. cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
  539. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +150 -0
  540. cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +207 -0
  541. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +787 -0
  542. cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
  543. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +43 -0
  544. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
  545. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
  546. cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +128 -0
  547. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +126 -0
  548. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +326 -0
  549. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +79 -0
  550. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
  551. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +133 -0
  552. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
  553. cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
  554. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1276 -0
  555. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +267 -0
  556. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +176 -0
  557. cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
  558. cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
  559. cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
  560. cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
  561. cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
  562. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +115 -0
  563. cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
  564. cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
  565. cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
  566. cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
  567. cuda/cccl/headers/include/cuda/std/__charconv_ +30 -0
  568. cuda/cccl/headers/include/cuda/std/__cmath/abs.h +246 -0
  569. cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +193 -0
  570. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +724 -0
  571. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
  572. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +216 -0
  573. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
  574. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
  575. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +224 -0
  576. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
  577. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
  578. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
  579. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
  580. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +180 -0
  581. cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
  582. cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +104 -0
  583. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +582 -0
  584. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +248 -0
  585. cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
  586. cuda/cccl/headers/include/cuda/std/__cmath/nvbf16.h +58 -0
  587. cuda/cccl/headers/include/cuda/std/__cmath/nvfp16.h +58 -0
  588. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
  589. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
  590. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
  591. cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +155 -0
  592. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +170 -0
  593. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
  594. cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
  595. cuda/cccl/headers/include/cuda/std/__complex/complex.h +676 -0
  596. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +388 -0
  597. cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
  598. cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +215 -0
  599. cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
  600. cuda/cccl/headers/include/cuda/std/__complex/literals.h +106 -0
  601. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +53 -0
  602. cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
  603. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
  604. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
  605. cuda/cccl/headers/include/cuda/std/__complex/roots.h +64 -0
  606. cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
  607. cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
  608. cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +131 -0
  609. cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
  610. cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
  611. cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
  612. cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +46 -0
  613. cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
  614. cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
  615. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +273 -0
  616. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
  617. cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +71 -0
  618. cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
  619. cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +57 -0
  620. cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
  621. cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
  622. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
  623. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
  624. cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
  625. cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
  626. cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
  627. cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
  628. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
  629. cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
  630. cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
  631. cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
  632. cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
  633. cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
  634. cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
  635. cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
  636. cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
  637. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
  638. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
  639. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
  640. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +142 -0
  641. cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
  642. cuda/cccl/headers/include/cuda/std/__execution/env.h +436 -0
  643. cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
  644. cuda/cccl/headers/include/cuda/std/__expected/expected.h +2001 -0
  645. cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1080 -0
  646. cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
  647. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +175 -0
  648. cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
  649. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
  650. cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
  651. cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
  652. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +172 -0
  653. cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +103 -0
  654. cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
  655. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
  656. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +64 -0
  657. cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
  658. cuda/cccl/headers/include/cuda/std/__floating_point/nvfp_types.h +58 -0
  659. cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
  660. cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
  661. cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
  662. cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
  663. cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
  664. cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +128 -0
  665. cuda/cccl/headers/include/cuda/std/__format_ +28 -0
  666. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
  667. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
  668. cuda/cccl/headers/include/cuda/std/__functional/bind.h +337 -0
  669. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
  670. cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
  671. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
  672. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
  673. cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
  674. cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
  675. cuda/cccl/headers/include/cuda/std/__functional/function.h +1277 -0
  676. cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
  677. cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
  678. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +558 -0
  679. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
  680. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
  681. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +213 -0
  682. cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
  683. cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
  684. cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +127 -0
  685. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
  686. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
  687. cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
  688. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
  689. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
  690. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
  691. cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
  692. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +277 -0
  693. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
  694. cuda/cccl/headers/include/cuda/std/__fwd/array.h +36 -0
  695. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
  696. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
  697. cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
  698. cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
  699. cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
  700. cuda/cccl/headers/include/cuda/std/__fwd/iterator_traits.h +40 -0
  701. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +73 -0
  702. cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
  703. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
  704. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
  705. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
  706. cuda/cccl/headers/include/cuda/std/__fwd/span.h +38 -0
  707. cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
  708. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
  709. cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
  710. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
  711. cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
  712. cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
  713. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +102 -0
  714. cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
  715. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
  716. cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +100 -0
  717. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +254 -0
  718. cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
  719. cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
  720. cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
  721. cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
  722. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
  723. cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
  724. cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
  725. cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +95 -0
  726. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +152 -0
  727. cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
  728. cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +102 -0
  729. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +140 -0
  730. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +160 -0
  731. cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
  732. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
  733. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
  734. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +932 -0
  735. cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
  736. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +400 -0
  737. cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
  738. cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
  739. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +98 -0
  740. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
  741. cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
  742. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +91 -0
  743. cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
  744. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
  745. cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
  746. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +605 -0
  747. cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
  748. cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
  749. cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
  750. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +247 -0
  751. cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
  752. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
  753. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +781 -0
  754. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
  755. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
  756. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
  757. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
  758. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +322 -0
  759. cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +98 -0
  760. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
  761. cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
  762. cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +358 -0
  763. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +757 -0
  764. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +315 -0
  765. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +308 -0
  766. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
  767. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +507 -0
  768. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
  769. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
  770. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
  771. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
  772. cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
  773. cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
  774. cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
  775. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
  776. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
  777. cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
  778. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +555 -0
  779. cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
  780. cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
  781. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +230 -0
  782. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
  783. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
  784. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
  785. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +261 -0
  786. cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
  787. cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
  788. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +683 -0
  789. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +768 -0
  790. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +55 -0
  791. cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
  792. cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
  793. cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
  794. cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
  795. cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
  796. cuda/cccl/headers/include/cuda/std/__new_ +29 -0
  797. cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
  798. cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
  799. cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
  800. cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +80 -0
  801. cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
  802. cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
  803. cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
  804. cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +100 -0
  805. cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
  806. cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
  807. cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
  808. cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
  809. cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
  810. cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +75 -0
  811. cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
  812. cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
  813. cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
  814. cuda/cccl/headers/include/cuda/std/__optional/optional.h +900 -0
  815. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +430 -0
  816. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
  817. cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
  818. cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
  819. cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
  820. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +397 -0
  821. cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
  822. cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
  823. cuda/cccl/headers/include/cuda/std/__random_ +29 -0
  824. cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
  825. cuda/cccl/headers/include/cuda/std/__ranges/all.h +97 -0
  826. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +313 -0
  827. cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
  828. cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
  829. cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
  830. cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
  831. cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
  832. cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
  833. cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +77 -0
  834. cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
  835. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
  836. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
  837. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
  838. cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +113 -0
  839. cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +174 -0
  840. cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
  841. cuda/cccl/headers/include/cuda/std/__ranges/rend.h +181 -0
  842. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
  843. cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
  844. cuda/cccl/headers/include/cuda/std/__ranges/size.h +199 -0
  845. cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
  846. cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +475 -0
  847. cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
  848. cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
  849. cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
  850. cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +181 -0
  851. cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
  852. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
  853. cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
  854. cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
  855. cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
  856. cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
  857. cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
  858. cuda/cccl/headers/include/cuda/std/__string_ +29 -0
  859. cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
  860. cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
  861. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
  862. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
  863. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
  864. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +142 -0
  865. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
  866. cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
  867. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
  868. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +269 -0
  869. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +216 -0
  870. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
  871. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
  872. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
  873. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
  874. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
  875. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
  876. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +277 -0
  877. cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
  878. cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
  879. cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
  880. cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
  881. cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
  882. cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
  883. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
  884. cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
  885. cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
  886. cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
  887. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
  888. cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
  889. cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +174 -0
  890. cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
  891. cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
  892. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
  893. cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
  894. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
  895. cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
  896. cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
  897. cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
  898. cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
  899. cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
  900. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
  901. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
  902. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
  903. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
  904. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
  905. cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
  906. cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
  907. cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
  908. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
  909. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
  910. cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
  911. cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
  912. cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
  913. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
  914. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
  915. cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
  916. cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
  917. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
  918. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
  919. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
  920. cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
  921. cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
  922. cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
  923. cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
  924. cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
  925. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
  926. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
  927. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
  928. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
  929. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
  930. cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
  931. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
  932. cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
  933. cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
  934. cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
  935. cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
  936. cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
  937. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
  938. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
  939. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
  940. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
  941. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
  942. cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
  943. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
  944. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
  945. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
  946. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
  947. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
  948. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
  949. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
  950. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
  951. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
  952. cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
  953. cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
  954. cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
  955. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
  956. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
  957. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +87 -0
  958. cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
  959. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
  960. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
  961. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
  962. cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
  963. cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
  964. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
  965. cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
  966. cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
  967. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
  968. cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
  969. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
  970. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +201 -0
  971. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
  972. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
  973. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
  974. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
  975. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
  976. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
  977. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
  978. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
  979. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
  980. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
  981. cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
  982. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
  983. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
  984. cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
  985. cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
  986. cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
  987. cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
  988. cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
  989. cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
  990. cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
  991. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
  992. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
  993. cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
  994. cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
  995. cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
  996. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
  997. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
  998. cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
  999. cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
  1000. cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
  1001. cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
  1002. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
  1003. cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
  1004. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
  1005. cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
  1006. cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
  1007. cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
  1008. cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
  1009. cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
  1010. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
  1011. cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
  1012. cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
  1013. cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +132 -0
  1014. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
  1015. cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
  1016. cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
  1017. cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +32 -0
  1018. cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
  1019. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
  1020. cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
  1021. cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
  1022. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
  1023. cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
  1024. cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
  1025. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
  1026. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
  1027. cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
  1028. cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
  1029. cuda/cccl/headers/include/cuda/std/__utility/pair.h +802 -0
  1030. cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
  1031. cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +510 -0
  1032. cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
  1033. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
  1034. cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
  1035. cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
  1036. cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
  1037. cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
  1038. cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
  1039. cuda/cccl/headers/include/cuda/std/array +520 -0
  1040. cuda/cccl/headers/include/cuda/std/atomic +818 -0
  1041. cuda/cccl/headers/include/cuda/std/barrier +43 -0
  1042. cuda/cccl/headers/include/cuda/std/bit +35 -0
  1043. cuda/cccl/headers/include/cuda/std/bitset +994 -0
  1044. cuda/cccl/headers/include/cuda/std/cassert +28 -0
  1045. cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
  1046. cuda/cccl/headers/include/cuda/std/cfloat +59 -0
  1047. cuda/cccl/headers/include/cuda/std/chrono +26 -0
  1048. cuda/cccl/headers/include/cuda/std/climits +61 -0
  1049. cuda/cccl/headers/include/cuda/std/cmath +25 -0
  1050. cuda/cccl/headers/include/cuda/std/complex +50 -0
  1051. cuda/cccl/headers/include/cuda/std/concepts +48 -0
  1052. cuda/cccl/headers/include/cuda/std/cstddef +28 -0
  1053. cuda/cccl/headers/include/cuda/std/cstdint +178 -0
  1054. cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
  1055. cuda/cccl/headers/include/cuda/std/cstring +110 -0
  1056. cuda/cccl/headers/include/cuda/std/ctime +152 -0
  1057. cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
  1058. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +235 -0
  1059. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1720 -0
  1060. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3628 -0
  1061. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +667 -0
  1062. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
  1063. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
  1064. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1367 -0
  1065. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2154 -0
  1066. cuda/cccl/headers/include/cuda/std/execution +27 -0
  1067. cuda/cccl/headers/include/cuda/std/expected +30 -0
  1068. cuda/cccl/headers/include/cuda/std/functional +56 -0
  1069. cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
  1070. cuda/cccl/headers/include/cuda/std/inplace_vector +2163 -0
  1071. cuda/cccl/headers/include/cuda/std/iterator +70 -0
  1072. cuda/cccl/headers/include/cuda/std/latch +34 -0
  1073. cuda/cccl/headers/include/cuda/std/limits +28 -0
  1074. cuda/cccl/headers/include/cuda/std/linalg +30 -0
  1075. cuda/cccl/headers/include/cuda/std/mdspan +38 -0
  1076. cuda/cccl/headers/include/cuda/std/memory +39 -0
  1077. cuda/cccl/headers/include/cuda/std/numbers +335 -0
  1078. cuda/cccl/headers/include/cuda/std/numeric +41 -0
  1079. cuda/cccl/headers/include/cuda/std/optional +31 -0
  1080. cuda/cccl/headers/include/cuda/std/ranges +69 -0
  1081. cuda/cccl/headers/include/cuda/std/ratio +417 -0
  1082. cuda/cccl/headers/include/cuda/std/semaphore +31 -0
  1083. cuda/cccl/headers/include/cuda/std/source_location +83 -0
  1084. cuda/cccl/headers/include/cuda/std/span +640 -0
  1085. cuda/cccl/headers/include/cuda/std/string_view +788 -0
  1086. cuda/cccl/headers/include/cuda/std/tuple +26 -0
  1087. cuda/cccl/headers/include/cuda/std/type_traits +176 -0
  1088. cuda/cccl/headers/include/cuda/std/utility +70 -0
  1089. cuda/cccl/headers/include/cuda/std/variant +25 -0
  1090. cuda/cccl/headers/include/cuda/std/version +245 -0
  1091. cuda/cccl/headers/include/cuda/stream_ref +54 -0
  1092. cuda/cccl/headers/include/cuda/type_traits +27 -0
  1093. cuda/cccl/headers/include/cuda/utility +27 -0
  1094. cuda/cccl/headers/include/cuda/version +16 -0
  1095. cuda/cccl/headers/include/cuda/warp +28 -0
  1096. cuda/cccl/headers/include/cuda/work_stealing +26 -0
  1097. cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
  1098. cuda/cccl/headers/include/nv/detail/__target_macros +641 -0
  1099. cuda/cccl/headers/include/nv/target +240 -0
  1100. cuda/cccl/headers/include/thrust/addressof.h +22 -0
  1101. cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
  1102. cuda/cccl/headers/include/thrust/advance.h +59 -0
  1103. cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
  1104. cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
  1105. cuda/cccl/headers/include/thrust/complex.h +859 -0
  1106. cuda/cccl/headers/include/thrust/copy.h +506 -0
  1107. cuda/cccl/headers/include/thrust/count.h +245 -0
  1108. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
  1109. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
  1110. cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
  1111. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
  1112. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
  1113. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
  1114. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
  1115. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
  1116. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
  1117. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
  1118. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
  1119. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
  1120. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
  1121. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
  1122. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
  1123. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
  1124. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
  1125. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +81 -0
  1126. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
  1127. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
  1128. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
  1129. cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
  1130. cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
  1131. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
  1132. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
  1133. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
  1134. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
  1135. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
  1136. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
  1137. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
  1138. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
  1139. cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
  1140. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
  1141. cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
  1142. cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
  1143. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
  1144. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
  1145. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
  1146. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
  1147. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
  1148. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
  1149. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
  1150. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
  1151. cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
  1152. cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
  1153. cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
  1154. cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
  1155. cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
  1156. cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
  1157. cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
  1158. cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
  1159. cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
  1160. cuda/cccl/headers/include/thrust/detail/config.h +36 -0
  1161. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
  1162. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
  1163. cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
  1164. cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
  1165. cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
  1166. cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
  1167. cuda/cccl/headers/include/thrust/detail/count.h +55 -0
  1168. cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
  1169. cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
  1170. cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
  1171. cuda/cccl/headers/include/thrust/detail/device_malloc.inl +60 -0
  1172. cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
  1173. cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
  1174. cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
  1175. cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
  1176. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
  1177. cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
  1178. cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
  1179. cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
  1180. cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
  1181. cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
  1182. cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
  1183. cuda/cccl/headers/include/thrust/detail/function.h +49 -0
  1184. cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
  1185. cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
  1186. cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
  1187. cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
  1188. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
  1189. cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
  1190. cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
  1191. cuda/cccl/headers/include/thrust/detail/internal_functional.h +289 -0
  1192. cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
  1193. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
  1194. cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
  1195. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
  1196. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
  1197. cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
  1198. cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
  1199. cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
  1200. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
  1201. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
  1202. cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
  1203. cuda/cccl/headers/include/thrust/detail/pointer.h +217 -0
  1204. cuda/cccl/headers/include/thrust/detail/pointer.inl +172 -0
  1205. cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
  1206. cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
  1207. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
  1208. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
  1209. cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
  1210. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +189 -0
  1211. cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
  1212. cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
  1213. cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
  1214. cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
  1215. cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
  1216. cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
  1217. cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
  1218. cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
  1219. cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
  1220. cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
  1221. cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
  1222. cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
  1223. cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
  1224. cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
  1225. cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
  1226. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
  1227. cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
  1228. cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
  1229. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
  1230. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
  1231. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
  1232. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
  1233. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
  1234. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
  1235. cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
  1236. cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
  1237. cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
  1238. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
  1239. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
  1240. cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
  1241. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
  1242. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
  1243. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
  1244. cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
  1245. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
  1246. cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
  1247. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
  1248. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
  1249. cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
  1250. cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
  1251. cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
  1252. cuda/cccl/headers/include/thrust/detail/vector_base.h +615 -0
  1253. cuda/cccl/headers/include/thrust/detail/vector_base.inl +1212 -0
  1254. cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
  1255. cuda/cccl/headers/include/thrust/device_delete.h +59 -0
  1256. cuda/cccl/headers/include/thrust/device_free.h +72 -0
  1257. cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
  1258. cuda/cccl/headers/include/thrust/device_malloc.h +108 -0
  1259. cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
  1260. cuda/cccl/headers/include/thrust/device_new.h +91 -0
  1261. cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
  1262. cuda/cccl/headers/include/thrust/device_ptr.h +202 -0
  1263. cuda/cccl/headers/include/thrust/device_reference.h +986 -0
  1264. cuda/cccl/headers/include/thrust/device_vector.h +574 -0
  1265. cuda/cccl/headers/include/thrust/distance.h +43 -0
  1266. cuda/cccl/headers/include/thrust/equal.h +247 -0
  1267. cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
  1268. cuda/cccl/headers/include/thrust/extrema.h +657 -0
  1269. cuda/cccl/headers/include/thrust/fill.h +201 -0
  1270. cuda/cccl/headers/include/thrust/find.h +382 -0
  1271. cuda/cccl/headers/include/thrust/for_each.h +261 -0
  1272. cuda/cccl/headers/include/thrust/functional.h +396 -0
  1273. cuda/cccl/headers/include/thrust/gather.h +464 -0
  1274. cuda/cccl/headers/include/thrust/generate.h +193 -0
  1275. cuda/cccl/headers/include/thrust/host_vector.h +576 -0
  1276. cuda/cccl/headers/include/thrust/inner_product.h +264 -0
  1277. cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +219 -0
  1278. cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
  1279. cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
  1280. cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
  1281. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
  1282. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
  1283. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
  1284. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
  1285. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
  1286. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
  1287. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
  1288. cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
  1289. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
  1290. cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
  1291. cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
  1292. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
  1293. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
  1294. cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
  1295. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
  1296. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
  1297. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
  1298. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +275 -0
  1299. cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +192 -0
  1300. cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
  1301. cuda/cccl/headers/include/thrust/iterator/retag.h +74 -0
  1302. cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +221 -0
  1303. cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +184 -0
  1304. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
  1305. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
  1306. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
  1307. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
  1308. cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
  1309. cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +357 -0
  1310. cuda/cccl/headers/include/thrust/logical.h +290 -0
  1311. cuda/cccl/headers/include/thrust/memory.h +395 -0
  1312. cuda/cccl/headers/include/thrust/merge.h +725 -0
  1313. cuda/cccl/headers/include/thrust/mismatch.h +261 -0
  1314. cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
  1315. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
  1316. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
  1317. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
  1318. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +68 -0
  1319. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
  1320. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
  1321. cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
  1322. cuda/cccl/headers/include/thrust/mr/new.h +100 -0
  1323. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
  1324. cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
  1325. cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
  1326. cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
  1327. cuda/cccl/headers/include/thrust/mr/tls_pool.h +65 -0
  1328. cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
  1329. cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
  1330. cuda/cccl/headers/include/thrust/pair.h +102 -0
  1331. cuda/cccl/headers/include/thrust/partition.h +1383 -0
  1332. cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
  1333. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
  1334. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
  1335. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
  1336. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
  1337. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
  1338. cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
  1339. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
  1340. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
  1341. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
  1342. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
  1343. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
  1344. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
  1345. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
  1346. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
  1347. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
  1348. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
  1349. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
  1350. cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
  1351. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
  1352. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
  1353. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
  1354. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
  1355. cuda/cccl/headers/include/thrust/random.h +120 -0
  1356. cuda/cccl/headers/include/thrust/reduce.h +1112 -0
  1357. cuda/cccl/headers/include/thrust/remove.h +768 -0
  1358. cuda/cccl/headers/include/thrust/replace.h +827 -0
  1359. cuda/cccl/headers/include/thrust/reverse.h +213 -0
  1360. cuda/cccl/headers/include/thrust/scan.h +1671 -0
  1361. cuda/cccl/headers/include/thrust/scatter.h +446 -0
  1362. cuda/cccl/headers/include/thrust/sequence.h +277 -0
  1363. cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
  1364. cuda/cccl/headers/include/thrust/shuffle.h +182 -0
  1365. cuda/cccl/headers/include/thrust/sort.h +1320 -0
  1366. cuda/cccl/headers/include/thrust/swap.h +147 -0
  1367. cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
  1368. cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
  1369. cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
  1370. cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
  1371. cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
  1372. cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
  1373. cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
  1374. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
  1375. cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
  1376. cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
  1377. cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
  1378. cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
  1379. cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
  1380. cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
  1381. cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
  1382. cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
  1383. cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
  1384. cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
  1385. cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
  1386. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
  1387. cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
  1388. cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
  1389. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
  1390. cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
  1391. cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
  1392. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
  1393. cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
  1394. cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
  1395. cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
  1396. cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
  1397. cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
  1398. cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
  1399. cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
  1400. cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
  1401. cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
  1402. cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
  1403. cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
  1404. cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
  1405. cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
  1406. cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
  1407. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
  1408. cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
  1409. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
  1410. cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
  1411. cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
  1412. cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
  1413. cuda/cccl/headers/include/thrust/system/cpp/detail/vector.inl +130 -0
  1414. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +161 -0
  1415. cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
  1416. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
  1417. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +123 -0
  1418. cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
  1419. cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
  1420. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
  1421. cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
  1422. cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
  1423. cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
  1424. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
  1425. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
  1426. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
  1427. cuda/cccl/headers/include/thrust/system/cuda/detail/core/load_iterator.h +58 -0
  1428. cuda/cccl/headers/include/thrust/system/cuda/detail/core/make_load_iterator.h +53 -0
  1429. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
  1430. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +611 -0
  1431. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
  1432. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
  1433. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
  1434. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
  1435. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
  1436. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
  1437. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
  1438. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
  1439. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
  1440. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
  1441. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
  1442. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
  1443. cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
  1444. cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
  1445. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
  1446. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +89 -0
  1447. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
  1448. cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
  1449. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
  1450. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
  1451. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
  1452. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
  1453. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
  1454. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
  1455. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
  1456. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
  1457. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
  1458. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
  1459. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +781 -0
  1460. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1000 -0
  1461. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
  1462. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +152 -0
  1463. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +88 -0
  1464. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
  1465. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
  1466. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
  1467. cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
  1468. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1736 -0
  1469. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
  1470. cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +75 -0
  1471. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
  1472. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
  1473. cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
  1474. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +403 -0
  1475. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
  1476. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
  1477. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +94 -0
  1478. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
  1479. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +646 -0
  1480. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
  1481. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
  1482. cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
  1483. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
  1484. cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
  1485. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
  1486. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
  1487. cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
  1488. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
  1489. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
  1490. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
  1491. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
  1492. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
  1493. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
  1494. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
  1495. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
  1496. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
  1497. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
  1498. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
  1499. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
  1500. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
  1501. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
  1502. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
  1503. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
  1504. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
  1505. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
  1506. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
  1507. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
  1508. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
  1509. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +48 -0
  1510. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
  1511. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
  1512. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
  1513. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
  1514. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
  1515. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
  1516. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
  1517. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
  1518. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
  1519. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
  1520. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
  1521. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
  1522. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
  1523. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
  1524. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
  1525. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
  1526. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
  1527. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
  1528. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
  1529. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
  1530. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
  1531. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
  1532. cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
  1533. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
  1534. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
  1535. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
  1536. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
  1537. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
  1538. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
  1539. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
  1540. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
  1541. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
  1542. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
  1543. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
  1544. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
  1545. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
  1546. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
  1547. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
  1548. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
  1549. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
  1550. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
  1551. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
  1552. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
  1553. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
  1554. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
  1555. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
  1556. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
  1557. cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
  1558. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
  1559. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
  1560. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
  1561. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
  1562. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
  1563. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
  1564. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
  1565. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
  1566. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
  1567. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
  1568. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
  1569. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
  1570. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
  1571. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
  1572. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
  1573. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
  1574. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
  1575. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
  1576. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
  1577. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
  1578. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
  1579. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +72 -0
  1580. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
  1581. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
  1582. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
  1583. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
  1584. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
  1585. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
  1586. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
  1587. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
  1588. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
  1589. cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
  1590. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
  1591. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
  1592. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
  1593. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
  1594. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
  1595. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
  1596. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
  1597. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
  1598. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
  1599. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
  1600. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
  1601. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
  1602. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
  1603. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
  1604. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
  1605. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
  1606. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
  1607. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
  1608. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
  1609. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
  1610. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
  1611. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
  1612. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
  1613. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
  1614. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
  1615. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
  1616. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
  1617. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
  1618. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
  1619. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
  1620. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
  1621. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
  1622. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
  1623. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
  1624. cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
  1625. cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
  1626. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
  1627. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
  1628. cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
  1629. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
  1630. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
  1631. cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
  1632. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
  1633. cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
  1634. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
  1635. cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
  1636. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
  1637. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
  1638. cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
  1639. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
  1640. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
  1641. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
  1642. cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
  1643. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
  1644. cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
  1645. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
  1646. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
  1647. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
  1648. cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
  1649. cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
  1650. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
  1651. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
  1652. cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
  1653. cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
  1654. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
  1655. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
  1656. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
  1657. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
  1658. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
  1659. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
  1660. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
  1661. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
  1662. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
  1663. cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
  1664. cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
  1665. cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
  1666. cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
  1667. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
  1668. cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
  1669. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
  1670. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
  1671. cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
  1672. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
  1673. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
  1674. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
  1675. cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
  1676. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
  1677. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
  1678. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
  1679. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
  1680. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
  1681. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
  1682. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
  1683. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
  1684. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
  1685. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
  1686. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
  1687. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
  1688. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
  1689. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
  1690. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
  1691. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
  1692. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
  1693. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
  1694. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
  1695. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
  1696. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
  1697. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
  1698. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
  1699. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
  1700. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
  1701. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
  1702. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
  1703. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
  1704. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
  1705. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
  1706. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
  1707. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
  1708. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
  1709. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
  1710. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
  1711. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
  1712. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
  1713. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
  1714. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
  1715. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
  1716. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
  1717. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
  1718. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
  1719. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
  1720. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
  1721. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
  1722. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
  1723. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
  1724. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
  1725. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
  1726. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
  1727. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
  1728. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
  1729. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
  1730. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
  1731. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
  1732. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
  1733. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
  1734. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
  1735. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
  1736. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
  1737. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +160 -0
  1738. cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
  1739. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
  1740. cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
  1741. cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
  1742. cuda/cccl/headers/include/thrust/system/system_error.h +184 -0
  1743. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
  1744. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
  1745. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
  1746. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
  1747. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
  1748. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
  1749. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
  1750. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
  1751. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
  1752. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
  1753. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
  1754. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
  1755. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
  1756. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
  1757. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
  1758. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
  1759. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
  1760. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
  1761. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
  1762. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
  1763. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
  1764. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
  1765. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
  1766. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
  1767. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
  1768. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
  1769. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
  1770. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
  1771. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
  1772. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
  1773. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
  1774. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
  1775. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
  1776. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
  1777. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
  1778. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
  1779. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
  1780. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
  1781. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
  1782. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
  1783. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
  1784. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
  1785. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
  1786. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
  1787. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
  1788. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
  1789. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
  1790. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
  1791. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
  1792. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
  1793. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
  1794. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
  1795. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
  1796. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
  1797. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
  1798. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
  1799. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
  1800. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
  1801. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
  1802. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +160 -0
  1803. cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
  1804. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
  1805. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
  1806. cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
  1807. cuda/cccl/headers/include/thrust/system_error.h +57 -0
  1808. cuda/cccl/headers/include/thrust/tabulate.h +125 -0
  1809. cuda/cccl/headers/include/thrust/transform.h +1045 -0
  1810. cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
  1811. cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
  1812. cuda/cccl/headers/include/thrust/tuple.h +142 -0
  1813. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
  1814. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
  1815. cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
  1816. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
  1817. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
  1818. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
  1819. cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
  1820. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +96 -0
  1821. cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
  1822. cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
  1823. cuda/cccl/headers/include/thrust/unique.h +1090 -0
  1824. cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
  1825. cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
  1826. cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
  1827. cuda/cccl/headers/include/thrust/version.h +93 -0
  1828. cuda/cccl/headers/include/thrust/zip_function.h +176 -0
  1829. cuda/cccl/headers/include_paths.py +72 -0
  1830. cuda/cccl/parallel/__init__.py +9 -0
  1831. cuda/cccl/parallel/experimental/__init__.py +47 -0
  1832. cuda/cccl/parallel/experimental/_bindings.py +24 -0
  1833. cuda/cccl/parallel/experimental/_bindings.pyi +388 -0
  1834. cuda/cccl/parallel/experimental/_bindings_impl.cpython-310-x86_64-linux-gnu.so +0 -0
  1835. cuda/cccl/parallel/experimental/_bindings_impl.pyx +2158 -0
  1836. cuda/cccl/parallel/experimental/_caching.py +71 -0
  1837. cuda/cccl/parallel/experimental/_cccl_interop.py +382 -0
  1838. cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
  1839. cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
  1840. cuda/cccl/parallel/experimental/algorithms/__init__.py +28 -0
  1841. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +172 -0
  1842. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +244 -0
  1843. cuda/cccl/parallel/experimental/algorithms/_reduce.py +136 -0
  1844. cuda/cccl/parallel/experimental/algorithms/_scan.py +179 -0
  1845. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +183 -0
  1846. cuda/cccl/parallel/experimental/algorithms/_transform.py +213 -0
  1847. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +179 -0
  1848. cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
  1849. cuda/cccl/parallel/experimental/cccl/libcccl.c.parallel.so +0 -0
  1850. cuda/cccl/parallel/experimental/iterators/__init__.py +17 -0
  1851. cuda/cccl/parallel/experimental/iterators/_factories.py +157 -0
  1852. cuda/cccl/parallel/experimental/iterators/_iterators.py +650 -0
  1853. cuda/cccl/parallel/experimental/numba_utils.py +6 -0
  1854. cuda/cccl/parallel/experimental/struct.py +150 -0
  1855. cuda/cccl/parallel/experimental/typing.py +27 -0
  1856. cuda/cccl/py.typed +0 -0
  1857. cuda_cccl-0.1.3.1.0.dev1678.dist-info/METADATA +28 -0
  1858. cuda_cccl-0.1.3.1.0.dev1678.dist-info/RECORD +1860 -0
  1859. cuda_cccl-0.1.3.1.0.dev1678.dist-info/WHEEL +6 -0
  1860. cuda_cccl-0.1.3.1.0.dev1678.dist-info/licenses/LICENSE +1 -0
@@ -0,0 +1,4446 @@
1
+ // This file was automatically generated. Do not edit.
2
+
3
+ #ifndef _CUDA_PTX_GENERATED_TCGEN05_LD_H_
4
+ #define _CUDA_PTX_GENERATED_TCGEN05_LD_H_
5
+
6
+ /*
7
+ // tcgen05.ld.sync.aligned.16x64b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
8
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
9
+ __device__ static inline void tcgen05_ld_16x64b(
10
+ B32 (&out)[1],
11
+ uint32_t taddr);
+ //
+ // Inline-asm wrapper for the instruction above. taddr is passed as the 32-bit address operand;
+ // the single 32-bit result is written to out[0] (B32 is any 4-byte type).
+ // Available only when __cccl_ptx_isa >= 860. Unless the compiler is NVHPC or one of
+ // __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set, the body calls a function that
+ // is declared but not defined here, which is intended to fail at link time.
+ // NOTE(review): the meaning of the .16x64b and .x1 qualifiers comes from the PTX ISA manual; confirm there.
12
+ */
13
+ #if __cccl_ptx_isa >= 860
14
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__();
15
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
16
+ _CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr)
17
+ {
18
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above, as a hard error
19
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
20
+ asm("tcgen05.ld.sync.aligned.16x64b.x1.b32 {%0}, [%1];" : "=r"(__out[0]) : "r"(__taddr) : "memory"); // %0 <- out[0], %1 <- taddr
21
+ # else
22
+ // Not a supported target: the function below is only declared, so the call fails at link time and its name serves as the error message
23
+ __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__();
24
+ # endif
25
+ }
26
+ #endif // __cccl_ptx_isa >= 860
27
+
28
+ /*
29
+ // tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
30
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
31
+ __device__ static inline void tcgen05_ld_16x64b_pack_16b(
32
+ B32 (&out)[1],
33
+ uint32_t taddr);
+ //
+ // Inline-asm wrapper for the instruction above. taddr is passed as the 32-bit address operand;
+ // the single 32-bit result is written to out[0] (B32 is any 4-byte type).
+ // Available only when __cccl_ptx_isa >= 860. Unless the compiler is NVHPC or one of
+ // __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set, the body calls a function that
+ // is declared but not defined here, which is intended to fail at link time.
+ // NOTE(review): the meaning of the .16x64b, .x1 and .pack::16b qualifiers comes from the PTX ISA manual; confirm there.
34
+ */
35
+ #if __cccl_ptx_isa >= 860
36
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
37
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
38
+ _CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr)
39
+ {
40
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above, as a hard error
41
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
42
+ asm("tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 {%0}, [%1];" : "=r"(__out[0]) : "r"(__taddr) : "memory"); // %0 <- out[0], %1 <- taddr
43
+ # else
44
+ // Not a supported target: the function below is only declared, so the call fails at link time and its name serves as the error message
45
+ __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
46
+ # endif
47
+ }
48
+ #endif // __cccl_ptx_isa >= 860
49
+
50
+ /*
51
+ // tcgen05.ld.sync.aligned.16x64b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
52
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
53
+ __device__ static inline void tcgen05_ld_16x64b(
54
+ B32 (&out)[2],
55
+ uint32_t taddr);
+ //
+ // Inline-asm wrapper for the instruction above. taddr is passed as the 32-bit address operand;
+ // the two 32-bit results are written to out[0] and out[1] (B32 is any 4-byte type).
+ // Available only when __cccl_ptx_isa >= 860. Unless the compiler is NVHPC or one of
+ // __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set, the body calls a function that
+ // is declared but not defined here, which is intended to fail at link time.
+ // NOTE(review): the meaning of the .16x64b and .x2 qualifiers comes from the PTX ISA manual; confirm there.
56
+ */
57
+ #if __cccl_ptx_isa >= 860
58
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__();
59
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
60
+ _CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr)
61
+ {
62
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above, as a hard error
63
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
64
+ asm("tcgen05.ld.sync.aligned.16x64b.x2.b32 {%0, %1}, [%2];"
65
+ : "=r"(__out[0]), "=r"(__out[1]) // outputs %0, %1 bind to out[0], out[1]
66
+ : "r"(__taddr) // input %2 is the address operand
67
+ : "memory"); // "memory" clobber: the compiler treats the asm as possibly accessing memory
68
+ # else
69
+ // Not a supported target: the function below is only declared, so the call fails at link time and its name serves as the error message
70
+ __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__();
71
+ # endif
72
+ }
73
+ #endif // __cccl_ptx_isa >= 860
74
+
75
+ /*
76
+ // tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
77
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
78
+ __device__ static inline void tcgen05_ld_16x64b_pack_16b(
79
+ B32 (&out)[2],
80
+ uint32_t taddr);
+ //
+ // Inline-asm wrapper for the instruction above. taddr is passed as the 32-bit address operand;
+ // the two 32-bit results are written to out[0] and out[1] (B32 is any 4-byte type).
+ // Available only when __cccl_ptx_isa >= 860. Unless the compiler is NVHPC or one of
+ // __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set, the body calls a function that
+ // is declared but not defined here, which is intended to fail at link time.
+ // NOTE(review): the meaning of the .16x64b, .x2 and .pack::16b qualifiers comes from the PTX ISA manual; confirm there.
81
+ */
82
+ #if __cccl_ptx_isa >= 860
83
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
84
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
85
+ _CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr)
86
+ {
87
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above, as a hard error
88
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
89
+ asm("tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 {%0, %1}, [%2];"
90
+ : "=r"(__out[0]), "=r"(__out[1]) // outputs %0, %1 bind to out[0], out[1]
91
+ : "r"(__taddr) // input %2 is the address operand
92
+ : "memory"); // "memory" clobber: the compiler treats the asm as possibly accessing memory
93
+ # else
94
+ // Not a supported target: the function below is only declared, so the call fails at link time and its name serves as the error message
95
+ __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
96
+ # endif
97
+ }
98
+ #endif // __cccl_ptx_isa >= 860
99
+
100
+ /*
101
+ // tcgen05.ld.sync.aligned.16x64b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
102
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
103
+ __device__ static inline void tcgen05_ld_16x64b(
104
+ B32 (&out)[4],
105
+ uint32_t taddr);
+ //
+ // Inline-asm wrapper for the instruction above. taddr is passed as the 32-bit address operand;
+ // the four 32-bit results are written to out[0] .. out[3] (B32 is any 4-byte type).
+ // Available only when __cccl_ptx_isa >= 860. Unless the compiler is NVHPC or one of
+ // __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set, the body calls a function that
+ // is declared but not defined here, which is intended to fail at link time.
+ // NOTE(review): the meaning of the .16x64b and .x4 qualifiers comes from the PTX ISA manual; confirm there.
106
+ */
107
+ #if __cccl_ptx_isa >= 860
108
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__();
109
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
110
+ _CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr)
111
+ {
112
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above, as a hard error
113
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
114
+ asm("tcgen05.ld.sync.aligned.16x64b.x4.b32 {%0, %1, %2, %3}, [%4];"
115
+ : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) // outputs %0..%3 bind to out[0]..out[3], in order
116
+ : "r"(__taddr) // input %4 is the address operand
117
+ : "memory"); // "memory" clobber: the compiler treats the asm as possibly accessing memory
118
+ # else
119
+ // Not a supported target: the function below is only declared, so the call fails at link time and its name serves as the error message
120
+ __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__();
121
+ # endif
122
+ }
123
+ #endif // __cccl_ptx_isa >= 860
124
+
125
+ /*
126
+ // tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
127
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
128
+ __device__ static inline void tcgen05_ld_16x64b_pack_16b(
129
+ B32 (&out)[4],
130
+ uint32_t taddr);
+ //
+ // Inline-asm wrapper for the instruction above. taddr is passed as the 32-bit address operand;
+ // the four 32-bit results are written to out[0] .. out[3] (B32 is any 4-byte type).
+ // Available only when __cccl_ptx_isa >= 860. Unless the compiler is NVHPC or one of
+ // __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set, the body calls a function that
+ // is declared but not defined here, which is intended to fail at link time.
+ // NOTE(review): the meaning of the .16x64b, .x4 and .pack::16b qualifiers comes from the PTX ISA manual; confirm there.
131
+ */
132
+ #if __cccl_ptx_isa >= 860
133
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
134
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
135
+ _CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr)
136
+ {
137
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above, as a hard error
138
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
139
+ asm("tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 {%0, %1, %2, %3}, [%4];"
140
+ : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) // outputs %0..%3 bind to out[0]..out[3], in order
141
+ : "r"(__taddr) // input %4 is the address operand
142
+ : "memory"); // "memory" clobber: the compiler treats the asm as possibly accessing memory
143
+ # else
144
+ // Not a supported target: the function below is only declared, so the call fails at link time and its name serves as the error message
145
+ __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
146
+ # endif
147
+ }
148
+ #endif // __cccl_ptx_isa >= 860
149
+
150
+ /*
151
+ // tcgen05.ld.sync.aligned.16x64b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
152
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
153
+ __device__ static inline void tcgen05_ld_16x64b(
154
+ B32 (&out)[8],
155
+ uint32_t taddr);
+ //
+ // Inline-asm wrapper for the instruction above. taddr is passed as the 32-bit address operand;
+ // the eight 32-bit results are written to out[0] .. out[7] (B32 is any 4-byte type).
+ // Available only when __cccl_ptx_isa >= 860. Unless the compiler is NVHPC or one of
+ // __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set, the body calls a function that
+ // is declared but not defined here, which is intended to fail at link time.
+ // NOTE(review): the meaning of the .16x64b and .x8 qualifiers comes from the PTX ISA manual; confirm there.
156
+ */
157
+ #if __cccl_ptx_isa >= 860
158
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__();
159
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
160
+ _CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr)
161
+ {
162
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above, as a hard error
163
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
164
+ asm("tcgen05.ld.sync.aligned.16x64b.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];"
165
+ : "=r"(__out[0]), // outputs %0..%7 bind to out[0]..out[7], in order
166
+ "=r"(__out[1]),
167
+ "=r"(__out[2]),
168
+ "=r"(__out[3]),
169
+ "=r"(__out[4]),
170
+ "=r"(__out[5]),
171
+ "=r"(__out[6]),
172
+ "=r"(__out[7])
173
+ : "r"(__taddr) // input %8 is the address operand
174
+ : "memory"); // "memory" clobber: the compiler treats the asm as possibly accessing memory
175
+ # else
176
+ // Not a supported target: the function below is only declared, so the call fails at link time and its name serves as the error message
177
+ __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__();
178
+ # endif
179
+ }
180
+ #endif // __cccl_ptx_isa >= 860
181
+
182
+ /*
183
+ // tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
184
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
185
+ __device__ static inline void tcgen05_ld_16x64b_pack_16b(
186
+ B32 (&out)[8],
187
+ uint32_t taddr);
+ //
+ // Inline-asm wrapper for the instruction above. taddr is passed as the 32-bit address operand;
+ // the eight 32-bit results are written to out[0] .. out[7] (B32 is any 4-byte type).
+ // Available only when __cccl_ptx_isa >= 860. Unless the compiler is NVHPC or one of
+ // __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set, the body calls a function that
+ // is declared but not defined here, which is intended to fail at link time.
+ // NOTE(review): the meaning of the .16x64b, .x8 and .pack::16b qualifiers comes from the PTX ISA manual; confirm there.
188
+ */
189
+ #if __cccl_ptx_isa >= 860
190
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
191
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
192
+ _CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr)
193
+ {
194
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above, as a hard error
195
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
196
+ asm("tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];"
197
+ : "=r"(__out[0]), // outputs %0..%7 bind to out[0]..out[7], in order
198
+ "=r"(__out[1]),
199
+ "=r"(__out[2]),
200
+ "=r"(__out[3]),
201
+ "=r"(__out[4]),
202
+ "=r"(__out[5]),
203
+ "=r"(__out[6]),
204
+ "=r"(__out[7])
205
+ : "r"(__taddr) // input %8 is the address operand
206
+ : "memory"); // "memory" clobber: the compiler treats the asm as possibly accessing memory
207
+ # else
208
+ // Not a supported target: the function below is only declared, so the call fails at link time and its name serves as the error message
209
+ __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
210
+ # endif
211
+ }
212
+ #endif // __cccl_ptx_isa >= 860
213
+
214
+ /*
215
+ // tcgen05.ld.sync.aligned.16x64b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
216
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
217
+ __device__ static inline void tcgen05_ld_16x64b(
218
+ B32 (&out)[16],
219
+ uint32_t taddr);
+ //
+ // Inline-asm wrapper for the instruction above. taddr is passed as the 32-bit address operand;
+ // the sixteen 32-bit results are written to out[0] .. out[15] (B32 is any 4-byte type).
+ // Available only when __cccl_ptx_isa >= 860. Unless the compiler is NVHPC or one of
+ // __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set, the body calls a function that
+ // is declared but not defined here, which is intended to fail at link time.
+ // NOTE(review): the meaning of the .16x64b and .x16 qualifiers comes from the PTX ISA manual; confirm there.
220
+ */
221
+ #if __cccl_ptx_isa >= 860
222
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__();
223
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
224
+ _CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr)
225
+ {
226
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above, as a hard error
227
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
228
+ asm("tcgen05.ld.sync.aligned.16x64b.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15}, "
229
+ "[%16];"
230
+ : "=r"(__out[0]), // outputs %0..%15 bind to out[0]..out[15], in order
231
+ "=r"(__out[1]),
232
+ "=r"(__out[2]),
233
+ "=r"(__out[3]),
234
+ "=r"(__out[4]),
235
+ "=r"(__out[5]),
236
+ "=r"(__out[6]),
237
+ "=r"(__out[7]),
238
+ "=r"(__out[8]),
239
+ "=r"(__out[9]),
240
+ "=r"(__out[10]),
241
+ "=r"(__out[11]),
242
+ "=r"(__out[12]),
243
+ "=r"(__out[13]),
244
+ "=r"(__out[14]),
245
+ "=r"(__out[15])
246
+ : "r"(__taddr) // input %16 is the address operand
247
+ : "memory"); // "memory" clobber: the compiler treats the asm as possibly accessing memory
248
+ # else
249
+ // Not a supported target: the function below is only declared, so the call fails at link time and its name serves as the error message
250
+ __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__();
251
+ # endif
252
+ }
253
+ #endif // __cccl_ptx_isa >= 860
254
+
255
+ /*
256
+ // tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
257
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
258
+ __device__ static inline void tcgen05_ld_16x64b_pack_16b(
259
+ B32 (&out)[16],
260
+ uint32_t taddr);
+ //
+ // Inline-asm wrapper for the instruction above. taddr is passed as the 32-bit address operand;
+ // the sixteen 32-bit results are written to out[0] .. out[15] (B32 is any 4-byte type).
+ // Available only when __cccl_ptx_isa >= 860. Unless the compiler is NVHPC or one of
+ // __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set, the body calls a function that
+ // is declared but not defined here, which is intended to fail at link time.
+ // NOTE(review): the meaning of the .16x64b, .x16 and .pack::16b qualifiers comes from the PTX ISA manual; confirm there.
261
+ */
262
+ #if __cccl_ptx_isa >= 860
263
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
264
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
265
+ _CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr)
266
+ {
267
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above, as a hard error
268
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
269
+ asm("tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
270
+ "%14, %15}, [%16];"
271
+ : "=r"(__out[0]), // outputs %0..%15 bind to out[0]..out[15], in order
272
+ "=r"(__out[1]),
273
+ "=r"(__out[2]),
274
+ "=r"(__out[3]),
275
+ "=r"(__out[4]),
276
+ "=r"(__out[5]),
277
+ "=r"(__out[6]),
278
+ "=r"(__out[7]),
279
+ "=r"(__out[8]),
280
+ "=r"(__out[9]),
281
+ "=r"(__out[10]),
282
+ "=r"(__out[11]),
283
+ "=r"(__out[12]),
284
+ "=r"(__out[13]),
285
+ "=r"(__out[14]),
286
+ "=r"(__out[15])
287
+ : "r"(__taddr) // input %16 is the address operand
288
+ : "memory"); // "memory" clobber: the compiler treats the asm as possibly accessing memory
289
+ # else
290
+ // Not a supported target: the function below is only declared, so the call fails at link time and its name serves as the error message
291
+ __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
292
+ # endif
293
+ }
294
+ #endif // __cccl_ptx_isa >= 860
295
+
296
+ /*
297
+ // tcgen05.ld.sync.aligned.16x64b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
298
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
299
+ __device__ static inline void tcgen05_ld_16x64b(
300
+ B32 (&out)[32],
301
+ uint32_t taddr);
+ //
+ // Inline-asm wrapper for the instruction above. taddr is passed as the 32-bit address operand;
+ // the thirty-two 32-bit results are written to out[0] .. out[31] (B32 is any 4-byte type).
+ // Available only when __cccl_ptx_isa >= 860. Unless the compiler is NVHPC or one of
+ // __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set, the body calls a function that
+ // is declared but not defined here, which is intended to fail at link time.
+ // NOTE(review): the meaning of the .16x64b and .x32 qualifiers comes from the PTX ISA manual; confirm there.
302
+ */
303
+ #if __cccl_ptx_isa >= 860
304
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__();
305
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
306
+ _CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr)
307
+ {
308
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above, as a hard error
309
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
310
+ asm(
311
+ "tcgen05.ld.sync.aligned.16x64b.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, "
312
+ "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];"
313
+ : "=r"(__out[0]), // outputs %0..%31 bind to out[0]..out[31], in order
314
+ "=r"(__out[1]),
315
+ "=r"(__out[2]),
316
+ "=r"(__out[3]),
317
+ "=r"(__out[4]),
318
+ "=r"(__out[5]),
319
+ "=r"(__out[6]),
320
+ "=r"(__out[7]),
321
+ "=r"(__out[8]),
322
+ "=r"(__out[9]),
323
+ "=r"(__out[10]),
324
+ "=r"(__out[11]),
325
+ "=r"(__out[12]),
326
+ "=r"(__out[13]),
327
+ "=r"(__out[14]),
328
+ "=r"(__out[15]),
329
+ "=r"(__out[16]),
330
+ "=r"(__out[17]),
331
+ "=r"(__out[18]),
332
+ "=r"(__out[19]),
333
+ "=r"(__out[20]),
334
+ "=r"(__out[21]),
335
+ "=r"(__out[22]),
336
+ "=r"(__out[23]),
337
+ "=r"(__out[24]),
338
+ "=r"(__out[25]),
339
+ "=r"(__out[26]),
340
+ "=r"(__out[27]),
341
+ "=r"(__out[28]),
342
+ "=r"(__out[29]),
343
+ "=r"(__out[30]),
344
+ "=r"(__out[31])
345
+ : "r"(__taddr) // input %32 is the address operand
346
+ : "memory"); // "memory" clobber: the compiler treats the asm as possibly accessing memory
347
+ # else
348
+ // Not a supported target: the function below is only declared, so the call fails at link time and its name serves as the error message
349
+ __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__();
350
+ # endif
351
+ }
352
+ #endif // __cccl_ptx_isa >= 860
353
+
354
+ /*
355
+ // tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
356
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
357
+ __device__ static inline void tcgen05_ld_16x64b_pack_16b(
358
+ B32 (&out)[32],
359
+ uint32_t taddr);
+ //
+ // Inline-asm wrapper for the instruction above. taddr is passed as the 32-bit address operand;
+ // the thirty-two 32-bit results are written to out[0] .. out[31] (B32 is any 4-byte type).
+ // Available only when __cccl_ptx_isa >= 860. Unless the compiler is NVHPC or one of
+ // __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set, the body calls a function that
+ // is declared but not defined here, which is intended to fail at link time.
+ // NOTE(review): the meaning of the .16x64b, .x32 and .pack::16b qualifiers comes from the PTX ISA manual; confirm there.
360
+ */
361
+ #if __cccl_ptx_isa >= 860
362
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
363
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
364
+ _CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr)
365
+ {
366
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above, as a hard error
367
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
368
+ asm(
369
+ "tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
370
+ "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];"
371
+ : "=r"(__out[0]), // outputs %0..%31 bind to out[0]..out[31], in order
372
+ "=r"(__out[1]),
373
+ "=r"(__out[2]),
374
+ "=r"(__out[3]),
375
+ "=r"(__out[4]),
376
+ "=r"(__out[5]),
377
+ "=r"(__out[6]),
378
+ "=r"(__out[7]),
379
+ "=r"(__out[8]),
380
+ "=r"(__out[9]),
381
+ "=r"(__out[10]),
382
+ "=r"(__out[11]),
383
+ "=r"(__out[12]),
384
+ "=r"(__out[13]),
385
+ "=r"(__out[14]),
386
+ "=r"(__out[15]),
387
+ "=r"(__out[16]),
388
+ "=r"(__out[17]),
389
+ "=r"(__out[18]),
390
+ "=r"(__out[19]),
391
+ "=r"(__out[20]),
392
+ "=r"(__out[21]),
393
+ "=r"(__out[22]),
394
+ "=r"(__out[23]),
395
+ "=r"(__out[24]),
396
+ "=r"(__out[25]),
397
+ "=r"(__out[26]),
398
+ "=r"(__out[27]),
399
+ "=r"(__out[28]),
400
+ "=r"(__out[29]),
401
+ "=r"(__out[30]),
402
+ "=r"(__out[31])
403
+ : "r"(__taddr) // input %32 is the address operand
404
+ : "memory"); // "memory" clobber: the compiler treats the asm as possibly accessing memory
405
+ # else
406
+ // Not a supported target: the function below is only declared, so the call fails at link time and its name serves as the error message
407
+ __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
408
+ # endif
409
+ }
410
+ #endif // __cccl_ptx_isa >= 860
411
+
412
/*
// tcgen05.ld.sync.aligned.16x64b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
//
// Inline-PTX wrapper for the instruction above. The 64 destination registers are bound, in order, to
// out[0] .. out[63]; taddr is passed as the address operand. B32 is constrained to 4-byte types.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[64],
  uint32_t taddr);
*/
#if __cccl_ptx_isa >= 860
// Declaration only; it is referenced solely from the fallback branch of the wrapper below.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr)
{
  // Mirrors the enable_if constraint on _B32.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
  // %0 .. %63 are the outputs __out[0] .. __out[63]; %64 is the __taddr input.
  asm(
    "tcgen05.ld.sync.aligned.16x64b.x64.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, "
    "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, "
    "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, "
    "%60, %61, %62, %63}, [%64];"
    : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]),
      "=r"(__out[4]), "=r"(__out[5]), "=r"(__out[6]), "=r"(__out[7]),
      "=r"(__out[8]), "=r"(__out[9]), "=r"(__out[10]), "=r"(__out[11]),
      "=r"(__out[12]), "=r"(__out[13]), "=r"(__out[14]), "=r"(__out[15]),
      "=r"(__out[16]), "=r"(__out[17]), "=r"(__out[18]), "=r"(__out[19]),
      "=r"(__out[20]), "=r"(__out[21]), "=r"(__out[22]), "=r"(__out[23]),
      "=r"(__out[24]), "=r"(__out[25]), "=r"(__out[26]), "=r"(__out[27]),
      "=r"(__out[28]), "=r"(__out[29]), "=r"(__out[30]), "=r"(__out[31]),
      "=r"(__out[32]), "=r"(__out[33]), "=r"(__out[34]), "=r"(__out[35]),
      "=r"(__out[36]), "=r"(__out[37]), "=r"(__out[38]), "=r"(__out[39]),
      "=r"(__out[40]), "=r"(__out[41]), "=r"(__out[42]), "=r"(__out[43]),
      "=r"(__out[44]), "=r"(__out[45]), "=r"(__out[46]), "=r"(__out[47]),
      "=r"(__out[48]), "=r"(__out[49]), "=r"(__out[50]), "=r"(__out[51]),
      "=r"(__out[52]), "=r"(__out[53]), "=r"(__out[54]), "=r"(__out[55]),
      "=r"(__out[56]), "=r"(__out[57]), "=r"(__out[58]), "=r"(__out[59]),
      "=r"(__out[60]), "=r"(__out[61]), "=r"(__out[62]), "=r"(__out[63])
    : "r"(__taddr)
    : "memory");
#  else
  // Neither NVHPC nor an SM_100a/SM_101a feature build: call the declared-only symbol so the failure is
  // reported at link time under a name that describes the problem.
  __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__();
#  endif
}
#endif // __cccl_ptx_isa >= 860
503
+
504
/*
// tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
//
// Inline-PTX wrapper for the instruction above (the .pack::16b form). The 64 destination registers are bound,
// in order, to out[0] .. out[63]; taddr is passed as the address operand. B32 is constrained to 4-byte types.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[64],
  uint32_t taddr);
*/
#if __cccl_ptx_isa >= 860
// Declaration only; it is referenced solely from the fallback branch of the wrapper below.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr)
{
  // Mirrors the enable_if constraint on _B32.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
  // %0 .. %63 are the outputs __out[0] .. __out[63]; %64 is the __taddr input.
  asm(
    "tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
    "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, "
    "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, "
    "%58, %59, %60, %61, %62, %63}, [%64];"
    : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]),
      "=r"(__out[4]), "=r"(__out[5]), "=r"(__out[6]), "=r"(__out[7]),
      "=r"(__out[8]), "=r"(__out[9]), "=r"(__out[10]), "=r"(__out[11]),
      "=r"(__out[12]), "=r"(__out[13]), "=r"(__out[14]), "=r"(__out[15]),
      "=r"(__out[16]), "=r"(__out[17]), "=r"(__out[18]), "=r"(__out[19]),
      "=r"(__out[20]), "=r"(__out[21]), "=r"(__out[22]), "=r"(__out[23]),
      "=r"(__out[24]), "=r"(__out[25]), "=r"(__out[26]), "=r"(__out[27]),
      "=r"(__out[28]), "=r"(__out[29]), "=r"(__out[30]), "=r"(__out[31]),
      "=r"(__out[32]), "=r"(__out[33]), "=r"(__out[34]), "=r"(__out[35]),
      "=r"(__out[36]), "=r"(__out[37]), "=r"(__out[38]), "=r"(__out[39]),
      "=r"(__out[40]), "=r"(__out[41]), "=r"(__out[42]), "=r"(__out[43]),
      "=r"(__out[44]), "=r"(__out[45]), "=r"(__out[46]), "=r"(__out[47]),
      "=r"(__out[48]), "=r"(__out[49]), "=r"(__out[50]), "=r"(__out[51]),
      "=r"(__out[52]), "=r"(__out[53]), "=r"(__out[54]), "=r"(__out[55]),
      "=r"(__out[56]), "=r"(__out[57]), "=r"(__out[58]), "=r"(__out[59]),
      "=r"(__out[60]), "=r"(__out[61]), "=r"(__out[62]), "=r"(__out[63])
    : "r"(__taddr)
    : "memory");
#  else
  // Neither NVHPC nor an SM_100a/SM_101a feature build: call the declared-only symbol so the failure is
  // reported at link time under a name that describes the problem.
  __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
#  endif
}
#endif // __cccl_ptx_isa >= 860
595
+
596
/*
// tcgen05.ld.sync.aligned.16x64b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
//
// Inline-PTX wrapper for the instruction above. The 128 destination registers are bound, in order, to
// out[0] .. out[127]; taddr is passed as the address operand. B32 is constrained to 4-byte types.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b(
  B32 (&out)[128],
  uint32_t taddr);
*/
#if __cccl_ptx_isa >= 860
// Declaration only; it is referenced solely from the fallback branch of the wrapper below.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr)
{
  // Mirrors the enable_if constraint on _B32.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
  // %0 .. %127 are the outputs __out[0] .. __out[127]; %128 is the __taddr input.
  asm(
    "tcgen05.ld.sync.aligned.16x64b.x128.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, "
    "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, "
    "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, "
    "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, "
    "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, "
    "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, "
    "%123, %124, %125, %126, %127}, [%128];"
    : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]),
      "=r"(__out[4]), "=r"(__out[5]), "=r"(__out[6]), "=r"(__out[7]),
      "=r"(__out[8]), "=r"(__out[9]), "=r"(__out[10]), "=r"(__out[11]),
      "=r"(__out[12]), "=r"(__out[13]), "=r"(__out[14]), "=r"(__out[15]),
      "=r"(__out[16]), "=r"(__out[17]), "=r"(__out[18]), "=r"(__out[19]),
      "=r"(__out[20]), "=r"(__out[21]), "=r"(__out[22]), "=r"(__out[23]),
      "=r"(__out[24]), "=r"(__out[25]), "=r"(__out[26]), "=r"(__out[27]),
      "=r"(__out[28]), "=r"(__out[29]), "=r"(__out[30]), "=r"(__out[31]),
      "=r"(__out[32]), "=r"(__out[33]), "=r"(__out[34]), "=r"(__out[35]),
      "=r"(__out[36]), "=r"(__out[37]), "=r"(__out[38]), "=r"(__out[39]),
      "=r"(__out[40]), "=r"(__out[41]), "=r"(__out[42]), "=r"(__out[43]),
      "=r"(__out[44]), "=r"(__out[45]), "=r"(__out[46]), "=r"(__out[47]),
      "=r"(__out[48]), "=r"(__out[49]), "=r"(__out[50]), "=r"(__out[51]),
      "=r"(__out[52]), "=r"(__out[53]), "=r"(__out[54]), "=r"(__out[55]),
      "=r"(__out[56]), "=r"(__out[57]), "=r"(__out[58]), "=r"(__out[59]),
      "=r"(__out[60]), "=r"(__out[61]), "=r"(__out[62]), "=r"(__out[63]),
      "=r"(__out[64]), "=r"(__out[65]), "=r"(__out[66]), "=r"(__out[67]),
      "=r"(__out[68]), "=r"(__out[69]), "=r"(__out[70]), "=r"(__out[71]),
      "=r"(__out[72]), "=r"(__out[73]), "=r"(__out[74]), "=r"(__out[75]),
      "=r"(__out[76]), "=r"(__out[77]), "=r"(__out[78]), "=r"(__out[79]),
      "=r"(__out[80]), "=r"(__out[81]), "=r"(__out[82]), "=r"(__out[83]),
      "=r"(__out[84]), "=r"(__out[85]), "=r"(__out[86]), "=r"(__out[87]),
      "=r"(__out[88]), "=r"(__out[89]), "=r"(__out[90]), "=r"(__out[91]),
      "=r"(__out[92]), "=r"(__out[93]), "=r"(__out[94]), "=r"(__out[95]),
      "=r"(__out[96]), "=r"(__out[97]), "=r"(__out[98]), "=r"(__out[99]),
      "=r"(__out[100]), "=r"(__out[101]), "=r"(__out[102]), "=r"(__out[103]),
      "=r"(__out[104]), "=r"(__out[105]), "=r"(__out[106]), "=r"(__out[107]),
      "=r"(__out[108]), "=r"(__out[109]), "=r"(__out[110]), "=r"(__out[111]),
      "=r"(__out[112]), "=r"(__out[113]), "=r"(__out[114]), "=r"(__out[115]),
      "=r"(__out[116]), "=r"(__out[117]), "=r"(__out[118]), "=r"(__out[119]),
      "=r"(__out[120]), "=r"(__out[121]), "=r"(__out[122]), "=r"(__out[123]),
      "=r"(__out[124]), "=r"(__out[125]), "=r"(__out[126]), "=r"(__out[127])
    : "r"(__taddr)
    : "memory");
#  else
  // Neither NVHPC nor an SM_100a/SM_101a feature build: call the declared-only symbol so the failure is
  // reported at link time under a name that describes the problem.
  __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__();
#  endif
}
#endif // __cccl_ptx_isa >= 860
754
+
755
/*
// tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
//
// Inline-PTX wrapper for the instruction above (the .pack::16b form). The 128 destination registers are bound,
// in order, to out[0] .. out[127]; taddr is passed as the address operand. B32 is constrained to 4-byte types.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x64b_pack_16b(
  B32 (&out)[128],
  uint32_t taddr);
*/
#if __cccl_ptx_isa >= 860
// Declaration only; it is referenced solely from the fallback branch of the wrapper below.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr)
{
  // Mirrors the enable_if constraint on _B32.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
  // %0 .. %127 are the outputs __out[0] .. __out[127]; %128 is the __taddr input.
  asm(
    "tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
    "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, "
    "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, "
    "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, "
    "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, "
    "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, "
    "%121, %122, %123, %124, %125, %126, %127}, [%128];"
    : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]),
      "=r"(__out[4]), "=r"(__out[5]), "=r"(__out[6]), "=r"(__out[7]),
      "=r"(__out[8]), "=r"(__out[9]), "=r"(__out[10]), "=r"(__out[11]),
      "=r"(__out[12]), "=r"(__out[13]), "=r"(__out[14]), "=r"(__out[15]),
      "=r"(__out[16]), "=r"(__out[17]), "=r"(__out[18]), "=r"(__out[19]),
      "=r"(__out[20]), "=r"(__out[21]), "=r"(__out[22]), "=r"(__out[23]),
      "=r"(__out[24]), "=r"(__out[25]), "=r"(__out[26]), "=r"(__out[27]),
      "=r"(__out[28]), "=r"(__out[29]), "=r"(__out[30]), "=r"(__out[31]),
      "=r"(__out[32]), "=r"(__out[33]), "=r"(__out[34]), "=r"(__out[35]),
      "=r"(__out[36]), "=r"(__out[37]), "=r"(__out[38]), "=r"(__out[39]),
      "=r"(__out[40]), "=r"(__out[41]), "=r"(__out[42]), "=r"(__out[43]),
      "=r"(__out[44]), "=r"(__out[45]), "=r"(__out[46]), "=r"(__out[47]),
      "=r"(__out[48]), "=r"(__out[49]), "=r"(__out[50]), "=r"(__out[51]),
      "=r"(__out[52]), "=r"(__out[53]), "=r"(__out[54]), "=r"(__out[55]),
      "=r"(__out[56]), "=r"(__out[57]), "=r"(__out[58]), "=r"(__out[59]),
      "=r"(__out[60]), "=r"(__out[61]), "=r"(__out[62]), "=r"(__out[63]),
      "=r"(__out[64]), "=r"(__out[65]), "=r"(__out[66]), "=r"(__out[67]),
      "=r"(__out[68]), "=r"(__out[69]), "=r"(__out[70]), "=r"(__out[71]),
      "=r"(__out[72]), "=r"(__out[73]), "=r"(__out[74]), "=r"(__out[75]),
      "=r"(__out[76]), "=r"(__out[77]), "=r"(__out[78]), "=r"(__out[79]),
      "=r"(__out[80]), "=r"(__out[81]), "=r"(__out[82]), "=r"(__out[83]),
      "=r"(__out[84]), "=r"(__out[85]), "=r"(__out[86]), "=r"(__out[87]),
      "=r"(__out[88]), "=r"(__out[89]), "=r"(__out[90]), "=r"(__out[91]),
      "=r"(__out[92]), "=r"(__out[93]), "=r"(__out[94]), "=r"(__out[95]),
      "=r"(__out[96]), "=r"(__out[97]), "=r"(__out[98]), "=r"(__out[99]),
      "=r"(__out[100]), "=r"(__out[101]), "=r"(__out[102]), "=r"(__out[103]),
      "=r"(__out[104]), "=r"(__out[105]), "=r"(__out[106]), "=r"(__out[107]),
      "=r"(__out[108]), "=r"(__out[109]), "=r"(__out[110]), "=r"(__out[111]),
      "=r"(__out[112]), "=r"(__out[113]), "=r"(__out[114]), "=r"(__out[115]),
      "=r"(__out[116]), "=r"(__out[117]), "=r"(__out[118]), "=r"(__out[119]),
      "=r"(__out[120]), "=r"(__out[121]), "=r"(__out[122]), "=r"(__out[123]),
      "=r"(__out[124]), "=r"(__out[125]), "=r"(__out[126]), "=r"(__out[127])
    : "r"(__taddr)
    : "memory");
#  else
  // Neither NVHPC nor an SM_100a/SM_101a feature build: call the declared-only symbol so the failure is
  // reported at link time under a name that describes the problem.
  __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
#  endif
}
#endif // __cccl_ptx_isa >= 860
913
+
914
/*
// tcgen05.ld.sync.aligned.16x128b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
//
// Inline-PTX wrapper for the instruction above. The 2 destination registers are bound, in order, to
// out[0] and out[1]; taddr is passed as the address operand. B32 is constrained to 4-byte types.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[2],
  uint32_t taddr);
*/
#if __cccl_ptx_isa >= 860
// Declaration only; it is referenced solely from the fallback branch of the wrapper below.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr)
{
  // Mirrors the enable_if constraint on _B32.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
  // %0 and %1 are the outputs __out[0] and __out[1]; %2 is the __taddr input.
  asm("tcgen05.ld.sync.aligned.16x128b.x1.b32 {%0, %1}, [%2];"
      : "=r"(__out[0]),
        "=r"(__out[1])
      : "r"(__taddr)
      : "memory");
#  else
  // Neither NVHPC nor an SM_100a/SM_101a feature build: call the declared-only symbol so the failure is
  // reported at link time under a name that describes the problem.
  __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__();
#  endif
}
#endif // __cccl_ptx_isa >= 860
938
+
939
/*
// tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
//
// Inline-PTX wrapper for the instruction above (the .pack::16b form). The 2 destination registers are bound,
// in order, to out[0] and out[1]; taddr is passed as the address operand. B32 is constrained to 4-byte types.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[2],
  uint32_t taddr);
*/
#if __cccl_ptx_isa >= 860
// Declaration only; it is referenced solely from the fallback branch of the wrapper below.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr)
{
  // Mirrors the enable_if constraint on _B32.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
  // %0 and %1 are the outputs __out[0] and __out[1]; %2 is the __taddr input.
  asm("tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 {%0, %1}, [%2];"
      : "=r"(__out[0]),
        "=r"(__out[1])
      : "r"(__taddr)
      : "memory");
#  else
  // Neither NVHPC nor an SM_100a/SM_101a feature build: call the declared-only symbol so the failure is
  // reported at link time under a name that describes the problem.
  __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
#  endif
}
#endif // __cccl_ptx_isa >= 860
963
+
964
/*
// tcgen05.ld.sync.aligned.16x128b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
//
// Inline-PTX wrapper for the instruction above. The 4 destination registers are bound, in order, to
// out[0] .. out[3]; taddr is passed as the address operand. B32 is constrained to 4-byte types.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[4],
  uint32_t taddr);
*/
#if __cccl_ptx_isa >= 860
// Declaration only; it is referenced solely from the fallback branch of the wrapper below.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr)
{
  // Mirrors the enable_if constraint on _B32.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
  // %0 .. %3 are the outputs __out[0] .. __out[3]; %4 is the __taddr input.
  asm("tcgen05.ld.sync.aligned.16x128b.x2.b32 {%0, %1, %2, %3}, [%4];"
      : "=r"(__out[0]), "=r"(__out[1]),
        "=r"(__out[2]), "=r"(__out[3])
      : "r"(__taddr)
      : "memory");
#  else
  // Neither NVHPC nor an SM_100a/SM_101a feature build: call the declared-only symbol so the failure is
  // reported at link time under a name that describes the problem.
  __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__();
#  endif
}
#endif // __cccl_ptx_isa >= 860
988
+
989
/*
// tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
//
// Inline-PTX wrapper for the instruction above (the .pack::16b form). The 4 destination registers are bound,
// in order, to out[0] .. out[3]; taddr is passed as the address operand. B32 is constrained to 4-byte types.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[4],
  uint32_t taddr);
*/
#if __cccl_ptx_isa >= 860
// Declaration only; it is referenced solely from the fallback branch of the wrapper below.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr)
{
  // Mirrors the enable_if constraint on _B32.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
  // %0 .. %3 are the outputs __out[0] .. __out[3]; %4 is the __taddr input.
  asm("tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 {%0, %1, %2, %3}, [%4];"
      : "=r"(__out[0]), "=r"(__out[1]),
        "=r"(__out[2]), "=r"(__out[3])
      : "r"(__taddr)
      : "memory");
#  else
  // Neither NVHPC nor an SM_100a/SM_101a feature build: call the declared-only symbol so the failure is
  // reported at link time under a name that describes the problem.
  __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
#  endif
}
#endif // __cccl_ptx_isa >= 860
1013
+
1014
/*
// tcgen05.ld.sync.aligned.16x128b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
//
// Inline-PTX wrapper for the instruction above. The 8 destination registers are bound, in order, to
// out[0] .. out[7]; taddr is passed as the address operand. B32 is constrained to 4-byte types.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[8],
  uint32_t taddr);
*/
#if __cccl_ptx_isa >= 860
// Declaration only; it is referenced solely from the fallback branch of the wrapper below.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr)
{
  // Mirrors the enable_if constraint on _B32.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
  // %0 .. %7 are the outputs __out[0] .. __out[7]; %8 is the __taddr input.
  asm("tcgen05.ld.sync.aligned.16x128b.x4.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];"
      : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]),
        "=r"(__out[4]), "=r"(__out[5]), "=r"(__out[6]), "=r"(__out[7])
      : "r"(__taddr)
      : "memory");
#  else
  // Neither NVHPC nor an SM_100a/SM_101a feature build: call the declared-only symbol so the failure is
  // reported at link time under a name that describes the problem.
  __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__();
#  endif
}
#endif // __cccl_ptx_isa >= 860
1045
+
1046
/*
// tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
//
// Inline-PTX wrapper for the instruction above (the .pack::16b form). The 8 destination registers are bound,
// in order, to out[0] .. out[7]; taddr is passed as the address operand. B32 is constrained to 4-byte types.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[8],
  uint32_t taddr);
*/
#if __cccl_ptx_isa >= 860
// Declaration only; it is referenced solely from the fallback branch of the wrapper below.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr)
{
  // Mirrors the enable_if constraint on _B32.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
  // %0 .. %7 are the outputs __out[0] .. __out[7]; %8 is the __taddr input.
  asm("tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];"
      : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]),
        "=r"(__out[4]), "=r"(__out[5]), "=r"(__out[6]), "=r"(__out[7])
      : "r"(__taddr)
      : "memory");
#  else
  // Neither NVHPC nor an SM_100a/SM_101a feature build: call the declared-only symbol so the failure is
  // reported at link time under a name that describes the problem.
  __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
#  endif
}
#endif // __cccl_ptx_isa >= 860
1077
+
1078
/*
// tcgen05.ld.sync.aligned.16x128b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
//
// Inline-PTX wrapper for the instruction above. The 16 destination registers are bound, in order, to
// out[0] .. out[15]; taddr is passed as the address operand. B32 is constrained to 4-byte types.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[16],
  uint32_t taddr);
*/
#if __cccl_ptx_isa >= 860
// Declaration only; it is referenced solely from the fallback branch of the wrapper below.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr)
{
  // Mirrors the enable_if constraint on _B32.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
  // %0 .. %15 are the outputs __out[0] .. __out[15]; %16 is the __taddr input.
  asm("tcgen05.ld.sync.aligned.16x128b.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15}, "
      "[%16];"
      : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]),
        "=r"(__out[4]), "=r"(__out[5]), "=r"(__out[6]), "=r"(__out[7]),
        "=r"(__out[8]), "=r"(__out[9]), "=r"(__out[10]), "=r"(__out[11]),
        "=r"(__out[12]), "=r"(__out[13]), "=r"(__out[14]), "=r"(__out[15])
      : "r"(__taddr)
      : "memory");
#  else
  // Neither NVHPC nor an SM_100a/SM_101a feature build: call the declared-only symbol so the failure is
  // reported at link time under a name that describes the problem.
  __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__();
#  endif
}
#endif // __cccl_ptx_isa >= 860
1118
+
1119
/*
// tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
//
// Inline-PTX wrapper for the instruction above (the .pack::16b form). The 16 destination registers are bound,
// in order, to out[0] .. out[15]; taddr is passed as the address operand. B32 is constrained to 4-byte types.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[16],
  uint32_t taddr);
*/
#if __cccl_ptx_isa >= 860
// Declaration only; it is referenced solely from the fallback branch of the wrapper below.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr)
{
  // Mirrors the enable_if constraint on _B32.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
  // %0 .. %15 are the outputs __out[0] .. __out[15]; %16 is the __taddr input.
  asm("tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
      "%14, %15}, [%16];"
      : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]),
        "=r"(__out[4]), "=r"(__out[5]), "=r"(__out[6]), "=r"(__out[7]),
        "=r"(__out[8]), "=r"(__out[9]), "=r"(__out[10]), "=r"(__out[11]),
        "=r"(__out[12]), "=r"(__out[13]), "=r"(__out[14]), "=r"(__out[15])
      : "r"(__taddr)
      : "memory");
#  else
  // Neither NVHPC nor an SM_100a/SM_101a feature build: call the declared-only symbol so the failure is
  // reported at link time under a name that describes the problem.
  __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
#  endif
}
#endif // __cccl_ptx_isa >= 860
1159
+
1160
/*
// tcgen05.ld.sync.aligned.16x128b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
//
// Inline-PTX wrapper for the instruction above. The 32 destination registers are bound, in order, to
// out[0] .. out[31]; taddr is passed as the address operand. B32 is constrained to 4-byte types.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[32],
  uint32_t taddr);
*/
#if __cccl_ptx_isa >= 860
// Declaration only; it is referenced solely from the fallback branch of the wrapper below.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr)
{
  // Mirrors the enable_if constraint on _B32.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
  // %0 .. %31 are the outputs __out[0] .. __out[31]; %32 is the __taddr input.
  asm(
    "tcgen05.ld.sync.aligned.16x128b.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, "
    "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];"
    : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]),
      "=r"(__out[4]), "=r"(__out[5]), "=r"(__out[6]), "=r"(__out[7]),
      "=r"(__out[8]), "=r"(__out[9]), "=r"(__out[10]), "=r"(__out[11]),
      "=r"(__out[12]), "=r"(__out[13]), "=r"(__out[14]), "=r"(__out[15]),
      "=r"(__out[16]), "=r"(__out[17]), "=r"(__out[18]), "=r"(__out[19]),
      "=r"(__out[20]), "=r"(__out[21]), "=r"(__out[22]), "=r"(__out[23]),
      "=r"(__out[24]), "=r"(__out[25]), "=r"(__out[26]), "=r"(__out[27]),
      "=r"(__out[28]), "=r"(__out[29]), "=r"(__out[30]), "=r"(__out[31])
    : "r"(__taddr)
    : "memory");
#  else
  // Neither NVHPC nor an SM_100a/SM_101a feature build: call the declared-only symbol so the failure is
  // reported at link time under a name that describes the problem.
  __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__();
#  endif
}
#endif // __cccl_ptx_isa >= 860
1217
+
1218
/*
// tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
//
// Inline-PTX wrapper for the instruction above (the .pack::16b form). The 32 destination registers are bound,
// in order, to out[0] .. out[31]; taddr is passed as the address operand. B32 is constrained to 4-byte types.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b_pack_16b(
  B32 (&out)[32],
  uint32_t taddr);
*/
#if __cccl_ptx_isa >= 860
// Declaration only; it is referenced solely from the fallback branch of the wrapper below.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr)
{
  // Mirrors the enable_if constraint on _B32.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
  // %0 .. %31 are the outputs __out[0] .. __out[31]; %32 is the __taddr input.
  asm(
    "tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
    "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];"
    : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]),
      "=r"(__out[4]), "=r"(__out[5]), "=r"(__out[6]), "=r"(__out[7]),
      "=r"(__out[8]), "=r"(__out[9]), "=r"(__out[10]), "=r"(__out[11]),
      "=r"(__out[12]), "=r"(__out[13]), "=r"(__out[14]), "=r"(__out[15]),
      "=r"(__out[16]), "=r"(__out[17]), "=r"(__out[18]), "=r"(__out[19]),
      "=r"(__out[20]), "=r"(__out[21]), "=r"(__out[22]), "=r"(__out[23]),
      "=r"(__out[24]), "=r"(__out[25]), "=r"(__out[26]), "=r"(__out[27]),
      "=r"(__out[28]), "=r"(__out[29]), "=r"(__out[30]), "=r"(__out[31])
    : "r"(__taddr)
    : "memory");
#  else
  // Neither NVHPC nor an SM_100a/SM_101a feature build: call the declared-only symbol so the failure is
  // reported at link time under a name that describes the problem.
  __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
#  endif
}
#endif // __cccl_ptx_isa >= 860
1275
+
1276
/*
// tcgen05.ld.sync.aligned.16x128b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
//
// Inline-PTX wrapper for the instruction above. The 64 destination registers are bound, in order, to
// out[0] .. out[63]; taddr is passed as the address operand. B32 is constrained to 4-byte types.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
__device__ static inline void tcgen05_ld_16x128b(
  B32 (&out)[64],
  uint32_t taddr);
*/
#if __cccl_ptx_isa >= 860
// Declaration only; it is referenced solely from the fallback branch of the wrapper below.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr)
{
  // Mirrors the enable_if constraint on _B32.
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
  // %0 .. %63 are the outputs __out[0] .. __out[63]; %64 is the __taddr input.
  asm(
    "tcgen05.ld.sync.aligned.16x128b.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, "
    "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, "
    "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, "
    "%60, %61, %62, %63}, [%64];"
    : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]),
      "=r"(__out[4]), "=r"(__out[5]), "=r"(__out[6]), "=r"(__out[7]),
      "=r"(__out[8]), "=r"(__out[9]), "=r"(__out[10]), "=r"(__out[11]),
      "=r"(__out[12]), "=r"(__out[13]), "=r"(__out[14]), "=r"(__out[15]),
      "=r"(__out[16]), "=r"(__out[17]), "=r"(__out[18]), "=r"(__out[19]),
      "=r"(__out[20]), "=r"(__out[21]), "=r"(__out[22]), "=r"(__out[23]),
      "=r"(__out[24]), "=r"(__out[25]), "=r"(__out[26]), "=r"(__out[27]),
      "=r"(__out[28]), "=r"(__out[29]), "=r"(__out[30]), "=r"(__out[31]),
      "=r"(__out[32]), "=r"(__out[33]), "=r"(__out[34]), "=r"(__out[35]),
      "=r"(__out[36]), "=r"(__out[37]), "=r"(__out[38]), "=r"(__out[39]),
      "=r"(__out[40]), "=r"(__out[41]), "=r"(__out[42]), "=r"(__out[43]),
      "=r"(__out[44]), "=r"(__out[45]), "=r"(__out[46]), "=r"(__out[47]),
      "=r"(__out[48]), "=r"(__out[49]), "=r"(__out[50]), "=r"(__out[51]),
      "=r"(__out[52]), "=r"(__out[53]), "=r"(__out[54]), "=r"(__out[55]),
      "=r"(__out[56]), "=r"(__out[57]), "=r"(__out[58]), "=r"(__out[59]),
      "=r"(__out[60]), "=r"(__out[61]), "=r"(__out[62]), "=r"(__out[63])
    : "r"(__taddr)
    : "memory");
#  else
  // Neither NVHPC nor an SM_100a/SM_101a feature build: call the declared-only symbol so the failure is
  // reported at link time under a name that describes the problem.
  __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__();
#  endif
}
#endif // __cccl_ptx_isa >= 860
1367
+
1368
+ /*
+ // Inline-asm wrapper for the .pack::16b form of the PTX instruction quoted
+ // below. It binds the 64 destination registers of the instruction to
+ // out[0..63] (in order) and passes taddr as the bracketed address operand.
+ //   out   : array of 64 elements; the element type must be exactly 4 bytes.
+ //   taddr : 32-bit address operand (presumably a tensor-memory address; confirm
+ //           against the PTX ISA description of tcgen05.ld).
+ // Guarded by __cccl_ptx_isa >= 860. On targets where none of the feature checks
+ // in the body hold, the body calls a declaration-only symbol instead, which per
+ // the note in that branch produces a link-time error.
1369
+ // tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
1370
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
1371
+ __device__ static inline void tcgen05_ld_16x128b_pack_16b(
1372
+ B32 (&out)[64],
1373
+ uint32_t taddr);
1374
+ */
1375
+ #if __cccl_ptx_isa >= 860
1376
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); // declaration only; called from the fallback branch below
1377
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
1378
+ _CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr)
1379
+ {
1380
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above
1381
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1382
+ asm(
1383
+ "tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
1384
+ "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, "
1385
+ "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, "
1386
+ "%58, %59, %60, %61, %62, %63}, [%64];"
1387
+ : "=r"(__out[0]), // outputs: out[i] is bound to placeholder %i as a 32-bit register
1388
+ "=r"(__out[1]),
1389
+ "=r"(__out[2]),
1390
+ "=r"(__out[3]),
1391
+ "=r"(__out[4]),
1392
+ "=r"(__out[5]),
1393
+ "=r"(__out[6]),
1394
+ "=r"(__out[7]),
1395
+ "=r"(__out[8]),
1396
+ "=r"(__out[9]),
1397
+ "=r"(__out[10]),
1398
+ "=r"(__out[11]),
1399
+ "=r"(__out[12]),
1400
+ "=r"(__out[13]),
1401
+ "=r"(__out[14]),
1402
+ "=r"(__out[15]),
1403
+ "=r"(__out[16]),
1404
+ "=r"(__out[17]),
1405
+ "=r"(__out[18]),
1406
+ "=r"(__out[19]),
1407
+ "=r"(__out[20]),
1408
+ "=r"(__out[21]),
1409
+ "=r"(__out[22]),
1410
+ "=r"(__out[23]),
1411
+ "=r"(__out[24]),
1412
+ "=r"(__out[25]),
1413
+ "=r"(__out[26]),
1414
+ "=r"(__out[27]),
1415
+ "=r"(__out[28]),
1416
+ "=r"(__out[29]),
1417
+ "=r"(__out[30]),
1418
+ "=r"(__out[31]),
1419
+ "=r"(__out[32]),
1420
+ "=r"(__out[33]),
1421
+ "=r"(__out[34]),
1422
+ "=r"(__out[35]),
1423
+ "=r"(__out[36]),
1424
+ "=r"(__out[37]),
1425
+ "=r"(__out[38]),
1426
+ "=r"(__out[39]),
1427
+ "=r"(__out[40]),
1428
+ "=r"(__out[41]),
1429
+ "=r"(__out[42]),
1430
+ "=r"(__out[43]),
1431
+ "=r"(__out[44]),
1432
+ "=r"(__out[45]),
1433
+ "=r"(__out[46]),
1434
+ "=r"(__out[47]),
1435
+ "=r"(__out[48]),
1436
+ "=r"(__out[49]),
1437
+ "=r"(__out[50]),
1438
+ "=r"(__out[51]),
1439
+ "=r"(__out[52]),
1440
+ "=r"(__out[53]),
1441
+ "=r"(__out[54]),
1442
+ "=r"(__out[55]),
1443
+ "=r"(__out[56]),
1444
+ "=r"(__out[57]),
1445
+ "=r"(__out[58]),
1446
+ "=r"(__out[59]),
1447
+ "=r"(__out[60]),
1448
+ "=r"(__out[61]),
1449
+ "=r"(__out[62]),
1450
+ "=r"(__out[63])
1451
+ : "r"(__taddr) // input: taddr is the last placeholder, used inside [..]
1452
+ : "memory"); // "memory" clobber: the asm may access memory not listed as an operand
1453
+ # else
1454
+ // No supported target detected: reference the declaration-only symbol so the build fails at link time with a descriptive symbol name.
1455
+ __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
1456
+ # endif
1457
+ }
1458
+ #endif // __cccl_ptx_isa >= 860
1459
+
1460
+ /*
+ // Inline-asm wrapper for the PTX instruction quoted below. It binds the 128
+ // destination registers of the instruction to out[0..127] (in order) and passes
+ // taddr as the bracketed address operand.
+ //   out   : array of 128 elements; the element type must be exactly 4 bytes.
+ //   taddr : 32-bit address operand (presumably a tensor-memory address; confirm
+ //           against the PTX ISA description of tcgen05.ld).
+ // Guarded by __cccl_ptx_isa >= 860. On targets where none of the feature checks
+ // in the body hold, the body calls a declaration-only symbol instead, which per
+ // the note in that branch produces a link-time error.
1461
+ // tcgen05.ld.sync.aligned.16x128b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
1462
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
1463
+ __device__ static inline void tcgen05_ld_16x128b(
1464
+ B32 (&out)[128],
1465
+ uint32_t taddr);
1466
+ */
1467
+ #if __cccl_ptx_isa >= 860
1468
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); // declaration only; called from the fallback branch below
1469
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
1470
+ _CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr)
1471
+ {
1472
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above
1473
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1474
+ asm(
1475
+ "tcgen05.ld.sync.aligned.16x128b.x64.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, "
1476
+ "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, "
1477
+ "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, "
1478
+ "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, "
1479
+ "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, "
1480
+ "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, "
1481
+ "%123, %124, %125, %126, %127}, [%128];"
1482
+ : "=r"(__out[0]), // outputs: out[i] is bound to placeholder %i as a 32-bit register
1483
+ "=r"(__out[1]),
1484
+ "=r"(__out[2]),
1485
+ "=r"(__out[3]),
1486
+ "=r"(__out[4]),
1487
+ "=r"(__out[5]),
1488
+ "=r"(__out[6]),
1489
+ "=r"(__out[7]),
1490
+ "=r"(__out[8]),
1491
+ "=r"(__out[9]),
1492
+ "=r"(__out[10]),
1493
+ "=r"(__out[11]),
1494
+ "=r"(__out[12]),
1495
+ "=r"(__out[13]),
1496
+ "=r"(__out[14]),
1497
+ "=r"(__out[15]),
1498
+ "=r"(__out[16]),
1499
+ "=r"(__out[17]),
1500
+ "=r"(__out[18]),
1501
+ "=r"(__out[19]),
1502
+ "=r"(__out[20]),
1503
+ "=r"(__out[21]),
1504
+ "=r"(__out[22]),
1505
+ "=r"(__out[23]),
1506
+ "=r"(__out[24]),
1507
+ "=r"(__out[25]),
1508
+ "=r"(__out[26]),
1509
+ "=r"(__out[27]),
1510
+ "=r"(__out[28]),
1511
+ "=r"(__out[29]),
1512
+ "=r"(__out[30]),
1513
+ "=r"(__out[31]),
1514
+ "=r"(__out[32]),
1515
+ "=r"(__out[33]),
1516
+ "=r"(__out[34]),
1517
+ "=r"(__out[35]),
1518
+ "=r"(__out[36]),
1519
+ "=r"(__out[37]),
1520
+ "=r"(__out[38]),
1521
+ "=r"(__out[39]),
1522
+ "=r"(__out[40]),
1523
+ "=r"(__out[41]),
1524
+ "=r"(__out[42]),
1525
+ "=r"(__out[43]),
1526
+ "=r"(__out[44]),
1527
+ "=r"(__out[45]),
1528
+ "=r"(__out[46]),
1529
+ "=r"(__out[47]),
1530
+ "=r"(__out[48]),
1531
+ "=r"(__out[49]),
1532
+ "=r"(__out[50]),
1533
+ "=r"(__out[51]),
1534
+ "=r"(__out[52]),
1535
+ "=r"(__out[53]),
1536
+ "=r"(__out[54]),
1537
+ "=r"(__out[55]),
1538
+ "=r"(__out[56]),
1539
+ "=r"(__out[57]),
1540
+ "=r"(__out[58]),
1541
+ "=r"(__out[59]),
1542
+ "=r"(__out[60]),
1543
+ "=r"(__out[61]),
1544
+ "=r"(__out[62]),
1545
+ "=r"(__out[63]),
1546
+ "=r"(__out[64]),
1547
+ "=r"(__out[65]),
1548
+ "=r"(__out[66]),
1549
+ "=r"(__out[67]),
1550
+ "=r"(__out[68]),
1551
+ "=r"(__out[69]),
1552
+ "=r"(__out[70]),
1553
+ "=r"(__out[71]),
1554
+ "=r"(__out[72]),
1555
+ "=r"(__out[73]),
1556
+ "=r"(__out[74]),
1557
+ "=r"(__out[75]),
1558
+ "=r"(__out[76]),
1559
+ "=r"(__out[77]),
1560
+ "=r"(__out[78]),
1561
+ "=r"(__out[79]),
1562
+ "=r"(__out[80]),
1563
+ "=r"(__out[81]),
1564
+ "=r"(__out[82]),
1565
+ "=r"(__out[83]),
1566
+ "=r"(__out[84]),
1567
+ "=r"(__out[85]),
1568
+ "=r"(__out[86]),
1569
+ "=r"(__out[87]),
1570
+ "=r"(__out[88]),
1571
+ "=r"(__out[89]),
1572
+ "=r"(__out[90]),
1573
+ "=r"(__out[91]),
1574
+ "=r"(__out[92]),
1575
+ "=r"(__out[93]),
1576
+ "=r"(__out[94]),
1577
+ "=r"(__out[95]),
1578
+ "=r"(__out[96]),
1579
+ "=r"(__out[97]),
1580
+ "=r"(__out[98]),
1581
+ "=r"(__out[99]),
1582
+ "=r"(__out[100]),
1583
+ "=r"(__out[101]),
1584
+ "=r"(__out[102]),
1585
+ "=r"(__out[103]),
1586
+ "=r"(__out[104]),
1587
+ "=r"(__out[105]),
1588
+ "=r"(__out[106]),
1589
+ "=r"(__out[107]),
1590
+ "=r"(__out[108]),
1591
+ "=r"(__out[109]),
1592
+ "=r"(__out[110]),
1593
+ "=r"(__out[111]),
1594
+ "=r"(__out[112]),
1595
+ "=r"(__out[113]),
1596
+ "=r"(__out[114]),
1597
+ "=r"(__out[115]),
1598
+ "=r"(__out[116]),
1599
+ "=r"(__out[117]),
1600
+ "=r"(__out[118]),
1601
+ "=r"(__out[119]),
1602
+ "=r"(__out[120]),
1603
+ "=r"(__out[121]),
1604
+ "=r"(__out[122]),
1605
+ "=r"(__out[123]),
1606
+ "=r"(__out[124]),
1607
+ "=r"(__out[125]),
1608
+ "=r"(__out[126]),
1609
+ "=r"(__out[127])
1610
+ : "r"(__taddr) // input: taddr is the last placeholder, used inside [..]
1611
+ : "memory"); // "memory" clobber: the asm may access memory not listed as an operand
1612
+ # else
1613
+ // No supported target detected: reference the declaration-only symbol so the build fails at link time with a descriptive symbol name.
1614
+ __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__();
1615
+ # endif
1616
+ }
1617
+ #endif // __cccl_ptx_isa >= 860
1618
+
1619
+ /*
+ // Inline-asm wrapper for the .pack::16b form of the PTX instruction quoted
+ // below. It binds the 128 destination registers of the instruction to
+ // out[0..127] (in order) and passes taddr as the bracketed address operand.
+ //   out   : array of 128 elements; the element type must be exactly 4 bytes.
+ //   taddr : 32-bit address operand (presumably a tensor-memory address; confirm
+ //           against the PTX ISA description of tcgen05.ld).
+ // Guarded by __cccl_ptx_isa >= 860. On targets where none of the feature checks
+ // in the body hold, the body calls a declaration-only symbol instead, which per
+ // the note in that branch produces a link-time error.
1620
+ // tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
1621
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
1622
+ __device__ static inline void tcgen05_ld_16x128b_pack_16b(
1623
+ B32 (&out)[128],
1624
+ uint32_t taddr);
1625
+ */
1626
+ #if __cccl_ptx_isa >= 860
1627
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); // declaration only; called from the fallback branch below
1628
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
1629
+ _CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr)
1630
+ {
1631
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above
1632
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1633
+ asm(
1634
+ "tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
1635
+ "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, "
1636
+ "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, "
1637
+ "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, "
1638
+ "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, "
1639
+ "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, "
1640
+ "%121, %122, %123, %124, %125, %126, %127}, [%128];"
1641
+ : "=r"(__out[0]), // outputs: out[i] is bound to placeholder %i as a 32-bit register
1642
+ "=r"(__out[1]),
1643
+ "=r"(__out[2]),
1644
+ "=r"(__out[3]),
1645
+ "=r"(__out[4]),
1646
+ "=r"(__out[5]),
1647
+ "=r"(__out[6]),
1648
+ "=r"(__out[7]),
1649
+ "=r"(__out[8]),
1650
+ "=r"(__out[9]),
1651
+ "=r"(__out[10]),
1652
+ "=r"(__out[11]),
1653
+ "=r"(__out[12]),
1654
+ "=r"(__out[13]),
1655
+ "=r"(__out[14]),
1656
+ "=r"(__out[15]),
1657
+ "=r"(__out[16]),
1658
+ "=r"(__out[17]),
1659
+ "=r"(__out[18]),
1660
+ "=r"(__out[19]),
1661
+ "=r"(__out[20]),
1662
+ "=r"(__out[21]),
1663
+ "=r"(__out[22]),
1664
+ "=r"(__out[23]),
1665
+ "=r"(__out[24]),
1666
+ "=r"(__out[25]),
1667
+ "=r"(__out[26]),
1668
+ "=r"(__out[27]),
1669
+ "=r"(__out[28]),
1670
+ "=r"(__out[29]),
1671
+ "=r"(__out[30]),
1672
+ "=r"(__out[31]),
1673
+ "=r"(__out[32]),
1674
+ "=r"(__out[33]),
1675
+ "=r"(__out[34]),
1676
+ "=r"(__out[35]),
1677
+ "=r"(__out[36]),
1678
+ "=r"(__out[37]),
1679
+ "=r"(__out[38]),
1680
+ "=r"(__out[39]),
1681
+ "=r"(__out[40]),
1682
+ "=r"(__out[41]),
1683
+ "=r"(__out[42]),
1684
+ "=r"(__out[43]),
1685
+ "=r"(__out[44]),
1686
+ "=r"(__out[45]),
1687
+ "=r"(__out[46]),
1688
+ "=r"(__out[47]),
1689
+ "=r"(__out[48]),
1690
+ "=r"(__out[49]),
1691
+ "=r"(__out[50]),
1692
+ "=r"(__out[51]),
1693
+ "=r"(__out[52]),
1694
+ "=r"(__out[53]),
1695
+ "=r"(__out[54]),
1696
+ "=r"(__out[55]),
1697
+ "=r"(__out[56]),
1698
+ "=r"(__out[57]),
1699
+ "=r"(__out[58]),
1700
+ "=r"(__out[59]),
1701
+ "=r"(__out[60]),
1702
+ "=r"(__out[61]),
1703
+ "=r"(__out[62]),
1704
+ "=r"(__out[63]),
1705
+ "=r"(__out[64]),
1706
+ "=r"(__out[65]),
1707
+ "=r"(__out[66]),
1708
+ "=r"(__out[67]),
1709
+ "=r"(__out[68]),
1710
+ "=r"(__out[69]),
1711
+ "=r"(__out[70]),
1712
+ "=r"(__out[71]),
1713
+ "=r"(__out[72]),
1714
+ "=r"(__out[73]),
1715
+ "=r"(__out[74]),
1716
+ "=r"(__out[75]),
1717
+ "=r"(__out[76]),
1718
+ "=r"(__out[77]),
1719
+ "=r"(__out[78]),
1720
+ "=r"(__out[79]),
1721
+ "=r"(__out[80]),
1722
+ "=r"(__out[81]),
1723
+ "=r"(__out[82]),
1724
+ "=r"(__out[83]),
1725
+ "=r"(__out[84]),
1726
+ "=r"(__out[85]),
1727
+ "=r"(__out[86]),
1728
+ "=r"(__out[87]),
1729
+ "=r"(__out[88]),
1730
+ "=r"(__out[89]),
1731
+ "=r"(__out[90]),
1732
+ "=r"(__out[91]),
1733
+ "=r"(__out[92]),
1734
+ "=r"(__out[93]),
1735
+ "=r"(__out[94]),
1736
+ "=r"(__out[95]),
1737
+ "=r"(__out[96]),
1738
+ "=r"(__out[97]),
1739
+ "=r"(__out[98]),
1740
+ "=r"(__out[99]),
1741
+ "=r"(__out[100]),
1742
+ "=r"(__out[101]),
1743
+ "=r"(__out[102]),
1744
+ "=r"(__out[103]),
1745
+ "=r"(__out[104]),
1746
+ "=r"(__out[105]),
1747
+ "=r"(__out[106]),
1748
+ "=r"(__out[107]),
1749
+ "=r"(__out[108]),
1750
+ "=r"(__out[109]),
1751
+ "=r"(__out[110]),
1752
+ "=r"(__out[111]),
1753
+ "=r"(__out[112]),
1754
+ "=r"(__out[113]),
1755
+ "=r"(__out[114]),
1756
+ "=r"(__out[115]),
1757
+ "=r"(__out[116]),
1758
+ "=r"(__out[117]),
1759
+ "=r"(__out[118]),
1760
+ "=r"(__out[119]),
1761
+ "=r"(__out[120]),
1762
+ "=r"(__out[121]),
1763
+ "=r"(__out[122]),
1764
+ "=r"(__out[123]),
1765
+ "=r"(__out[124]),
1766
+ "=r"(__out[125]),
1767
+ "=r"(__out[126]),
1768
+ "=r"(__out[127])
1769
+ : "r"(__taddr) // input: taddr is the last placeholder, used inside [..]
1770
+ : "memory"); // "memory" clobber: the asm may access memory not listed as an operand
1771
+ # else
1772
+ // No supported target detected: reference the declaration-only symbol so the build fails at link time with a descriptive symbol name.
1773
+ __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
1774
+ # endif
1775
+ }
1776
+ #endif // __cccl_ptx_isa >= 860
1777
+
1778
+ /*
+ // Inline-asm wrapper for the PTX instruction quoted below. It binds the 4
+ // destination registers of the instruction to out[0..3] (in order) and passes
+ // taddr as the bracketed address operand.
+ //   out   : array of 4 elements; the element type must be exactly 4 bytes.
+ //   taddr : 32-bit address operand (presumably a tensor-memory address; confirm
+ //           against the PTX ISA description of tcgen05.ld).
+ // Guarded by __cccl_ptx_isa >= 860. On targets where none of the feature checks
+ // in the body hold, the body calls a declaration-only symbol instead, which per
+ // the note in that branch produces a link-time error.
1779
+ // tcgen05.ld.sync.aligned.16x256b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
1780
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
1781
+ __device__ static inline void tcgen05_ld_16x256b(
1782
+ B32 (&out)[4],
1783
+ uint32_t taddr);
1784
+ */
1785
+ #if __cccl_ptx_isa >= 860
1786
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); // declaration only; called from the fallback branch below
1787
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
1788
+ _CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr)
1789
+ {
1790
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above
1791
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1792
+ asm("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4];"
1793
+ : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) // outputs: out[i] is bound to placeholder %i as a 32-bit register
1794
+ : "r"(__taddr) // input: taddr is the last placeholder, used inside [..]
1795
+ : "memory"); // "memory" clobber: the asm may access memory not listed as an operand
1796
+ # else
1797
+ // No supported target detected: reference the declaration-only symbol so the build fails at link time with a descriptive symbol name.
1798
+ __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__();
1799
+ # endif
1800
+ }
1801
+ #endif // __cccl_ptx_isa >= 860
1802
+
1803
+ /*
+ // Inline-asm wrapper for the .pack::16b form of the PTX instruction quoted
+ // below. It binds the 4 destination registers of the instruction to out[0..3]
+ // (in order) and passes taddr as the bracketed address operand.
+ //   out   : array of 4 elements; the element type must be exactly 4 bytes.
+ //   taddr : 32-bit address operand (presumably a tensor-memory address; confirm
+ //           against the PTX ISA description of tcgen05.ld).
+ // Guarded by __cccl_ptx_isa >= 860. On targets where none of the feature checks
+ // in the body hold, the body calls a declaration-only symbol instead, which per
+ // the note in that branch produces a link-time error.
1804
+ // tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
1805
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
1806
+ __device__ static inline void tcgen05_ld_16x256b_pack_16b(
1807
+ B32 (&out)[4],
1808
+ uint32_t taddr);
1809
+ */
1810
+ #if __cccl_ptx_isa >= 860
1811
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); // declaration only; called from the fallback branch below
1812
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
1813
+ _CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr)
1814
+ {
1815
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above
1816
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1817
+ asm("tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 {%0, %1, %2, %3}, [%4];"
1818
+ : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) // outputs: out[i] is bound to placeholder %i as a 32-bit register
1819
+ : "r"(__taddr) // input: taddr is the last placeholder, used inside [..]
1820
+ : "memory"); // "memory" clobber: the asm may access memory not listed as an operand
1821
+ # else
1822
+ // No supported target detected: reference the declaration-only symbol so the build fails at link time with a descriptive symbol name.
1823
+ __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
1824
+ # endif
1825
+ }
1826
+ #endif // __cccl_ptx_isa >= 860
1827
+
1828
+ /*
+ // Inline-asm wrapper for the PTX instruction quoted below. It binds the 8
+ // destination registers of the instruction to out[0..7] (in order) and passes
+ // taddr as the bracketed address operand.
+ //   out   : array of 8 elements; the element type must be exactly 4 bytes.
+ //   taddr : 32-bit address operand (presumably a tensor-memory address; confirm
+ //           against the PTX ISA description of tcgen05.ld).
+ // Guarded by __cccl_ptx_isa >= 860. On targets where none of the feature checks
+ // in the body hold, the body calls a declaration-only symbol instead, which per
+ // the note in that branch produces a link-time error.
1829
+ // tcgen05.ld.sync.aligned.16x256b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
1830
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
1831
+ __device__ static inline void tcgen05_ld_16x256b(
1832
+ B32 (&out)[8],
1833
+ uint32_t taddr);
1834
+ */
1835
+ #if __cccl_ptx_isa >= 860
1836
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); // declaration only; called from the fallback branch below
1837
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
1838
+ _CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr)
1839
+ {
1840
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above
1841
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1842
+ asm("tcgen05.ld.sync.aligned.16x256b.x2.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];"
1843
+ : "=r"(__out[0]), // outputs: out[i] is bound to placeholder %i as a 32-bit register
1844
+ "=r"(__out[1]),
1845
+ "=r"(__out[2]),
1846
+ "=r"(__out[3]),
1847
+ "=r"(__out[4]),
1848
+ "=r"(__out[5]),
1849
+ "=r"(__out[6]),
1850
+ "=r"(__out[7])
1851
+ : "r"(__taddr) // input: taddr is the last placeholder, used inside [..]
1852
+ : "memory"); // "memory" clobber: the asm may access memory not listed as an operand
1853
+ # else
1854
+ // No supported target detected: reference the declaration-only symbol so the build fails at link time with a descriptive symbol name.
1855
+ __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__();
1856
+ # endif
1857
+ }
1858
+ #endif // __cccl_ptx_isa >= 860
1859
+
1860
+ /*
+ // Inline-asm wrapper for the .pack::16b form of the PTX instruction quoted
+ // below. It binds the 8 destination registers of the instruction to out[0..7]
+ // (in order) and passes taddr as the bracketed address operand.
+ //   out   : array of 8 elements; the element type must be exactly 4 bytes.
+ //   taddr : 32-bit address operand (presumably a tensor-memory address; confirm
+ //           against the PTX ISA description of tcgen05.ld).
+ // Guarded by __cccl_ptx_isa >= 860. On targets where none of the feature checks
+ // in the body hold, the body calls a declaration-only symbol instead, which per
+ // the note in that branch produces a link-time error.
1861
+ // tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
1862
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
1863
+ __device__ static inline void tcgen05_ld_16x256b_pack_16b(
1864
+ B32 (&out)[8],
1865
+ uint32_t taddr);
1866
+ */
1867
+ #if __cccl_ptx_isa >= 860
1868
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); // declaration only; called from the fallback branch below
1869
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
1870
+ _CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr)
1871
+ {
1872
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above
1873
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1874
+ asm("tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];"
1875
+ : "=r"(__out[0]), // outputs: out[i] is bound to placeholder %i as a 32-bit register
1876
+ "=r"(__out[1]),
1877
+ "=r"(__out[2]),
1878
+ "=r"(__out[3]),
1879
+ "=r"(__out[4]),
1880
+ "=r"(__out[5]),
1881
+ "=r"(__out[6]),
1882
+ "=r"(__out[7])
1883
+ : "r"(__taddr) // input: taddr is the last placeholder, used inside [..]
1884
+ : "memory"); // "memory" clobber: the asm may access memory not listed as an operand
1885
+ # else
1886
+ // No supported target detected: reference the declaration-only symbol so the build fails at link time with a descriptive symbol name.
1887
+ __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
1888
+ # endif
1889
+ }
1890
+ #endif // __cccl_ptx_isa >= 860
1891
+
1892
+ /*
+ // Inline-asm wrapper for the PTX instruction quoted below. It binds the 16
+ // destination registers of the instruction to out[0..15] (in order) and passes
+ // taddr as the bracketed address operand.
+ //   out   : array of 16 elements; the element type must be exactly 4 bytes.
+ //   taddr : 32-bit address operand (presumably a tensor-memory address; confirm
+ //           against the PTX ISA description of tcgen05.ld).
+ // Guarded by __cccl_ptx_isa >= 860. On targets where none of the feature checks
+ // in the body hold, the body calls a declaration-only symbol instead, which per
+ // the note in that branch produces a link-time error.
1893
+ // tcgen05.ld.sync.aligned.16x256b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
1894
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
1895
+ __device__ static inline void tcgen05_ld_16x256b(
1896
+ B32 (&out)[16],
1897
+ uint32_t taddr);
1898
+ */
1899
+ #if __cccl_ptx_isa >= 860
1900
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); // declaration only; called from the fallback branch below
1901
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
1902
+ _CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr)
1903
+ {
1904
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above
1905
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1906
+ asm("tcgen05.ld.sync.aligned.16x256b.x4.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15}, "
1907
+ "[%16];"
1908
+ : "=r"(__out[0]), // outputs: out[i] is bound to placeholder %i as a 32-bit register
1909
+ "=r"(__out[1]),
1910
+ "=r"(__out[2]),
1911
+ "=r"(__out[3]),
1912
+ "=r"(__out[4]),
1913
+ "=r"(__out[5]),
1914
+ "=r"(__out[6]),
1915
+ "=r"(__out[7]),
1916
+ "=r"(__out[8]),
1917
+ "=r"(__out[9]),
1918
+ "=r"(__out[10]),
1919
+ "=r"(__out[11]),
1920
+ "=r"(__out[12]),
1921
+ "=r"(__out[13]),
1922
+ "=r"(__out[14]),
1923
+ "=r"(__out[15])
1924
+ : "r"(__taddr) // input: taddr is the last placeholder, used inside [..]
1925
+ : "memory"); // "memory" clobber: the asm may access memory not listed as an operand
1926
+ # else
1927
+ // No supported target detected: reference the declaration-only symbol so the build fails at link time with a descriptive symbol name.
1928
+ __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__();
1929
+ # endif
1930
+ }
1931
+ #endif // __cccl_ptx_isa >= 860
1932
+
1933
+ /*
+ // Inline-asm wrapper for the .pack::16b form of the PTX instruction quoted
+ // below. It binds the 16 destination registers of the instruction to
+ // out[0..15] (in order) and passes taddr as the bracketed address operand.
+ //   out   : array of 16 elements; the element type must be exactly 4 bytes.
+ //   taddr : 32-bit address operand (presumably a tensor-memory address; confirm
+ //           against the PTX ISA description of tcgen05.ld).
+ // Guarded by __cccl_ptx_isa >= 860. On targets where none of the feature checks
+ // in the body hold, the body calls a declaration-only symbol instead, which per
+ // the note in that branch produces a link-time error.
1934
+ // tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
1935
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
1936
+ __device__ static inline void tcgen05_ld_16x256b_pack_16b(
1937
+ B32 (&out)[16],
1938
+ uint32_t taddr);
1939
+ */
1940
+ #if __cccl_ptx_isa >= 860
1941
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); // declaration only; called from the fallback branch below
1942
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
1943
+ _CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr)
1944
+ {
1945
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above
1946
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1947
+ asm("tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
1948
+ "%14, %15}, [%16];"
1949
+ : "=r"(__out[0]), // outputs: out[i] is bound to placeholder %i as a 32-bit register
1950
+ "=r"(__out[1]),
1951
+ "=r"(__out[2]),
1952
+ "=r"(__out[3]),
1953
+ "=r"(__out[4]),
1954
+ "=r"(__out[5]),
1955
+ "=r"(__out[6]),
1956
+ "=r"(__out[7]),
1957
+ "=r"(__out[8]),
1958
+ "=r"(__out[9]),
1959
+ "=r"(__out[10]),
1960
+ "=r"(__out[11]),
1961
+ "=r"(__out[12]),
1962
+ "=r"(__out[13]),
1963
+ "=r"(__out[14]),
1964
+ "=r"(__out[15])
1965
+ : "r"(__taddr) // input: taddr is the last placeholder, used inside [..]
1966
+ : "memory"); // "memory" clobber: the asm may access memory not listed as an operand
1967
+ # else
1968
+ // No supported target detected: reference the declaration-only symbol so the build fails at link time with a descriptive symbol name.
1969
+ __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
1970
+ # endif
1971
+ }
1972
+ #endif // __cccl_ptx_isa >= 860
1973
+
1974
+ /*
+ // Inline-asm wrapper for the PTX instruction quoted below. It binds the 32
+ // destination registers of the instruction to out[0..31] (in order) and passes
+ // taddr as the bracketed address operand.
+ //   out   : array of 32 elements; the element type must be exactly 4 bytes.
+ //   taddr : 32-bit address operand (presumably a tensor-memory address; confirm
+ //           against the PTX ISA description of tcgen05.ld).
+ // Guarded by __cccl_ptx_isa >= 860. On targets where none of the feature checks
+ // in the body hold, the body calls a declaration-only symbol instead, which per
+ // the note in that branch produces a link-time error.
1975
+ // tcgen05.ld.sync.aligned.16x256b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
1976
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
1977
+ __device__ static inline void tcgen05_ld_16x256b(
1978
+ B32 (&out)[32],
1979
+ uint32_t taddr);
1980
+ */
1981
+ #if __cccl_ptx_isa >= 860
1982
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); // declaration only; called from the fallback branch below
1983
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
1984
+ _CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr)
1985
+ {
1986
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above
1987
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
1988
+ asm(
1989
+ "tcgen05.ld.sync.aligned.16x256b.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, "
1990
+ "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];"
1991
+ : "=r"(__out[0]), // outputs: out[i] is bound to placeholder %i as a 32-bit register
1992
+ "=r"(__out[1]),
1993
+ "=r"(__out[2]),
1994
+ "=r"(__out[3]),
1995
+ "=r"(__out[4]),
1996
+ "=r"(__out[5]),
1997
+ "=r"(__out[6]),
1998
+ "=r"(__out[7]),
1999
+ "=r"(__out[8]),
2000
+ "=r"(__out[9]),
2001
+ "=r"(__out[10]),
2002
+ "=r"(__out[11]),
2003
+ "=r"(__out[12]),
2004
+ "=r"(__out[13]),
2005
+ "=r"(__out[14]),
2006
+ "=r"(__out[15]),
2007
+ "=r"(__out[16]),
2008
+ "=r"(__out[17]),
2009
+ "=r"(__out[18]),
2010
+ "=r"(__out[19]),
2011
+ "=r"(__out[20]),
2012
+ "=r"(__out[21]),
2013
+ "=r"(__out[22]),
2014
+ "=r"(__out[23]),
2015
+ "=r"(__out[24]),
2016
+ "=r"(__out[25]),
2017
+ "=r"(__out[26]),
2018
+ "=r"(__out[27]),
2019
+ "=r"(__out[28]),
2020
+ "=r"(__out[29]),
2021
+ "=r"(__out[30]),
2022
+ "=r"(__out[31])
2023
+ : "r"(__taddr) // input: taddr is the last placeholder, used inside [..]
2024
+ : "memory"); // "memory" clobber: the asm may access memory not listed as an operand
2025
+ # else
2026
+ // No supported target detected: reference the declaration-only symbol so the build fails at link time with a descriptive symbol name.
2027
+ __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__();
2028
+ # endif
2029
+ }
2030
+ #endif // __cccl_ptx_isa >= 860
2031
+
2032
+ /*
+ // Inline-asm wrapper for the .pack::16b form of the PTX instruction quoted
+ // below. It binds the 32 destination registers of the instruction to
+ // out[0..31] (in order) and passes taddr as the bracketed address operand.
+ //   out   : array of 32 elements; the element type must be exactly 4 bytes.
+ //   taddr : 32-bit address operand (presumably a tensor-memory address; confirm
+ //           against the PTX ISA description of tcgen05.ld).
+ // Guarded by __cccl_ptx_isa >= 860. On targets where none of the feature checks
+ // in the body hold, the body calls a declaration-only symbol instead, which per
+ // the note in that branch produces a link-time error.
2033
+ // tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2034
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2035
+ __device__ static inline void tcgen05_ld_16x256b_pack_16b(
2036
+ B32 (&out)[32],
2037
+ uint32_t taddr);
2038
+ */
2039
+ #if __cccl_ptx_isa >= 860
2040
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); // declaration only; called from the fallback branch below
2041
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2042
+ _CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr)
2043
+ {
2044
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above
2045
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2046
+ asm(
2047
+ "tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
2048
+ "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];"
2049
+ : "=r"(__out[0]), // outputs: out[i] is bound to placeholder %i as a 32-bit register
2050
+ "=r"(__out[1]),
2051
+ "=r"(__out[2]),
2052
+ "=r"(__out[3]),
2053
+ "=r"(__out[4]),
2054
+ "=r"(__out[5]),
2055
+ "=r"(__out[6]),
2056
+ "=r"(__out[7]),
2057
+ "=r"(__out[8]),
2058
+ "=r"(__out[9]),
2059
+ "=r"(__out[10]),
2060
+ "=r"(__out[11]),
2061
+ "=r"(__out[12]),
2062
+ "=r"(__out[13]),
2063
+ "=r"(__out[14]),
2064
+ "=r"(__out[15]),
2065
+ "=r"(__out[16]),
2066
+ "=r"(__out[17]),
2067
+ "=r"(__out[18]),
2068
+ "=r"(__out[19]),
2069
+ "=r"(__out[20]),
2070
+ "=r"(__out[21]),
2071
+ "=r"(__out[22]),
2072
+ "=r"(__out[23]),
2073
+ "=r"(__out[24]),
2074
+ "=r"(__out[25]),
2075
+ "=r"(__out[26]),
2076
+ "=r"(__out[27]),
2077
+ "=r"(__out[28]),
2078
+ "=r"(__out[29]),
2079
+ "=r"(__out[30]),
2080
+ "=r"(__out[31])
2081
+ : "r"(__taddr) // input: taddr is the last placeholder, used inside [..]
2082
+ : "memory"); // "memory" clobber: the asm may access memory not listed as an operand
2083
+ # else
2084
+ // No supported target detected: reference the declaration-only symbol so the build fails at link time with a descriptive symbol name.
2085
+ __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
2086
+ # endif
2087
+ }
2088
+ #endif // __cccl_ptx_isa >= 860
2089
+
2090
+ /*
+ // Inline-asm wrapper for the PTX instruction quoted below. It binds the 64
+ // destination registers of the instruction to out[0..63] (in order) and passes
+ // taddr as the bracketed address operand.
+ //   out   : array of 64 elements; the element type must be exactly 4 bytes.
+ //   taddr : 32-bit address operand (presumably a tensor-memory address; confirm
+ //           against the PTX ISA description of tcgen05.ld).
+ // Guarded by __cccl_ptx_isa >= 860. On targets where none of the feature checks
+ // in the body hold, the body calls a declaration-only symbol instead, which per
+ // the note in that branch produces a link-time error.
2091
+ // tcgen05.ld.sync.aligned.16x256b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2092
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2093
+ __device__ static inline void tcgen05_ld_16x256b(
2094
+ B32 (&out)[64],
2095
+ uint32_t taddr);
2096
+ */
2097
+ #if __cccl_ptx_isa >= 860
2098
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); // declaration only; called from the fallback branch below
2099
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2100
+ _CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr)
2101
+ {
2102
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above
2103
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2104
+ asm(
2105
+ "tcgen05.ld.sync.aligned.16x256b.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, "
2106
+ "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, "
2107
+ "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, "
2108
+ "%60, %61, %62, %63}, [%64];"
2109
+ : "=r"(__out[0]), // outputs: out[i] is bound to placeholder %i as a 32-bit register
2110
+ "=r"(__out[1]),
2111
+ "=r"(__out[2]),
2112
+ "=r"(__out[3]),
2113
+ "=r"(__out[4]),
2114
+ "=r"(__out[5]),
2115
+ "=r"(__out[6]),
2116
+ "=r"(__out[7]),
2117
+ "=r"(__out[8]),
2118
+ "=r"(__out[9]),
2119
+ "=r"(__out[10]),
2120
+ "=r"(__out[11]),
2121
+ "=r"(__out[12]),
2122
+ "=r"(__out[13]),
2123
+ "=r"(__out[14]),
2124
+ "=r"(__out[15]),
2125
+ "=r"(__out[16]),
2126
+ "=r"(__out[17]),
2127
+ "=r"(__out[18]),
2128
+ "=r"(__out[19]),
2129
+ "=r"(__out[20]),
2130
+ "=r"(__out[21]),
2131
+ "=r"(__out[22]),
2132
+ "=r"(__out[23]),
2133
+ "=r"(__out[24]),
2134
+ "=r"(__out[25]),
2135
+ "=r"(__out[26]),
2136
+ "=r"(__out[27]),
2137
+ "=r"(__out[28]),
2138
+ "=r"(__out[29]),
2139
+ "=r"(__out[30]),
2140
+ "=r"(__out[31]),
2141
+ "=r"(__out[32]),
2142
+ "=r"(__out[33]),
2143
+ "=r"(__out[34]),
2144
+ "=r"(__out[35]),
2145
+ "=r"(__out[36]),
2146
+ "=r"(__out[37]),
2147
+ "=r"(__out[38]),
2148
+ "=r"(__out[39]),
2149
+ "=r"(__out[40]),
2150
+ "=r"(__out[41]),
2151
+ "=r"(__out[42]),
2152
+ "=r"(__out[43]),
2153
+ "=r"(__out[44]),
2154
+ "=r"(__out[45]),
2155
+ "=r"(__out[46]),
2156
+ "=r"(__out[47]),
2157
+ "=r"(__out[48]),
2158
+ "=r"(__out[49]),
2159
+ "=r"(__out[50]),
2160
+ "=r"(__out[51]),
2161
+ "=r"(__out[52]),
2162
+ "=r"(__out[53]),
2163
+ "=r"(__out[54]),
2164
+ "=r"(__out[55]),
2165
+ "=r"(__out[56]),
2166
+ "=r"(__out[57]),
2167
+ "=r"(__out[58]),
2168
+ "=r"(__out[59]),
2169
+ "=r"(__out[60]),
2170
+ "=r"(__out[61]),
2171
+ "=r"(__out[62]),
2172
+ "=r"(__out[63])
2173
+ : "r"(__taddr) // input: taddr is the last placeholder, used inside [..]
2174
+ : "memory"); // "memory" clobber: the asm may access memory not listed as an operand
2175
+ # else
2176
+ // No supported target detected: reference the declaration-only symbol so the build fails at link time with a descriptive symbol name.
2177
+ __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__();
2178
+ # endif
2179
+ }
2180
+ #endif // __cccl_ptx_isa >= 860
2181
+
2182
+ /*
+ // Inline-asm wrapper for the .pack::16b form of the PTX instruction quoted
+ // below. It binds the 64 destination registers of the instruction to
+ // out[0..63] (in order) and passes taddr as the bracketed address operand.
+ //   out   : array of 64 elements; the element type must be exactly 4 bytes.
+ //   taddr : 32-bit address operand (presumably a tensor-memory address; confirm
+ //           against the PTX ISA description of tcgen05.ld).
+ // Guarded by __cccl_ptx_isa >= 860. On targets where none of the feature checks
+ // in the body hold, the body calls a declaration-only symbol instead, which per
+ // the note in that branch produces a link-time error.
2183
+ // tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2184
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2185
+ __device__ static inline void tcgen05_ld_16x256b_pack_16b(
2186
+ B32 (&out)[64],
2187
+ uint32_t taddr);
2188
+ */
2189
+ #if __cccl_ptx_isa >= 860
2190
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); // declaration only; called from the fallback branch below
2191
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2192
+ _CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr)
2193
+ {
2194
+ static_assert(sizeof(_B32) == 4, ""); // same 4-byte requirement as the enable_if above
2195
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2196
+ asm(
2197
+ "tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
2198
+ "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, "
2199
+ "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, "
2200
+ "%58, %59, %60, %61, %62, %63}, [%64];"
2201
+ : "=r"(__out[0]), // outputs: out[i] is bound to placeholder %i as a 32-bit register
2202
+ "=r"(__out[1]),
2203
+ "=r"(__out[2]),
2204
+ "=r"(__out[3]),
2205
+ "=r"(__out[4]),
2206
+ "=r"(__out[5]),
2207
+ "=r"(__out[6]),
2208
+ "=r"(__out[7]),
2209
+ "=r"(__out[8]),
2210
+ "=r"(__out[9]),
2211
+ "=r"(__out[10]),
2212
+ "=r"(__out[11]),
2213
+ "=r"(__out[12]),
2214
+ "=r"(__out[13]),
2215
+ "=r"(__out[14]),
2216
+ "=r"(__out[15]),
2217
+ "=r"(__out[16]),
2218
+ "=r"(__out[17]),
2219
+ "=r"(__out[18]),
2220
+ "=r"(__out[19]),
2221
+ "=r"(__out[20]),
2222
+ "=r"(__out[21]),
2223
+ "=r"(__out[22]),
2224
+ "=r"(__out[23]),
2225
+ "=r"(__out[24]),
2226
+ "=r"(__out[25]),
2227
+ "=r"(__out[26]),
2228
+ "=r"(__out[27]),
2229
+ "=r"(__out[28]),
2230
+ "=r"(__out[29]),
2231
+ "=r"(__out[30]),
2232
+ "=r"(__out[31]),
2233
+ "=r"(__out[32]),
2234
+ "=r"(__out[33]),
2235
+ "=r"(__out[34]),
2236
+ "=r"(__out[35]),
2237
+ "=r"(__out[36]),
2238
+ "=r"(__out[37]),
2239
+ "=r"(__out[38]),
2240
+ "=r"(__out[39]),
2241
+ "=r"(__out[40]),
2242
+ "=r"(__out[41]),
2243
+ "=r"(__out[42]),
2244
+ "=r"(__out[43]),
2245
+ "=r"(__out[44]),
2246
+ "=r"(__out[45]),
2247
+ "=r"(__out[46]),
2248
+ "=r"(__out[47]),
2249
+ "=r"(__out[48]),
2250
+ "=r"(__out[49]),
2251
+ "=r"(__out[50]),
2252
+ "=r"(__out[51]),
2253
+ "=r"(__out[52]),
2254
+ "=r"(__out[53]),
2255
+ "=r"(__out[54]),
2256
+ "=r"(__out[55]),
2257
+ "=r"(__out[56]),
2258
+ "=r"(__out[57]),
2259
+ "=r"(__out[58]),
2260
+ "=r"(__out[59]),
2261
+ "=r"(__out[60]),
2262
+ "=r"(__out[61]),
2263
+ "=r"(__out[62]),
2264
+ "=r"(__out[63])
2265
+ : "r"(__taddr) // input: taddr is the last placeholder, used inside [..]
2266
+ : "memory"); // "memory" clobber: the asm may access memory not listed as an operand
2267
+ # else
2268
+ // No supported target detected: reference the declaration-only symbol so the build fails at link time with a descriptive symbol name.
2269
+ __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
2270
+ # endif
2271
+ }
2272
+ #endif // __cccl_ptx_isa >= 860
2273
+
2274
+ /* Synopsis of the out[128] overload defined below: when __cccl_ptx_isa >= 860 it emits one tcgen05.ld.sync.aligned.16x256b.x32.b32 through inline asm, with out[0..127] as outputs and taddr as the only input.
2275
+ // tcgen05.ld.sync.aligned.16x256b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2276
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2277
+ __device__ static inline void tcgen05_ld_16x256b(
2278
+ B32 (&out)[128],
2279
+ uint32_t taddr);
2280
+ */
2281
+ #if __cccl_ptx_isa >= 860
2282
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__();
2283
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2284
+ _CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr)
2285
+ {
2286
+ static_assert(sizeof(_B32) == 4, ""); // restates the enable_if_t size requirement from the template header
2287
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2288
+ asm(
2289
+ "tcgen05.ld.sync.aligned.16x256b.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, "
2290
+ "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, "
2291
+ "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, "
2292
+ "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, "
2293
+ "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, "
2294
+ "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, "
2295
+ "%123, %124, %125, %126, %127}, [%128];"
2296
+ : "=r"(__out[0]), // outputs: placeholder %i is bound to __out[i] for i = 0..127
2297
+ "=r"(__out[1]),
2298
+ "=r"(__out[2]),
2299
+ "=r"(__out[3]),
2300
+ "=r"(__out[4]),
2301
+ "=r"(__out[5]),
2302
+ "=r"(__out[6]),
2303
+ "=r"(__out[7]),
2304
+ "=r"(__out[8]),
2305
+ "=r"(__out[9]),
2306
+ "=r"(__out[10]),
2307
+ "=r"(__out[11]),
2308
+ "=r"(__out[12]),
2309
+ "=r"(__out[13]),
2310
+ "=r"(__out[14]),
2311
+ "=r"(__out[15]),
2312
+ "=r"(__out[16]),
2313
+ "=r"(__out[17]),
2314
+ "=r"(__out[18]),
2315
+ "=r"(__out[19]),
2316
+ "=r"(__out[20]),
2317
+ "=r"(__out[21]),
2318
+ "=r"(__out[22]),
2319
+ "=r"(__out[23]),
2320
+ "=r"(__out[24]),
2321
+ "=r"(__out[25]),
2322
+ "=r"(__out[26]),
2323
+ "=r"(__out[27]),
2324
+ "=r"(__out[28]),
2325
+ "=r"(__out[29]),
2326
+ "=r"(__out[30]),
2327
+ "=r"(__out[31]),
2328
+ "=r"(__out[32]),
2329
+ "=r"(__out[33]),
2330
+ "=r"(__out[34]),
2331
+ "=r"(__out[35]),
2332
+ "=r"(__out[36]),
2333
+ "=r"(__out[37]),
2334
+ "=r"(__out[38]),
2335
+ "=r"(__out[39]),
2336
+ "=r"(__out[40]),
2337
+ "=r"(__out[41]),
2338
+ "=r"(__out[42]),
2339
+ "=r"(__out[43]),
2340
+ "=r"(__out[44]),
2341
+ "=r"(__out[45]),
2342
+ "=r"(__out[46]),
2343
+ "=r"(__out[47]),
2344
+ "=r"(__out[48]),
2345
+ "=r"(__out[49]),
2346
+ "=r"(__out[50]),
2347
+ "=r"(__out[51]),
2348
+ "=r"(__out[52]),
2349
+ "=r"(__out[53]),
2350
+ "=r"(__out[54]),
2351
+ "=r"(__out[55]),
2352
+ "=r"(__out[56]),
2353
+ "=r"(__out[57]),
2354
+ "=r"(__out[58]),
2355
+ "=r"(__out[59]),
2356
+ "=r"(__out[60]),
2357
+ "=r"(__out[61]),
2358
+ "=r"(__out[62]),
2359
+ "=r"(__out[63]),
2360
+ "=r"(__out[64]),
2361
+ "=r"(__out[65]),
2362
+ "=r"(__out[66]),
2363
+ "=r"(__out[67]),
2364
+ "=r"(__out[68]),
2365
+ "=r"(__out[69]),
2366
+ "=r"(__out[70]),
2367
+ "=r"(__out[71]),
2368
+ "=r"(__out[72]),
2369
+ "=r"(__out[73]),
2370
+ "=r"(__out[74]),
2371
+ "=r"(__out[75]),
2372
+ "=r"(__out[76]),
2373
+ "=r"(__out[77]),
2374
+ "=r"(__out[78]),
2375
+ "=r"(__out[79]),
2376
+ "=r"(__out[80]),
2377
+ "=r"(__out[81]),
2378
+ "=r"(__out[82]),
2379
+ "=r"(__out[83]),
2380
+ "=r"(__out[84]),
2381
+ "=r"(__out[85]),
2382
+ "=r"(__out[86]),
2383
+ "=r"(__out[87]),
2384
+ "=r"(__out[88]),
2385
+ "=r"(__out[89]),
2386
+ "=r"(__out[90]),
2387
+ "=r"(__out[91]),
2388
+ "=r"(__out[92]),
2389
+ "=r"(__out[93]),
2390
+ "=r"(__out[94]),
2391
+ "=r"(__out[95]),
2392
+ "=r"(__out[96]),
2393
+ "=r"(__out[97]),
2394
+ "=r"(__out[98]),
2395
+ "=r"(__out[99]),
2396
+ "=r"(__out[100]),
2397
+ "=r"(__out[101]),
2398
+ "=r"(__out[102]),
2399
+ "=r"(__out[103]),
2400
+ "=r"(__out[104]),
2401
+ "=r"(__out[105]),
2402
+ "=r"(__out[106]),
2403
+ "=r"(__out[107]),
2404
+ "=r"(__out[108]),
2405
+ "=r"(__out[109]),
2406
+ "=r"(__out[110]),
2407
+ "=r"(__out[111]),
2408
+ "=r"(__out[112]),
2409
+ "=r"(__out[113]),
2410
+ "=r"(__out[114]),
2411
+ "=r"(__out[115]),
2412
+ "=r"(__out[116]),
2413
+ "=r"(__out[117]),
2414
+ "=r"(__out[118]),
2415
+ "=r"(__out[119]),
2416
+ "=r"(__out[120]),
2417
+ "=r"(__out[121]),
2418
+ "=r"(__out[122]),
2419
+ "=r"(__out[123]),
2420
+ "=r"(__out[124]),
2421
+ "=r"(__out[125]),
2422
+ "=r"(__out[126]),
2423
+ "=r"(__out[127])
2424
+ : "r"(__taddr) // input placeholder %128, used as the address operand [%128]
2425
+ : "memory"); // "memory" clobber; note the asm statement is not marked volatile
2426
+ # else
2427
+ // Unsupported architectures will have a linker error with a semi-decent error message
2428
+ __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__();
2429
+ # endif
2430
+ }
2431
+ #endif // __cccl_ptx_isa >= 860
2432
+
2433
+ /* Synopsis of the out[128] overload defined below: when __cccl_ptx_isa >= 860 it emits one tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 through inline asm, with out[0..127] as outputs and taddr as the only input.
2434
+ // tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2435
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2436
+ __device__ static inline void tcgen05_ld_16x256b_pack_16b(
2437
+ B32 (&out)[128],
2438
+ uint32_t taddr);
2439
+ */
2440
+ #if __cccl_ptx_isa >= 860
2441
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
2442
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2443
+ _CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr)
2444
+ {
2445
+ static_assert(sizeof(_B32) == 4, ""); // restates the enable_if_t size requirement from the template header
2446
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2447
+ asm(
2448
+ "tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
2449
+ "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, "
2450
+ "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, "
2451
+ "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, "
2452
+ "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, "
2453
+ "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, "
2454
+ "%121, %122, %123, %124, %125, %126, %127}, [%128];"
2455
+ : "=r"(__out[0]), // outputs: placeholder %i is bound to __out[i] for i = 0..127
2456
+ "=r"(__out[1]),
2457
+ "=r"(__out[2]),
2458
+ "=r"(__out[3]),
2459
+ "=r"(__out[4]),
2460
+ "=r"(__out[5]),
2461
+ "=r"(__out[6]),
2462
+ "=r"(__out[7]),
2463
+ "=r"(__out[8]),
2464
+ "=r"(__out[9]),
2465
+ "=r"(__out[10]),
2466
+ "=r"(__out[11]),
2467
+ "=r"(__out[12]),
2468
+ "=r"(__out[13]),
2469
+ "=r"(__out[14]),
2470
+ "=r"(__out[15]),
2471
+ "=r"(__out[16]),
2472
+ "=r"(__out[17]),
2473
+ "=r"(__out[18]),
2474
+ "=r"(__out[19]),
2475
+ "=r"(__out[20]),
2476
+ "=r"(__out[21]),
2477
+ "=r"(__out[22]),
2478
+ "=r"(__out[23]),
2479
+ "=r"(__out[24]),
2480
+ "=r"(__out[25]),
2481
+ "=r"(__out[26]),
2482
+ "=r"(__out[27]),
2483
+ "=r"(__out[28]),
2484
+ "=r"(__out[29]),
2485
+ "=r"(__out[30]),
2486
+ "=r"(__out[31]),
2487
+ "=r"(__out[32]),
2488
+ "=r"(__out[33]),
2489
+ "=r"(__out[34]),
2490
+ "=r"(__out[35]),
2491
+ "=r"(__out[36]),
2492
+ "=r"(__out[37]),
2493
+ "=r"(__out[38]),
2494
+ "=r"(__out[39]),
2495
+ "=r"(__out[40]),
2496
+ "=r"(__out[41]),
2497
+ "=r"(__out[42]),
2498
+ "=r"(__out[43]),
2499
+ "=r"(__out[44]),
2500
+ "=r"(__out[45]),
2501
+ "=r"(__out[46]),
2502
+ "=r"(__out[47]),
2503
+ "=r"(__out[48]),
2504
+ "=r"(__out[49]),
2505
+ "=r"(__out[50]),
2506
+ "=r"(__out[51]),
2507
+ "=r"(__out[52]),
2508
+ "=r"(__out[53]),
2509
+ "=r"(__out[54]),
2510
+ "=r"(__out[55]),
2511
+ "=r"(__out[56]),
2512
+ "=r"(__out[57]),
2513
+ "=r"(__out[58]),
2514
+ "=r"(__out[59]),
2515
+ "=r"(__out[60]),
2516
+ "=r"(__out[61]),
2517
+ "=r"(__out[62]),
2518
+ "=r"(__out[63]),
2519
+ "=r"(__out[64]),
2520
+ "=r"(__out[65]),
2521
+ "=r"(__out[66]),
2522
+ "=r"(__out[67]),
2523
+ "=r"(__out[68]),
2524
+ "=r"(__out[69]),
2525
+ "=r"(__out[70]),
2526
+ "=r"(__out[71]),
2527
+ "=r"(__out[72]),
2528
+ "=r"(__out[73]),
2529
+ "=r"(__out[74]),
2530
+ "=r"(__out[75]),
2531
+ "=r"(__out[76]),
2532
+ "=r"(__out[77]),
2533
+ "=r"(__out[78]),
2534
+ "=r"(__out[79]),
2535
+ "=r"(__out[80]),
2536
+ "=r"(__out[81]),
2537
+ "=r"(__out[82]),
2538
+ "=r"(__out[83]),
2539
+ "=r"(__out[84]),
2540
+ "=r"(__out[85]),
2541
+ "=r"(__out[86]),
2542
+ "=r"(__out[87]),
2543
+ "=r"(__out[88]),
2544
+ "=r"(__out[89]),
2545
+ "=r"(__out[90]),
2546
+ "=r"(__out[91]),
2547
+ "=r"(__out[92]),
2548
+ "=r"(__out[93]),
2549
+ "=r"(__out[94]),
2550
+ "=r"(__out[95]),
2551
+ "=r"(__out[96]),
2552
+ "=r"(__out[97]),
2553
+ "=r"(__out[98]),
2554
+ "=r"(__out[99]),
2555
+ "=r"(__out[100]),
2556
+ "=r"(__out[101]),
2557
+ "=r"(__out[102]),
2558
+ "=r"(__out[103]),
2559
+ "=r"(__out[104]),
2560
+ "=r"(__out[105]),
2561
+ "=r"(__out[106]),
2562
+ "=r"(__out[107]),
2563
+ "=r"(__out[108]),
2564
+ "=r"(__out[109]),
2565
+ "=r"(__out[110]),
2566
+ "=r"(__out[111]),
2567
+ "=r"(__out[112]),
2568
+ "=r"(__out[113]),
2569
+ "=r"(__out[114]),
2570
+ "=r"(__out[115]),
2571
+ "=r"(__out[116]),
2572
+ "=r"(__out[117]),
2573
+ "=r"(__out[118]),
2574
+ "=r"(__out[119]),
2575
+ "=r"(__out[120]),
2576
+ "=r"(__out[121]),
2577
+ "=r"(__out[122]),
2578
+ "=r"(__out[123]),
2579
+ "=r"(__out[124]),
2580
+ "=r"(__out[125]),
2581
+ "=r"(__out[126]),
2582
+ "=r"(__out[127])
2583
+ : "r"(__taddr) // input placeholder %128, used as the address operand [%128]
2584
+ : "memory"); // "memory" clobber; note the asm statement is not marked volatile
2585
+ # else
2586
+ // Unsupported architectures will have a linker error with a semi-decent error message
2587
+ __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
2588
+ # endif
2589
+ }
2590
+ #endif // __cccl_ptx_isa >= 860
2591
+
2592
+ /* Synopsis of the out[1] overload defined below: when __cccl_ptx_isa >= 860 it emits one tcgen05.ld.sync.aligned.32x32b.x1.b32 through inline asm, with out[0] as the output and taddr as the only input.
2593
+ // tcgen05.ld.sync.aligned.32x32b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2594
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2595
+ __device__ static inline void tcgen05_ld_32x32b(
2596
+ B32 (&out)[1],
2597
+ uint32_t taddr);
2598
+ */
2599
+ #if __cccl_ptx_isa >= 860
2600
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__();
2601
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2602
+ _CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr)
2603
+ {
2604
+ static_assert(sizeof(_B32) == 4, ""); // restates the enable_if_t size requirement from the template header
2605
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2606
+ asm("tcgen05.ld.sync.aligned.32x32b.x1.b32 {%0}, [%1];" : "=r"(__out[0]) : "r"(__taddr) : "memory"); // %0 = __out[0] (output), %1 = __taddr (input); "memory" clobber, asm not marked volatile
2607
+ # else
2608
+ // Unsupported architectures will have a linker error with a semi-decent error message
2609
+ __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__();
2610
+ # endif
2611
+ }
2612
+ #endif // __cccl_ptx_isa >= 860
2613
+
2614
+ /* Synopsis of the out[1] overload defined below: when __cccl_ptx_isa >= 860 it emits one tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 through inline asm, with out[0] as the output and taddr as the only input.
2615
+ // tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2616
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2617
+ __device__ static inline void tcgen05_ld_32x32b_pack_16b(
2618
+ B32 (&out)[1],
2619
+ uint32_t taddr);
2620
+ */
2621
+ #if __cccl_ptx_isa >= 860
2622
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
2623
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2624
+ _CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr)
2625
+ {
2626
+ static_assert(sizeof(_B32) == 4, ""); // restates the enable_if_t size requirement from the template header
2627
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2628
+ asm("tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 {%0}, [%1];" : "=r"(__out[0]) : "r"(__taddr) : "memory"); // %0 = __out[0] (output), %1 = __taddr (input); "memory" clobber, asm not marked volatile
2629
+ # else
2630
+ // Unsupported architectures will have a linker error with a semi-decent error message
2631
+ __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
2632
+ # endif
2633
+ }
2634
+ #endif // __cccl_ptx_isa >= 860
2635
+
2636
+ /* Synopsis of the out[2] overload defined below: when __cccl_ptx_isa >= 860 it emits one tcgen05.ld.sync.aligned.32x32b.x2.b32 through inline asm, with out[0..1] as outputs and taddr as the only input.
2637
+ // tcgen05.ld.sync.aligned.32x32b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2638
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2639
+ __device__ static inline void tcgen05_ld_32x32b(
2640
+ B32 (&out)[2],
2641
+ uint32_t taddr);
2642
+ */
2643
+ #if __cccl_ptx_isa >= 860
2644
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__();
2645
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2646
+ _CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr)
2647
+ {
2648
+ static_assert(sizeof(_B32) == 4, ""); // restates the enable_if_t size requirement from the template header
2649
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2650
+ asm("tcgen05.ld.sync.aligned.32x32b.x2.b32 {%0, %1}, [%2];"
2651
+ : "=r"(__out[0]), "=r"(__out[1]) // outputs: %0 and %1 are bound to __out[0] and __out[1]
2652
+ : "r"(__taddr) // input placeholder %2, used as the address operand [%2]
2653
+ : "memory"); // "memory" clobber; note the asm statement is not marked volatile
2654
+ # else
2655
+ // Unsupported architectures will have a linker error with a semi-decent error message
2656
+ __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__();
2657
+ # endif
2658
+ }
2659
+ #endif // __cccl_ptx_isa >= 860
2660
+
2661
+ /* Synopsis of the out[2] overload defined below: when __cccl_ptx_isa >= 860 it emits one tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 through inline asm, with out[0..1] as outputs and taddr as the only input.
2662
+ // tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2663
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2664
+ __device__ static inline void tcgen05_ld_32x32b_pack_16b(
2665
+ B32 (&out)[2],
2666
+ uint32_t taddr);
2667
+ */
2668
+ #if __cccl_ptx_isa >= 860
2669
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
2670
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2671
+ _CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr)
2672
+ {
2673
+ static_assert(sizeof(_B32) == 4, ""); // restates the enable_if_t size requirement from the template header
2674
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2675
+ asm("tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 {%0, %1}, [%2];"
2676
+ : "=r"(__out[0]), "=r"(__out[1]) // outputs: %0 and %1 are bound to __out[0] and __out[1]
2677
+ : "r"(__taddr) // input placeholder %2, used as the address operand [%2]
2678
+ : "memory"); // "memory" clobber; note the asm statement is not marked volatile
2679
+ # else
2680
+ // Unsupported architectures will have a linker error with a semi-decent error message
2681
+ __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
2682
+ # endif
2683
+ }
2684
+ #endif // __cccl_ptx_isa >= 860
2685
+
2686
+ /* Synopsis of the out[4] overload defined below: when __cccl_ptx_isa >= 860 it emits one tcgen05.ld.sync.aligned.32x32b.x4.b32 through inline asm, with out[0..3] as outputs and taddr as the only input.
2687
+ // tcgen05.ld.sync.aligned.32x32b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2688
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2689
+ __device__ static inline void tcgen05_ld_32x32b(
2690
+ B32 (&out)[4],
2691
+ uint32_t taddr);
2692
+ */
2693
+ #if __cccl_ptx_isa >= 860
2694
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__();
2695
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2696
+ _CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr)
2697
+ {
2698
+ static_assert(sizeof(_B32) == 4, ""); // restates the enable_if_t size requirement from the template header
2699
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2700
+ asm("tcgen05.ld.sync.aligned.32x32b.x4.b32 {%0, %1, %2, %3}, [%4];"
2701
+ : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) // outputs: %0..%3 are bound to __out[0..3] in order
2702
+ : "r"(__taddr) // input placeholder %4, used as the address operand [%4]
2703
+ : "memory"); // "memory" clobber; note the asm statement is not marked volatile
2704
+ # else
2705
+ // Unsupported architectures will have a linker error with a semi-decent error message
2706
+ __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__();
2707
+ # endif
2708
+ }
2709
+ #endif // __cccl_ptx_isa >= 860
2710
+
2711
+ /* Synopsis of the out[4] overload defined below: when __cccl_ptx_isa >= 860 it emits one tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 through inline asm, with out[0..3] as outputs and taddr as the only input.
2712
+ // tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2713
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2714
+ __device__ static inline void tcgen05_ld_32x32b_pack_16b(
2715
+ B32 (&out)[4],
2716
+ uint32_t taddr);
2717
+ */
2718
+ #if __cccl_ptx_isa >= 860
2719
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
2720
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2721
+ _CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr)
2722
+ {
2723
+ static_assert(sizeof(_B32) == 4, ""); // restates the enable_if_t size requirement from the template header
2724
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2725
+ asm("tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 {%0, %1, %2, %3}, [%4];"
2726
+ : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) // outputs: %0..%3 are bound to __out[0..3] in order
2727
+ : "r"(__taddr) // input placeholder %4, used as the address operand [%4]
2728
+ : "memory"); // "memory" clobber; note the asm statement is not marked volatile
2729
+ # else
2730
+ // Unsupported architectures will have a linker error with a semi-decent error message
2731
+ __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
2732
+ # endif
2733
+ }
2734
+ #endif // __cccl_ptx_isa >= 860
2735
+
2736
+ /* Synopsis of the out[8] overload defined below: when __cccl_ptx_isa >= 860 it emits one tcgen05.ld.sync.aligned.32x32b.x8.b32 through inline asm, with out[0..7] as outputs and taddr as the only input.
2737
+ // tcgen05.ld.sync.aligned.32x32b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2738
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2739
+ __device__ static inline void tcgen05_ld_32x32b(
2740
+ B32 (&out)[8],
2741
+ uint32_t taddr);
2742
+ */
2743
+ #if __cccl_ptx_isa >= 860
2744
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__();
2745
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2746
+ _CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr)
2747
+ {
2748
+ static_assert(sizeof(_B32) == 4, ""); // restates the enable_if_t size requirement from the template header
2749
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2750
+ asm("tcgen05.ld.sync.aligned.32x32b.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];"
2751
+ : "=r"(__out[0]), // outputs: placeholder %i is bound to __out[i] for i = 0..7
2752
+ "=r"(__out[1]),
2753
+ "=r"(__out[2]),
2754
+ "=r"(__out[3]),
2755
+ "=r"(__out[4]),
2756
+ "=r"(__out[5]),
2757
+ "=r"(__out[6]),
2758
+ "=r"(__out[7])
2759
+ : "r"(__taddr) // input placeholder %8, used as the address operand [%8]
2760
+ : "memory"); // "memory" clobber; note the asm statement is not marked volatile
2761
+ # else
2762
+ // Unsupported architectures will have a linker error with a semi-decent error message
2763
+ __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__();
2764
+ # endif
2765
+ }
2766
+ #endif // __cccl_ptx_isa >= 860
2767
+
2768
+ /* Synopsis of the out[8] overload defined below: when __cccl_ptx_isa >= 860 it emits one tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 through inline asm, with out[0..7] as outputs and taddr as the only input.
2769
+ // tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2770
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2771
+ __device__ static inline void tcgen05_ld_32x32b_pack_16b(
2772
+ B32 (&out)[8],
2773
+ uint32_t taddr);
2774
+ */
2775
+ #if __cccl_ptx_isa >= 860
2776
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
2777
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2778
+ _CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr)
2779
+ {
2780
+ static_assert(sizeof(_B32) == 4, ""); // restates the enable_if_t size requirement from the template header
2781
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2782
+ asm("tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];"
2783
+ : "=r"(__out[0]), // outputs: placeholder %i is bound to __out[i] for i = 0..7
2784
+ "=r"(__out[1]),
2785
+ "=r"(__out[2]),
2786
+ "=r"(__out[3]),
2787
+ "=r"(__out[4]),
2788
+ "=r"(__out[5]),
2789
+ "=r"(__out[6]),
2790
+ "=r"(__out[7])
2791
+ : "r"(__taddr) // input placeholder %8, used as the address operand [%8]
2792
+ : "memory"); // "memory" clobber; note the asm statement is not marked volatile
2793
+ # else
2794
+ // Unsupported architectures will have a linker error with a semi-decent error message
2795
+ __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
2796
+ # endif
2797
+ }
2798
+ #endif // __cccl_ptx_isa >= 860
2799
+
2800
+ /* Synopsis of the out[16] overload defined below: when __cccl_ptx_isa >= 860 it emits one tcgen05.ld.sync.aligned.32x32b.x16.b32 through inline asm, with out[0..15] as outputs and taddr as the only input.
2801
+ // tcgen05.ld.sync.aligned.32x32b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2802
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2803
+ __device__ static inline void tcgen05_ld_32x32b(
2804
+ B32 (&out)[16],
2805
+ uint32_t taddr);
2806
+ */
2807
+ #if __cccl_ptx_isa >= 860
2808
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__();
2809
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2810
+ _CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr)
2811
+ {
2812
+ static_assert(sizeof(_B32) == 4, ""); // restates the enable_if_t size requirement from the template header
2813
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2814
+ asm("tcgen05.ld.sync.aligned.32x32b.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15}, "
2815
+ "[%16];"
2816
+ : "=r"(__out[0]), // outputs: placeholder %i is bound to __out[i] for i = 0..15
2817
+ "=r"(__out[1]),
2818
+ "=r"(__out[2]),
2819
+ "=r"(__out[3]),
2820
+ "=r"(__out[4]),
2821
+ "=r"(__out[5]),
2822
+ "=r"(__out[6]),
2823
+ "=r"(__out[7]),
2824
+ "=r"(__out[8]),
2825
+ "=r"(__out[9]),
2826
+ "=r"(__out[10]),
2827
+ "=r"(__out[11]),
2828
+ "=r"(__out[12]),
2829
+ "=r"(__out[13]),
2830
+ "=r"(__out[14]),
2831
+ "=r"(__out[15])
2832
+ : "r"(__taddr) // input placeholder %16, used as the address operand [%16]
2833
+ : "memory"); // "memory" clobber; note the asm statement is not marked volatile
2834
+ # else
2835
+ // Unsupported architectures will have a linker error with a semi-decent error message
2836
+ __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__();
2837
+ # endif
2838
+ }
2839
+ #endif // __cccl_ptx_isa >= 860
2840
+
2841
+ /* Synopsis of the out[16] overload defined below: when __cccl_ptx_isa >= 860 it emits one tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 through inline asm, with out[0..15] as outputs and taddr as the only input.
2842
+ // tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2843
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2844
+ __device__ static inline void tcgen05_ld_32x32b_pack_16b(
2845
+ B32 (&out)[16],
2846
+ uint32_t taddr);
2847
+ */
2848
+ #if __cccl_ptx_isa >= 860
2849
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
2850
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2851
+ _CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr)
2852
+ {
2853
+ static_assert(sizeof(_B32) == 4, ""); // restates the enable_if_t size requirement from the template header
2854
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2855
+ asm("tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
2856
+ "%14, %15}, [%16];"
2857
+ : "=r"(__out[0]), // outputs: placeholder %i is bound to __out[i] for i = 0..15
2858
+ "=r"(__out[1]),
2859
+ "=r"(__out[2]),
2860
+ "=r"(__out[3]),
2861
+ "=r"(__out[4]),
2862
+ "=r"(__out[5]),
2863
+ "=r"(__out[6]),
2864
+ "=r"(__out[7]),
2865
+ "=r"(__out[8]),
2866
+ "=r"(__out[9]),
2867
+ "=r"(__out[10]),
2868
+ "=r"(__out[11]),
2869
+ "=r"(__out[12]),
2870
+ "=r"(__out[13]),
2871
+ "=r"(__out[14]),
2872
+ "=r"(__out[15])
2873
+ : "r"(__taddr) // input placeholder %16, used as the address operand [%16]
2874
+ : "memory"); // "memory" clobber; note the asm statement is not marked volatile
2875
+ # else
2876
+ // Unsupported architectures will have a linker error with a semi-decent error message
2877
+ __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
2878
+ # endif
2879
+ }
2880
+ #endif // __cccl_ptx_isa >= 860
2881
+
2882
+ /* Synopsis of the out[32] overload defined below: when __cccl_ptx_isa >= 860 it emits one tcgen05.ld.sync.aligned.32x32b.x32.b32 through inline asm, with out[0..31] as outputs and taddr as the only input.
2883
+ // tcgen05.ld.sync.aligned.32x32b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2884
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2885
+ __device__ static inline void tcgen05_ld_32x32b(
2886
+ B32 (&out)[32],
2887
+ uint32_t taddr);
2888
+ */
2889
+ #if __cccl_ptx_isa >= 860
2890
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__();
2891
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2892
+ _CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr)
2893
+ {
2894
+ static_assert(sizeof(_B32) == 4, ""); // restates the enable_if_t size requirement from the template header
2895
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2896
+ asm(
2897
+ "tcgen05.ld.sync.aligned.32x32b.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, "
2898
+ "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];"
2899
+ : "=r"(__out[0]), // outputs: placeholder %i is bound to __out[i] for i = 0..31
2900
+ "=r"(__out[1]),
2901
+ "=r"(__out[2]),
2902
+ "=r"(__out[3]),
2903
+ "=r"(__out[4]),
2904
+ "=r"(__out[5]),
2905
+ "=r"(__out[6]),
2906
+ "=r"(__out[7]),
2907
+ "=r"(__out[8]),
2908
+ "=r"(__out[9]),
2909
+ "=r"(__out[10]),
2910
+ "=r"(__out[11]),
2911
+ "=r"(__out[12]),
2912
+ "=r"(__out[13]),
2913
+ "=r"(__out[14]),
2914
+ "=r"(__out[15]),
2915
+ "=r"(__out[16]),
2916
+ "=r"(__out[17]),
2917
+ "=r"(__out[18]),
2918
+ "=r"(__out[19]),
2919
+ "=r"(__out[20]),
2920
+ "=r"(__out[21]),
2921
+ "=r"(__out[22]),
2922
+ "=r"(__out[23]),
2923
+ "=r"(__out[24]),
2924
+ "=r"(__out[25]),
2925
+ "=r"(__out[26]),
2926
+ "=r"(__out[27]),
2927
+ "=r"(__out[28]),
2928
+ "=r"(__out[29]),
2929
+ "=r"(__out[30]),
2930
+ "=r"(__out[31])
2931
+ : "r"(__taddr) // input placeholder %32, used as the address operand [%32]
2932
+ : "memory"); // "memory" clobber; note the asm statement is not marked volatile
2933
+ # else
2934
+ // Unsupported architectures will have a linker error with a semi-decent error message
2935
+ __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__();
2936
+ # endif
2937
+ }
2938
+ #endif // __cccl_ptx_isa >= 860
2939
+
2940
+ /* Synopsis of the out[32] overload defined below: when __cccl_ptx_isa >= 860 it emits one tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 through inline asm, with out[0..31] as outputs and taddr as the only input.
2941
+ // tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
2942
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
2943
+ __device__ static inline void tcgen05_ld_32x32b_pack_16b(
2944
+ B32 (&out)[32],
2945
+ uint32_t taddr);
2946
+ */
2947
+ #if __cccl_ptx_isa >= 860
2948
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
2949
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
2950
+ _CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr)
2951
+ {
2952
+ static_assert(sizeof(_B32) == 4, ""); // restates the enable_if_t size requirement from the template header
2953
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
2954
+ asm(
2955
+ "tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
2956
+ "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];"
2957
+ : "=r"(__out[0]), // outputs: placeholder %i is bound to __out[i] for i = 0..31
2958
+ "=r"(__out[1]),
2959
+ "=r"(__out[2]),
2960
+ "=r"(__out[3]),
2961
+ "=r"(__out[4]),
2962
+ "=r"(__out[5]),
2963
+ "=r"(__out[6]),
2964
+ "=r"(__out[7]),
2965
+ "=r"(__out[8]),
2966
+ "=r"(__out[9]),
2967
+ "=r"(__out[10]),
2968
+ "=r"(__out[11]),
2969
+ "=r"(__out[12]),
2970
+ "=r"(__out[13]),
2971
+ "=r"(__out[14]),
2972
+ "=r"(__out[15]),
2973
+ "=r"(__out[16]),
2974
+ "=r"(__out[17]),
2975
+ "=r"(__out[18]),
2976
+ "=r"(__out[19]),
2977
+ "=r"(__out[20]),
2978
+ "=r"(__out[21]),
2979
+ "=r"(__out[22]),
2980
+ "=r"(__out[23]),
2981
+ "=r"(__out[24]),
2982
+ "=r"(__out[25]),
2983
+ "=r"(__out[26]),
2984
+ "=r"(__out[27]),
2985
+ "=r"(__out[28]),
2986
+ "=r"(__out[29]),
2987
+ "=r"(__out[30]),
2988
+ "=r"(__out[31])
2989
+ : "r"(__taddr) // input placeholder %32, used as the address operand [%32]
2990
+ : "memory"); // "memory" clobber; note the asm statement is not marked volatile
2991
+ # else
2992
+ // Unsupported architectures will have a linker error with a semi-decent error message
2993
+ __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
2994
+ # endif
2995
+ }
2996
+ #endif // __cccl_ptx_isa >= 860
2997
+
2998
+ /* Synopsis of the out[64] overload defined below: when __cccl_ptx_isa >= 860 it emits one tcgen05.ld.sync.aligned.32x32b.x64.b32 through inline asm, with out[0..63] as outputs and taddr as the only input.
2999
+ // tcgen05.ld.sync.aligned.32x32b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
3000
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
3001
+ __device__ static inline void tcgen05_ld_32x32b(
3002
+ B32 (&out)[64],
3003
+ uint32_t taddr);
3004
+ */
3005
+ #if __cccl_ptx_isa >= 860
3006
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__();
3007
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
3008
+ _CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr)
3009
+ {
3010
+ static_assert(sizeof(_B32) == 4, ""); // restates the enable_if_t size requirement from the template header
3011
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3012
+ asm(
3013
+ "tcgen05.ld.sync.aligned.32x32b.x64.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, "
3014
+ "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, "
3015
+ "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, "
3016
+ "%60, %61, %62, %63}, [%64];"
3017
+ : "=r"(__out[0]), // outputs: placeholder %i is bound to __out[i] for i = 0..63
3018
+ "=r"(__out[1]),
3019
+ "=r"(__out[2]),
3020
+ "=r"(__out[3]),
3021
+ "=r"(__out[4]),
3022
+ "=r"(__out[5]),
3023
+ "=r"(__out[6]),
3024
+ "=r"(__out[7]),
3025
+ "=r"(__out[8]),
3026
+ "=r"(__out[9]),
3027
+ "=r"(__out[10]),
3028
+ "=r"(__out[11]),
3029
+ "=r"(__out[12]),
3030
+ "=r"(__out[13]),
3031
+ "=r"(__out[14]),
3032
+ "=r"(__out[15]),
3033
+ "=r"(__out[16]),
3034
+ "=r"(__out[17]),
3035
+ "=r"(__out[18]),
3036
+ "=r"(__out[19]),
3037
+ "=r"(__out[20]),
3038
+ "=r"(__out[21]),
3039
+ "=r"(__out[22]),
3040
+ "=r"(__out[23]),
3041
+ "=r"(__out[24]),
3042
+ "=r"(__out[25]),
3043
+ "=r"(__out[26]),
3044
+ "=r"(__out[27]),
3045
+ "=r"(__out[28]),
3046
+ "=r"(__out[29]),
3047
+ "=r"(__out[30]),
3048
+ "=r"(__out[31]),
3049
+ "=r"(__out[32]),
3050
+ "=r"(__out[33]),
3051
+ "=r"(__out[34]),
3052
+ "=r"(__out[35]),
3053
+ "=r"(__out[36]),
3054
+ "=r"(__out[37]),
3055
+ "=r"(__out[38]),
3056
+ "=r"(__out[39]),
3057
+ "=r"(__out[40]),
3058
+ "=r"(__out[41]),
3059
+ "=r"(__out[42]),
3060
+ "=r"(__out[43]),
3061
+ "=r"(__out[44]),
3062
+ "=r"(__out[45]),
3063
+ "=r"(__out[46]),
3064
+ "=r"(__out[47]),
3065
+ "=r"(__out[48]),
3066
+ "=r"(__out[49]),
3067
+ "=r"(__out[50]),
3068
+ "=r"(__out[51]),
3069
+ "=r"(__out[52]),
3070
+ "=r"(__out[53]),
3071
+ "=r"(__out[54]),
3072
+ "=r"(__out[55]),
3073
+ "=r"(__out[56]),
3074
+ "=r"(__out[57]),
3075
+ "=r"(__out[58]),
3076
+ "=r"(__out[59]),
3077
+ "=r"(__out[60]),
3078
+ "=r"(__out[61]),
3079
+ "=r"(__out[62]),
3080
+ "=r"(__out[63])
3081
+ : "r"(__taddr) // input placeholder %64, used as the address operand [%64]
3082
+ : "memory"); // "memory" clobber; note the asm statement is not marked volatile
3083
+ # else
3084
+ // Unsupported architectures will have a linker error with a semi-decent error message
3085
+ __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__();
3086
+ # endif
3087
+ }
3088
+ #endif // __cccl_ptx_isa >= 860
3089
+
3090
+ /*
+ // Summary (added in review): thin inline-asm wrapper; it emits exactly the one PTX instruction shown below.
+ //   out   : 64 destination elements; out[0]..out[63] are bound in order to asm operands %0..%63 as "=r" outputs.
+ //   taddr : 32-bit source address, bound to the bracketed "r" input operand %64.
+ // B32 can be any 4-byte type; this is enforced by enable_if and repeated by a static_assert in the body.
+ // The asm statement carries a "memory" clobber and is not marked volatile.
+ // It is compiled only for NVHPC or when __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set; any other
+ // target calls an extern "C" function that is only declared here, which the body comment says yields a linker error.
+ // NOTE(review): taddr is presumably a Tensor Memory address as described in the PTX ISA - confirm against the spec.
3091
+ // tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
3092
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
3093
+ __device__ static inline void tcgen05_ld_32x32b_pack_16b(
3094
+ B32 (&out)[64],
3095
+ uint32_t taddr);
3096
+ */
3097
+ #if __cccl_ptx_isa >= 860
3098
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
3099
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
3100
+ _CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr)
3101
+ {
3102
+ static_assert(sizeof(_B32) == 4, "");
3103
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3104
+ asm(
3105
+ "tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
3106
+ "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, "
3107
+ "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, "
3108
+ "%58, %59, %60, %61, %62, %63}, [%64];"
3109
+ : "=r"(__out[0]),
3110
+ "=r"(__out[1]),
3111
+ "=r"(__out[2]),
3112
+ "=r"(__out[3]),
3113
+ "=r"(__out[4]),
3114
+ "=r"(__out[5]),
3115
+ "=r"(__out[6]),
3116
+ "=r"(__out[7]),
3117
+ "=r"(__out[8]),
3118
+ "=r"(__out[9]),
3119
+ "=r"(__out[10]),
3120
+ "=r"(__out[11]),
3121
+ "=r"(__out[12]),
3122
+ "=r"(__out[13]),
3123
+ "=r"(__out[14]),
3124
+ "=r"(__out[15]),
3125
+ "=r"(__out[16]),
3126
+ "=r"(__out[17]),
3127
+ "=r"(__out[18]),
3128
+ "=r"(__out[19]),
3129
+ "=r"(__out[20]),
3130
+ "=r"(__out[21]),
3131
+ "=r"(__out[22]),
3132
+ "=r"(__out[23]),
3133
+ "=r"(__out[24]),
3134
+ "=r"(__out[25]),
3135
+ "=r"(__out[26]),
3136
+ "=r"(__out[27]),
3137
+ "=r"(__out[28]),
3138
+ "=r"(__out[29]),
3139
+ "=r"(__out[30]),
3140
+ "=r"(__out[31]),
3141
+ "=r"(__out[32]),
3142
+ "=r"(__out[33]),
3143
+ "=r"(__out[34]),
3144
+ "=r"(__out[35]),
3145
+ "=r"(__out[36]),
3146
+ "=r"(__out[37]),
3147
+ "=r"(__out[38]),
3148
+ "=r"(__out[39]),
3149
+ "=r"(__out[40]),
3150
+ "=r"(__out[41]),
3151
+ "=r"(__out[42]),
3152
+ "=r"(__out[43]),
3153
+ "=r"(__out[44]),
3154
+ "=r"(__out[45]),
3155
+ "=r"(__out[46]),
3156
+ "=r"(__out[47]),
3157
+ "=r"(__out[48]),
3158
+ "=r"(__out[49]),
3159
+ "=r"(__out[50]),
3160
+ "=r"(__out[51]),
3161
+ "=r"(__out[52]),
3162
+ "=r"(__out[53]),
3163
+ "=r"(__out[54]),
3164
+ "=r"(__out[55]),
3165
+ "=r"(__out[56]),
3166
+ "=r"(__out[57]),
3167
+ "=r"(__out[58]),
3168
+ "=r"(__out[59]),
3169
+ "=r"(__out[60]),
3170
+ "=r"(__out[61]),
3171
+ "=r"(__out[62]),
3172
+ "=r"(__out[63])
3173
+ : "r"(__taddr)
3174
+ : "memory");
3175
+ # else
3176
+ // Unsupported architectures will have a linker error with a semi-decent error message
3177
+ __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
3178
+ # endif
3179
+ }
3180
+ #endif // __cccl_ptx_isa >= 860
3181
+
3182
+ /*
+ // Summary (added in review): thin inline-asm wrapper; it emits exactly the one PTX instruction shown below.
+ //   out   : 128 destination elements; out[0]..out[127] are bound in order to asm operands %0..%127 as "=r" outputs.
+ //   taddr : 32-bit source address, bound to the bracketed "r" input operand %128.
+ // B32 can be any 4-byte type; this is enforced by enable_if and repeated by a static_assert in the body.
+ // The asm statement carries a "memory" clobber and is not marked volatile.
+ // It is compiled only for NVHPC or when __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set; any other
+ // target calls an extern "C" function that is only declared here, which the body comment says yields a linker error.
+ // NOTE(review): taddr is presumably a Tensor Memory address as described in the PTX ISA - confirm against the spec.
3183
+ // tcgen05.ld.sync.aligned.32x32b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
3184
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
3185
+ __device__ static inline void tcgen05_ld_32x32b(
3186
+ B32 (&out)[128],
3187
+ uint32_t taddr);
3188
+ */
3189
+ #if __cccl_ptx_isa >= 860
3190
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__();
3191
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
3192
+ _CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr)
3193
+ {
3194
+ static_assert(sizeof(_B32) == 4, "");
3195
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3196
+ asm(
3197
+ "tcgen05.ld.sync.aligned.32x32b.x128.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, "
3198
+ "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, "
3199
+ "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, "
3200
+ "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, "
3201
+ "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, "
3202
+ "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, "
3203
+ "%123, %124, %125, %126, %127}, [%128];"
3204
+ : "=r"(__out[0]),
3205
+ "=r"(__out[1]),
3206
+ "=r"(__out[2]),
3207
+ "=r"(__out[3]),
3208
+ "=r"(__out[4]),
3209
+ "=r"(__out[5]),
3210
+ "=r"(__out[6]),
3211
+ "=r"(__out[7]),
3212
+ "=r"(__out[8]),
3213
+ "=r"(__out[9]),
3214
+ "=r"(__out[10]),
3215
+ "=r"(__out[11]),
3216
+ "=r"(__out[12]),
3217
+ "=r"(__out[13]),
3218
+ "=r"(__out[14]),
3219
+ "=r"(__out[15]),
3220
+ "=r"(__out[16]),
3221
+ "=r"(__out[17]),
3222
+ "=r"(__out[18]),
3223
+ "=r"(__out[19]),
3224
+ "=r"(__out[20]),
3225
+ "=r"(__out[21]),
3226
+ "=r"(__out[22]),
3227
+ "=r"(__out[23]),
3228
+ "=r"(__out[24]),
3229
+ "=r"(__out[25]),
3230
+ "=r"(__out[26]),
3231
+ "=r"(__out[27]),
3232
+ "=r"(__out[28]),
3233
+ "=r"(__out[29]),
3234
+ "=r"(__out[30]),
3235
+ "=r"(__out[31]),
3236
+ "=r"(__out[32]),
3237
+ "=r"(__out[33]),
3238
+ "=r"(__out[34]),
3239
+ "=r"(__out[35]),
3240
+ "=r"(__out[36]),
3241
+ "=r"(__out[37]),
3242
+ "=r"(__out[38]),
3243
+ "=r"(__out[39]),
3244
+ "=r"(__out[40]),
3245
+ "=r"(__out[41]),
3246
+ "=r"(__out[42]),
3247
+ "=r"(__out[43]),
3248
+ "=r"(__out[44]),
3249
+ "=r"(__out[45]),
3250
+ "=r"(__out[46]),
3251
+ "=r"(__out[47]),
3252
+ "=r"(__out[48]),
3253
+ "=r"(__out[49]),
3254
+ "=r"(__out[50]),
3255
+ "=r"(__out[51]),
3256
+ "=r"(__out[52]),
3257
+ "=r"(__out[53]),
3258
+ "=r"(__out[54]),
3259
+ "=r"(__out[55]),
3260
+ "=r"(__out[56]),
3261
+ "=r"(__out[57]),
3262
+ "=r"(__out[58]),
3263
+ "=r"(__out[59]),
3264
+ "=r"(__out[60]),
3265
+ "=r"(__out[61]),
3266
+ "=r"(__out[62]),
3267
+ "=r"(__out[63]),
3268
+ "=r"(__out[64]),
3269
+ "=r"(__out[65]),
3270
+ "=r"(__out[66]),
3271
+ "=r"(__out[67]),
3272
+ "=r"(__out[68]),
3273
+ "=r"(__out[69]),
3274
+ "=r"(__out[70]),
3275
+ "=r"(__out[71]),
3276
+ "=r"(__out[72]),
3277
+ "=r"(__out[73]),
3278
+ "=r"(__out[74]),
3279
+ "=r"(__out[75]),
3280
+ "=r"(__out[76]),
3281
+ "=r"(__out[77]),
3282
+ "=r"(__out[78]),
3283
+ "=r"(__out[79]),
3284
+ "=r"(__out[80]),
3285
+ "=r"(__out[81]),
3286
+ "=r"(__out[82]),
3287
+ "=r"(__out[83]),
3288
+ "=r"(__out[84]),
3289
+ "=r"(__out[85]),
3290
+ "=r"(__out[86]),
3291
+ "=r"(__out[87]),
3292
+ "=r"(__out[88]),
3293
+ "=r"(__out[89]),
3294
+ "=r"(__out[90]),
3295
+ "=r"(__out[91]),
3296
+ "=r"(__out[92]),
3297
+ "=r"(__out[93]),
3298
+ "=r"(__out[94]),
3299
+ "=r"(__out[95]),
3300
+ "=r"(__out[96]),
3301
+ "=r"(__out[97]),
3302
+ "=r"(__out[98]),
3303
+ "=r"(__out[99]),
3304
+ "=r"(__out[100]),
3305
+ "=r"(__out[101]),
3306
+ "=r"(__out[102]),
3307
+ "=r"(__out[103]),
3308
+ "=r"(__out[104]),
3309
+ "=r"(__out[105]),
3310
+ "=r"(__out[106]),
3311
+ "=r"(__out[107]),
3312
+ "=r"(__out[108]),
3313
+ "=r"(__out[109]),
3314
+ "=r"(__out[110]),
3315
+ "=r"(__out[111]),
3316
+ "=r"(__out[112]),
3317
+ "=r"(__out[113]),
3318
+ "=r"(__out[114]),
3319
+ "=r"(__out[115]),
3320
+ "=r"(__out[116]),
3321
+ "=r"(__out[117]),
3322
+ "=r"(__out[118]),
3323
+ "=r"(__out[119]),
3324
+ "=r"(__out[120]),
3325
+ "=r"(__out[121]),
3326
+ "=r"(__out[122]),
3327
+ "=r"(__out[123]),
3328
+ "=r"(__out[124]),
3329
+ "=r"(__out[125]),
3330
+ "=r"(__out[126]),
3331
+ "=r"(__out[127])
3332
+ : "r"(__taddr)
3333
+ : "memory");
3334
+ # else
3335
+ // Unsupported architectures will have a linker error with a semi-decent error message
3336
+ __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__();
3337
+ # endif
3338
+ }
3339
+ #endif // __cccl_ptx_isa >= 860
3340
+
3341
+ /*
+ // Summary (added in review): thin inline-asm wrapper; it emits exactly the one PTX instruction shown below.
+ //   out   : 128 destination elements; out[0]..out[127] are bound in order to asm operands %0..%127 as "=r" outputs.
+ //   taddr : 32-bit source address, bound to the bracketed "r" input operand %128.
+ // B32 can be any 4-byte type; this is enforced by enable_if and repeated by a static_assert in the body.
+ // The asm statement carries a "memory" clobber and is not marked volatile.
+ // It is compiled only for NVHPC or when __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set; any other
+ // target calls an extern "C" function that is only declared here, which the body comment says yields a linker error.
+ // NOTE(review): taddr is presumably a Tensor Memory address as described in the PTX ISA - confirm against the spec.
3342
+ // tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a
3343
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
3344
+ __device__ static inline void tcgen05_ld_32x32b_pack_16b(
3345
+ B32 (&out)[128],
3346
+ uint32_t taddr);
3347
+ */
3348
+ #if __cccl_ptx_isa >= 860
3349
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
3350
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
3351
+ _CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr)
3352
+ {
3353
+ static_assert(sizeof(_B32) == 4, "");
3354
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3355
+ asm(
3356
+ "tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
3357
+ "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, "
3358
+ "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, "
3359
+ "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, "
3360
+ "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, "
3361
+ "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, "
3362
+ "%121, %122, %123, %124, %125, %126, %127}, [%128];"
3363
+ : "=r"(__out[0]),
3364
+ "=r"(__out[1]),
3365
+ "=r"(__out[2]),
3366
+ "=r"(__out[3]),
3367
+ "=r"(__out[4]),
3368
+ "=r"(__out[5]),
3369
+ "=r"(__out[6]),
3370
+ "=r"(__out[7]),
3371
+ "=r"(__out[8]),
3372
+ "=r"(__out[9]),
3373
+ "=r"(__out[10]),
3374
+ "=r"(__out[11]),
3375
+ "=r"(__out[12]),
3376
+ "=r"(__out[13]),
3377
+ "=r"(__out[14]),
3378
+ "=r"(__out[15]),
3379
+ "=r"(__out[16]),
3380
+ "=r"(__out[17]),
3381
+ "=r"(__out[18]),
3382
+ "=r"(__out[19]),
3383
+ "=r"(__out[20]),
3384
+ "=r"(__out[21]),
3385
+ "=r"(__out[22]),
3386
+ "=r"(__out[23]),
3387
+ "=r"(__out[24]),
3388
+ "=r"(__out[25]),
3389
+ "=r"(__out[26]),
3390
+ "=r"(__out[27]),
3391
+ "=r"(__out[28]),
3392
+ "=r"(__out[29]),
3393
+ "=r"(__out[30]),
3394
+ "=r"(__out[31]),
3395
+ "=r"(__out[32]),
3396
+ "=r"(__out[33]),
3397
+ "=r"(__out[34]),
3398
+ "=r"(__out[35]),
3399
+ "=r"(__out[36]),
3400
+ "=r"(__out[37]),
3401
+ "=r"(__out[38]),
3402
+ "=r"(__out[39]),
3403
+ "=r"(__out[40]),
3404
+ "=r"(__out[41]),
3405
+ "=r"(__out[42]),
3406
+ "=r"(__out[43]),
3407
+ "=r"(__out[44]),
3408
+ "=r"(__out[45]),
3409
+ "=r"(__out[46]),
3410
+ "=r"(__out[47]),
3411
+ "=r"(__out[48]),
3412
+ "=r"(__out[49]),
3413
+ "=r"(__out[50]),
3414
+ "=r"(__out[51]),
3415
+ "=r"(__out[52]),
3416
+ "=r"(__out[53]),
3417
+ "=r"(__out[54]),
3418
+ "=r"(__out[55]),
3419
+ "=r"(__out[56]),
3420
+ "=r"(__out[57]),
3421
+ "=r"(__out[58]),
3422
+ "=r"(__out[59]),
3423
+ "=r"(__out[60]),
3424
+ "=r"(__out[61]),
3425
+ "=r"(__out[62]),
3426
+ "=r"(__out[63]),
3427
+ "=r"(__out[64]),
3428
+ "=r"(__out[65]),
3429
+ "=r"(__out[66]),
3430
+ "=r"(__out[67]),
3431
+ "=r"(__out[68]),
3432
+ "=r"(__out[69]),
3433
+ "=r"(__out[70]),
3434
+ "=r"(__out[71]),
3435
+ "=r"(__out[72]),
3436
+ "=r"(__out[73]),
3437
+ "=r"(__out[74]),
3438
+ "=r"(__out[75]),
3439
+ "=r"(__out[76]),
3440
+ "=r"(__out[77]),
3441
+ "=r"(__out[78]),
3442
+ "=r"(__out[79]),
3443
+ "=r"(__out[80]),
3444
+ "=r"(__out[81]),
3445
+ "=r"(__out[82]),
3446
+ "=r"(__out[83]),
3447
+ "=r"(__out[84]),
3448
+ "=r"(__out[85]),
3449
+ "=r"(__out[86]),
3450
+ "=r"(__out[87]),
3451
+ "=r"(__out[88]),
3452
+ "=r"(__out[89]),
3453
+ "=r"(__out[90]),
3454
+ "=r"(__out[91]),
3455
+ "=r"(__out[92]),
3456
+ "=r"(__out[93]),
3457
+ "=r"(__out[94]),
3458
+ "=r"(__out[95]),
3459
+ "=r"(__out[96]),
3460
+ "=r"(__out[97]),
3461
+ "=r"(__out[98]),
3462
+ "=r"(__out[99]),
3463
+ "=r"(__out[100]),
3464
+ "=r"(__out[101]),
3465
+ "=r"(__out[102]),
3466
+ "=r"(__out[103]),
3467
+ "=r"(__out[104]),
3468
+ "=r"(__out[105]),
3469
+ "=r"(__out[106]),
3470
+ "=r"(__out[107]),
3471
+ "=r"(__out[108]),
3472
+ "=r"(__out[109]),
3473
+ "=r"(__out[110]),
3474
+ "=r"(__out[111]),
3475
+ "=r"(__out[112]),
3476
+ "=r"(__out[113]),
3477
+ "=r"(__out[114]),
3478
+ "=r"(__out[115]),
3479
+ "=r"(__out[116]),
3480
+ "=r"(__out[117]),
3481
+ "=r"(__out[118]),
3482
+ "=r"(__out[119]),
3483
+ "=r"(__out[120]),
3484
+ "=r"(__out[121]),
3485
+ "=r"(__out[122]),
3486
+ "=r"(__out[123]),
3487
+ "=r"(__out[124]),
3488
+ "=r"(__out[125]),
3489
+ "=r"(__out[126]),
3490
+ "=r"(__out[127])
3491
+ : "r"(__taddr)
3492
+ : "memory");
3493
+ # else
3494
+ // Unsupported architectures will have a linker error with a semi-decent error message
3495
+ __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
3496
+ # endif
3497
+ }
3498
+ #endif // __cccl_ptx_isa >= 860
3499
+
3500
+ /*
+ // Summary (added in review): thin inline-asm wrapper; it emits exactly the one PTX instruction shown below.
+ //   out             : 1 destination element; out[0] is bound to asm operand %0 as an "=r" output.
+ //   taddr           : 32-bit source address, bound to the bracketed "r" input operand %1.
+ //   immHalfSplitoff : n32_t<N32> tag; its .value is bound to the trailing "n" (immediate) operand %2.
+ // B32 can be any 4-byte type; this is enforced by enable_if and repeated by a static_assert in the body.
+ // The asm statement carries a "memory" clobber and is not marked volatile.
+ // It is compiled only for NVHPC or when __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set; any other
+ // target calls an extern "C" function that is only declared here, which the body comment says yields a linker error.
+ // NOTE(review): taddr is presumably a Tensor Memory address as described in the PTX ISA - confirm against the spec.
3501
+ // tcgen05.ld.sync.aligned.16x32bx2.x1.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
3502
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
3503
+ __device__ static inline void tcgen05_ld_16x32bx2(
3504
+ B32 (&out)[1],
3505
+ uint32_t taddr,
3506
+ cuda::ptx::n32_t<N32> immHalfSplitoff);
3507
+ */
3508
+ #if __cccl_ptx_isa >= 860
3509
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__();
3510
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true, int _N32>
3511
+ _CCCL_DEVICE static inline void
3512
+ tcgen05_ld_16x32bx2(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff)
3513
+ {
3514
+ static_assert(sizeof(_B32) == 4, "");
3515
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3516
+ asm("tcgen05.ld.sync.aligned.16x32bx2.x1.b32 {%0}, [%1], %2;"
3517
+ : "=r"(__out[0])
3518
+ : "r"(__taddr), "n"(__immHalfSplitoff.value)
3519
+ : "memory");
3520
+ # else
3521
+ // Unsupported architectures will have a linker error with a semi-decent error message
3522
+ __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__();
3523
+ # endif
3524
+ }
3525
+ #endif // __cccl_ptx_isa >= 860
3526
+
3527
+ /*
+ // Summary (added in review): thin inline-asm wrapper; it emits exactly the one PTX instruction shown below.
+ //   out             : 1 destination element; out[0] is bound to asm operand %0 as an "=r" output.
+ //   taddr           : 32-bit source address, bound to the bracketed "r" input operand %1.
+ //   immHalfSplitoff : n32_t<N32> tag; its .value is bound to the trailing "n" (immediate) operand %2.
+ // B32 can be any 4-byte type; this is enforced by enable_if and repeated by a static_assert in the body.
+ // The asm statement carries a "memory" clobber and is not marked volatile.
+ // It is compiled only for NVHPC or when __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set; any other
+ // target calls an extern "C" function that is only declared here, which the body comment says yields a linker error.
+ // NOTE(review): taddr is presumably a Tensor Memory address as described in the PTX ISA - confirm against the spec.
3528
+ // tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
3529
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
3530
+ __device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
3531
+ B32 (&out)[1],
3532
+ uint32_t taddr,
3533
+ cuda::ptx::n32_t<N32> immHalfSplitoff);
3534
+ */
3535
+ #if __cccl_ptx_isa >= 860
3536
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
3537
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true, int _N32>
3538
+ _CCCL_DEVICE static inline void
3539
+ tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff)
3540
+ {
3541
+ static_assert(sizeof(_B32) == 4, "");
3542
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3543
+ asm("tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 {%0}, [%1], %2;"
3544
+ : "=r"(__out[0])
3545
+ : "r"(__taddr), "n"(__immHalfSplitoff.value)
3546
+ : "memory");
3547
+ # else
3548
+ // Unsupported architectures will have a linker error with a semi-decent error message
3549
+ __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
3550
+ # endif
3551
+ }
3552
+ #endif // __cccl_ptx_isa >= 860
3553
+
3554
+ /*
+ // Summary (added in review): thin inline-asm wrapper; it emits exactly the one PTX instruction shown below.
+ //   out             : 2 destination elements; out[0], out[1] are bound to asm operands %0, %1 as "=r" outputs.
+ //   taddr           : 32-bit source address, bound to the bracketed "r" input operand %2.
+ //   immHalfSplitoff : n32_t<N32> tag; its .value is bound to the trailing "n" (immediate) operand %3.
+ // B32 can be any 4-byte type; this is enforced by enable_if and repeated by a static_assert in the body.
+ // The asm statement carries a "memory" clobber and is not marked volatile.
+ // It is compiled only for NVHPC or when __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set; any other
+ // target calls an extern "C" function that is only declared here, which the body comment says yields a linker error.
+ // NOTE(review): taddr is presumably a Tensor Memory address as described in the PTX ISA - confirm against the spec.
3555
+ // tcgen05.ld.sync.aligned.16x32bx2.x2.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
3556
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
3557
+ __device__ static inline void tcgen05_ld_16x32bx2(
3558
+ B32 (&out)[2],
3559
+ uint32_t taddr,
3560
+ cuda::ptx::n32_t<N32> immHalfSplitoff);
3561
+ */
3562
+ #if __cccl_ptx_isa >= 860
3563
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__();
3564
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true, int _N32>
3565
+ _CCCL_DEVICE static inline void
3566
+ tcgen05_ld_16x32bx2(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff)
3567
+ {
3568
+ static_assert(sizeof(_B32) == 4, "");
3569
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3570
+ asm("tcgen05.ld.sync.aligned.16x32bx2.x2.b32 {%0, %1}, [%2], %3;"
3571
+ : "=r"(__out[0]), "=r"(__out[1])
3572
+ : "r"(__taddr), "n"(__immHalfSplitoff.value)
3573
+ : "memory");
3574
+ # else
3575
+ // Unsupported architectures will have a linker error with a semi-decent error message
3576
+ __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__();
3577
+ # endif
3578
+ }
3579
+ #endif // __cccl_ptx_isa >= 860
3580
+
3581
+ /*
+ // Summary (added in review): thin inline-asm wrapper; it emits exactly the one PTX instruction shown below.
+ //   out             : 2 destination elements; out[0], out[1] are bound to asm operands %0, %1 as "=r" outputs.
+ //   taddr           : 32-bit source address, bound to the bracketed "r" input operand %2.
+ //   immHalfSplitoff : n32_t<N32> tag; its .value is bound to the trailing "n" (immediate) operand %3.
+ // B32 can be any 4-byte type; this is enforced by enable_if and repeated by a static_assert in the body.
+ // The asm statement carries a "memory" clobber and is not marked volatile.
+ // It is compiled only for NVHPC or when __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set; any other
+ // target calls an extern "C" function that is only declared here, which the body comment says yields a linker error.
+ // NOTE(review): taddr is presumably a Tensor Memory address as described in the PTX ISA - confirm against the spec.
3582
+ // tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
3583
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
3584
+ __device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
3585
+ B32 (&out)[2],
3586
+ uint32_t taddr,
3587
+ cuda::ptx::n32_t<N32> immHalfSplitoff);
3588
+ */
3589
+ #if __cccl_ptx_isa >= 860
3590
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
3591
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true, int _N32>
3592
+ _CCCL_DEVICE static inline void
3593
+ tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff)
3594
+ {
3595
+ static_assert(sizeof(_B32) == 4, "");
3596
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3597
+ asm("tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 {%0, %1}, [%2], %3;"
3598
+ : "=r"(__out[0]), "=r"(__out[1])
3599
+ : "r"(__taddr), "n"(__immHalfSplitoff.value)
3600
+ : "memory");
3601
+ # else
3602
+ // Unsupported architectures will have a linker error with a semi-decent error message
3603
+ __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
3604
+ # endif
3605
+ }
3606
+ #endif // __cccl_ptx_isa >= 860
3607
+
3608
+ /*
+ // Summary (added in review): thin inline-asm wrapper; it emits exactly the one PTX instruction shown below.
+ //   out             : 4 destination elements; out[0]..out[3] are bound in order to asm operands %0..%3 as "=r" outputs.
+ //   taddr           : 32-bit source address, bound to the bracketed "r" input operand %4.
+ //   immHalfSplitoff : n32_t<N32> tag; its .value is bound to the trailing "n" (immediate) operand %5.
+ // B32 can be any 4-byte type; this is enforced by enable_if and repeated by a static_assert in the body.
+ // The asm statement carries a "memory" clobber and is not marked volatile.
+ // It is compiled only for NVHPC or when __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set; any other
+ // target calls an extern "C" function that is only declared here, which the body comment says yields a linker error.
+ // NOTE(review): taddr is presumably a Tensor Memory address as described in the PTX ISA - confirm against the spec.
3609
+ // tcgen05.ld.sync.aligned.16x32bx2.x4.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
3610
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
3611
+ __device__ static inline void tcgen05_ld_16x32bx2(
3612
+ B32 (&out)[4],
3613
+ uint32_t taddr,
3614
+ cuda::ptx::n32_t<N32> immHalfSplitoff);
3615
+ */
3616
+ #if __cccl_ptx_isa >= 860
3617
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__();
3618
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true, int _N32>
3619
+ _CCCL_DEVICE static inline void
3620
+ tcgen05_ld_16x32bx2(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff)
3621
+ {
3622
+ static_assert(sizeof(_B32) == 4, "");
3623
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3624
+ asm("tcgen05.ld.sync.aligned.16x32bx2.x4.b32 {%0, %1, %2, %3}, [%4], %5;"
3625
+ : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3])
3626
+ : "r"(__taddr), "n"(__immHalfSplitoff.value)
3627
+ : "memory");
3628
+ # else
3629
+ // Unsupported architectures will have a linker error with a semi-decent error message
3630
+ __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__();
3631
+ # endif
3632
+ }
3633
+ #endif // __cccl_ptx_isa >= 860
3634
+
3635
+ /*
+ // Summary (added in review): thin inline-asm wrapper; it emits exactly the one PTX instruction shown below.
+ //   out             : 4 destination elements; out[0]..out[3] are bound in order to asm operands %0..%3 as "=r" outputs.
+ //   taddr           : 32-bit source address, bound to the bracketed "r" input operand %4.
+ //   immHalfSplitoff : n32_t<N32> tag; its .value is bound to the trailing "n" (immediate) operand %5.
+ // B32 can be any 4-byte type; this is enforced by enable_if and repeated by a static_assert in the body.
+ // The asm statement carries a "memory" clobber and is not marked volatile.
+ // It is compiled only for NVHPC or when __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set; any other
+ // target calls an extern "C" function that is only declared here, which the body comment says yields a linker error.
+ // NOTE(review): taddr is presumably a Tensor Memory address as described in the PTX ISA - confirm against the spec.
3636
+ // tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
3637
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
3638
+ __device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
3639
+ B32 (&out)[4],
3640
+ uint32_t taddr,
3641
+ cuda::ptx::n32_t<N32> immHalfSplitoff);
3642
+ */
3643
+ #if __cccl_ptx_isa >= 860
3644
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
3645
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true, int _N32>
3646
+ _CCCL_DEVICE static inline void
3647
+ tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff)
3648
+ {
3649
+ static_assert(sizeof(_B32) == 4, "");
3650
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3651
+ asm("tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 {%0, %1, %2, %3}, [%4], %5;"
3652
+ : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3])
3653
+ : "r"(__taddr), "n"(__immHalfSplitoff.value)
3654
+ : "memory");
3655
+ # else
3656
+ // Unsupported architectures will have a linker error with a semi-decent error message
3657
+ __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
3658
+ # endif
3659
+ }
3660
+ #endif // __cccl_ptx_isa >= 860
3661
+
3662
+ /*
+ // Summary (added in review): thin inline-asm wrapper; it emits exactly the one PTX instruction shown below.
+ //   out             : 8 destination elements; out[0]..out[7] are bound in order to asm operands %0..%7 as "=r" outputs.
+ //   taddr           : 32-bit source address, bound to the bracketed "r" input operand %8.
+ //   immHalfSplitoff : n32_t<N32> tag; its .value is bound to the trailing "n" (immediate) operand %9.
+ // B32 can be any 4-byte type; this is enforced by enable_if and repeated by a static_assert in the body.
+ // The asm statement carries a "memory" clobber and is not marked volatile.
+ // It is compiled only for NVHPC or when __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set; any other
+ // target calls an extern "C" function that is only declared here, which the body comment says yields a linker error.
+ // NOTE(review): taddr is presumably a Tensor Memory address as described in the PTX ISA - confirm against the spec.
3663
+ // tcgen05.ld.sync.aligned.16x32bx2.x8.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
3664
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
3665
+ __device__ static inline void tcgen05_ld_16x32bx2(
3666
+ B32 (&out)[8],
3667
+ uint32_t taddr,
3668
+ cuda::ptx::n32_t<N32> immHalfSplitoff);
3669
+ */
3670
+ #if __cccl_ptx_isa >= 860
3671
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__();
3672
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true, int _N32>
3673
+ _CCCL_DEVICE static inline void
3674
+ tcgen05_ld_16x32bx2(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff)
3675
+ {
3676
+ static_assert(sizeof(_B32) == 4, "");
3677
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3678
+ asm("tcgen05.ld.sync.aligned.16x32bx2.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8], %9;"
3679
+ : "=r"(__out[0]),
3680
+ "=r"(__out[1]),
3681
+ "=r"(__out[2]),
3682
+ "=r"(__out[3]),
3683
+ "=r"(__out[4]),
3684
+ "=r"(__out[5]),
3685
+ "=r"(__out[6]),
3686
+ "=r"(__out[7])
3687
+ : "r"(__taddr), "n"(__immHalfSplitoff.value)
3688
+ : "memory");
3689
+ # else
3690
+ // Unsupported architectures will have a linker error with a semi-decent error message
3691
+ __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__();
3692
+ # endif
3693
+ }
3694
+ #endif // __cccl_ptx_isa >= 860
3695
+
3696
+ /*
+ // Summary (added in review): thin inline-asm wrapper; it emits exactly the one PTX instruction shown below.
+ //   out             : 8 destination elements; out[0]..out[7] are bound in order to asm operands %0..%7 as "=r" outputs.
+ //   taddr           : 32-bit source address, bound to the bracketed "r" input operand %8.
+ //   immHalfSplitoff : n32_t<N32> tag; its .value is bound to the trailing "n" (immediate) operand %9.
+ // B32 can be any 4-byte type; this is enforced by enable_if and repeated by a static_assert in the body.
+ // The asm statement carries a "memory" clobber and is not marked volatile.
+ // It is compiled only for NVHPC or when __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set; any other
+ // target calls an extern "C" function that is only declared here, which the body comment says yields a linker error.
+ // NOTE(review): taddr is presumably a Tensor Memory address as described in the PTX ISA - confirm against the spec.
3697
+ // tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
3698
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
3699
+ __device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
3700
+ B32 (&out)[8],
3701
+ uint32_t taddr,
3702
+ cuda::ptx::n32_t<N32> immHalfSplitoff);
3703
+ */
3704
+ #if __cccl_ptx_isa >= 860
3705
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
3706
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true, int _N32>
3707
+ _CCCL_DEVICE static inline void
3708
+ tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff)
3709
+ {
3710
+ static_assert(sizeof(_B32) == 4, "");
3711
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3712
+ asm("tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8], %9;"
3713
+ : "=r"(__out[0]),
3714
+ "=r"(__out[1]),
3715
+ "=r"(__out[2]),
3716
+ "=r"(__out[3]),
3717
+ "=r"(__out[4]),
3718
+ "=r"(__out[5]),
3719
+ "=r"(__out[6]),
3720
+ "=r"(__out[7])
3721
+ : "r"(__taddr), "n"(__immHalfSplitoff.value)
3722
+ : "memory");
3723
+ # else
3724
+ // Unsupported architectures will have a linker error with a semi-decent error message
3725
+ __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
3726
+ # endif
3727
+ }
3728
+ #endif // __cccl_ptx_isa >= 860
3729
+
3730
+ /*
+ // Summary (added in review): thin inline-asm wrapper; it emits exactly the one PTX instruction shown below.
+ //   out             : 16 destination elements; out[0]..out[15] are bound in order to asm operands %0..%15 as "=r" outputs.
+ //   taddr           : 32-bit source address, bound to the bracketed "r" input operand %16.
+ //   immHalfSplitoff : n32_t<N32> tag; its .value is bound to the trailing "n" (immediate) operand %17.
+ // B32 can be any 4-byte type; this is enforced by enable_if and repeated by a static_assert in the body.
+ // The asm statement carries a "memory" clobber and is not marked volatile.
+ // It is compiled only for NVHPC or when __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set; any other
+ // target calls an extern "C" function that is only declared here, which the body comment says yields a linker error.
+ // NOTE(review): taddr is presumably a Tensor Memory address as described in the PTX ISA - confirm against the spec.
3731
+ // tcgen05.ld.sync.aligned.16x32bx2.x16.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
3732
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
3733
+ __device__ static inline void tcgen05_ld_16x32bx2(
3734
+ B32 (&out)[16],
3735
+ uint32_t taddr,
3736
+ cuda::ptx::n32_t<N32> immHalfSplitoff);
3737
+ */
3738
+ #if __cccl_ptx_isa >= 860
3739
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__();
3740
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true, int _N32>
3741
+ _CCCL_DEVICE static inline void
3742
+ tcgen05_ld_16x32bx2(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff)
3743
+ {
3744
+ static_assert(sizeof(_B32) == 4, "");
3745
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3746
+ asm("tcgen05.ld.sync.aligned.16x32bx2.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, "
3747
+ "%15}, [%16], %17;"
3748
+ : "=r"(__out[0]),
3749
+ "=r"(__out[1]),
3750
+ "=r"(__out[2]),
3751
+ "=r"(__out[3]),
3752
+ "=r"(__out[4]),
3753
+ "=r"(__out[5]),
3754
+ "=r"(__out[6]),
3755
+ "=r"(__out[7]),
3756
+ "=r"(__out[8]),
3757
+ "=r"(__out[9]),
3758
+ "=r"(__out[10]),
3759
+ "=r"(__out[11]),
3760
+ "=r"(__out[12]),
3761
+ "=r"(__out[13]),
3762
+ "=r"(__out[14]),
3763
+ "=r"(__out[15])
3764
+ : "r"(__taddr), "n"(__immHalfSplitoff.value)
3765
+ : "memory");
3766
+ # else
3767
+ // Unsupported architectures will have a linker error with a semi-decent error message
3768
+ __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__();
3769
+ # endif
3770
+ }
3771
+ #endif // __cccl_ptx_isa >= 860
3772
+
3773
+ /*
+ // Summary (added in review): thin inline-asm wrapper; it emits exactly the one PTX instruction shown below.
+ //   out             : 16 destination elements; out[0]..out[15] are bound in order to asm operands %0..%15 as "=r" outputs.
+ //   taddr           : 32-bit source address, bound to the bracketed "r" input operand %16.
+ //   immHalfSplitoff : n32_t<N32> tag; its .value is bound to the trailing "n" (immediate) operand %17.
+ // B32 can be any 4-byte type; this is enforced by enable_if and repeated by a static_assert in the body.
+ // The asm statement carries a "memory" clobber and is not marked volatile.
+ // It is compiled only for NVHPC or when __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set; any other
+ // target calls an extern "C" function that is only declared here, which the body comment says yields a linker error.
+ // NOTE(review): taddr is presumably a Tensor Memory address as described in the PTX ISA - confirm against the spec.
3774
+ // tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
3775
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
3776
+ __device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
3777
+ B32 (&out)[16],
3778
+ uint32_t taddr,
3779
+ cuda::ptx::n32_t<N32> immHalfSplitoff);
3780
+ */
3781
+ #if __cccl_ptx_isa >= 860
3782
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
3783
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true, int _N32>
3784
+ _CCCL_DEVICE static inline void
3785
+ tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff)
3786
+ {
3787
+ static_assert(sizeof(_B32) == 4, "");
3788
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3789
+ asm("tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
3790
+ "%14, %15}, [%16], %17;"
3791
+ : "=r"(__out[0]),
3792
+ "=r"(__out[1]),
3793
+ "=r"(__out[2]),
3794
+ "=r"(__out[3]),
3795
+ "=r"(__out[4]),
3796
+ "=r"(__out[5]),
3797
+ "=r"(__out[6]),
3798
+ "=r"(__out[7]),
3799
+ "=r"(__out[8]),
3800
+ "=r"(__out[9]),
3801
+ "=r"(__out[10]),
3802
+ "=r"(__out[11]),
3803
+ "=r"(__out[12]),
3804
+ "=r"(__out[13]),
3805
+ "=r"(__out[14]),
3806
+ "=r"(__out[15])
3807
+ : "r"(__taddr), "n"(__immHalfSplitoff.value)
3808
+ : "memory");
3809
+ # else
3810
+ // Unsupported architectures will have a linker error with a semi-decent error message
3811
+ __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
3812
+ # endif
3813
+ }
3814
+ #endif // __cccl_ptx_isa >= 860
3815
+
3816
+ /*
+ // Summary (added in review): thin inline-asm wrapper; it emits exactly the one PTX instruction shown below.
+ //   out             : 32 destination elements; out[0]..out[31] are bound in order to asm operands %0..%31 as "=r" outputs.
+ //   taddr           : 32-bit source address, bound to the bracketed "r" input operand %32.
+ //   immHalfSplitoff : n32_t<N32> tag; its .value is bound to the trailing "n" (immediate) operand %33.
+ // B32 can be any 4-byte type; this is enforced by enable_if and repeated by a static_assert in the body.
+ // The asm statement carries a "memory" clobber and is not marked volatile.
+ // It is compiled only for NVHPC or when __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set; any other
+ // target calls an extern "C" function that is only declared here, which the body comment says yields a linker error.
+ // NOTE(review): taddr is presumably a Tensor Memory address as described in the PTX ISA - confirm against the spec.
3817
+ // tcgen05.ld.sync.aligned.16x32bx2.x32.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
3818
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
3819
+ __device__ static inline void tcgen05_ld_16x32bx2(
3820
+ B32 (&out)[32],
3821
+ uint32_t taddr,
3822
+ cuda::ptx::n32_t<N32> immHalfSplitoff);
3823
+ */
3824
+ #if __cccl_ptx_isa >= 860
3825
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__();
3826
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true, int _N32>
3827
+ _CCCL_DEVICE static inline void
3828
+ tcgen05_ld_16x32bx2(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff)
3829
+ {
3830
+ static_assert(sizeof(_B32) == 4, "");
3831
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3832
+ asm(
3833
+ "tcgen05.ld.sync.aligned.16x32bx2.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, "
3834
+ "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32], %33;"
3835
+ : "=r"(__out[0]),
3836
+ "=r"(__out[1]),
3837
+ "=r"(__out[2]),
3838
+ "=r"(__out[3]),
3839
+ "=r"(__out[4]),
3840
+ "=r"(__out[5]),
3841
+ "=r"(__out[6]),
3842
+ "=r"(__out[7]),
3843
+ "=r"(__out[8]),
3844
+ "=r"(__out[9]),
3845
+ "=r"(__out[10]),
3846
+ "=r"(__out[11]),
3847
+ "=r"(__out[12]),
3848
+ "=r"(__out[13]),
3849
+ "=r"(__out[14]),
3850
+ "=r"(__out[15]),
3851
+ "=r"(__out[16]),
3852
+ "=r"(__out[17]),
3853
+ "=r"(__out[18]),
3854
+ "=r"(__out[19]),
3855
+ "=r"(__out[20]),
3856
+ "=r"(__out[21]),
3857
+ "=r"(__out[22]),
3858
+ "=r"(__out[23]),
3859
+ "=r"(__out[24]),
3860
+ "=r"(__out[25]),
3861
+ "=r"(__out[26]),
3862
+ "=r"(__out[27]),
3863
+ "=r"(__out[28]),
3864
+ "=r"(__out[29]),
3865
+ "=r"(__out[30]),
3866
+ "=r"(__out[31])
3867
+ : "r"(__taddr), "n"(__immHalfSplitoff.value)
3868
+ : "memory");
3869
+ # else
3870
+ // Unsupported architectures will have a linker error with a semi-decent error message
3871
+ __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__();
3872
+ # endif
3873
+ }
3874
+ #endif // __cccl_ptx_isa >= 860
3875
+
3876
+ /*
+ // Summary (added in review): thin inline-asm wrapper; it emits exactly the one PTX instruction shown below.
+ //   out             : 32 destination elements; out[0]..out[31] are bound in order to asm operands %0..%31 as "=r" outputs.
+ //   taddr           : 32-bit source address, bound to the bracketed "r" input operand %32.
+ //   immHalfSplitoff : n32_t<N32> tag; its .value is bound to the trailing "n" (immediate) operand %33.
+ // B32 can be any 4-byte type; this is enforced by enable_if and repeated by a static_assert in the body.
+ // The asm statement carries a "memory" clobber and is not marked volatile.
+ // It is compiled only for NVHPC or when __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM101_ALL is set; any other
+ // target calls an extern "C" function that is only declared here, which the body comment says yields a linker error.
+ // NOTE(review): taddr is presumably a Tensor Memory address as described in the PTX ISA - confirm against the spec.
3877
+ // tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a
3878
+ template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
3879
+ __device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
3880
+ B32 (&out)[32],
3881
+ uint32_t taddr,
3882
+ cuda::ptx::n32_t<N32> immHalfSplitoff);
3883
+ */
3884
+ #if __cccl_ptx_isa >= 860
3885
+ extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
3886
+ template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true, int _N32>
3887
+ _CCCL_DEVICE static inline void
3888
+ tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff)
3889
+ {
3890
+ static_assert(sizeof(_B32) == 4, "");
3891
+ # if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
3892
+ asm(
3893
+ "tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
3894
+ "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32], %33;"
3895
+ : "=r"(__out[0]),
3896
+ "=r"(__out[1]),
3897
+ "=r"(__out[2]),
3898
+ "=r"(__out[3]),
3899
+ "=r"(__out[4]),
3900
+ "=r"(__out[5]),
3901
+ "=r"(__out[6]),
3902
+ "=r"(__out[7]),
3903
+ "=r"(__out[8]),
3904
+ "=r"(__out[9]),
3905
+ "=r"(__out[10]),
3906
+ "=r"(__out[11]),
3907
+ "=r"(__out[12]),
3908
+ "=r"(__out[13]),
3909
+ "=r"(__out[14]),
3910
+ "=r"(__out[15]),
3911
+ "=r"(__out[16]),
3912
+ "=r"(__out[17]),
3913
+ "=r"(__out[18]),
3914
+ "=r"(__out[19]),
3915
+ "=r"(__out[20]),
3916
+ "=r"(__out[21]),
3917
+ "=r"(__out[22]),
3918
+ "=r"(__out[23]),
3919
+ "=r"(__out[24]),
3920
+ "=r"(__out[25]),
3921
+ "=r"(__out[26]),
3922
+ "=r"(__out[27]),
3923
+ "=r"(__out[28]),
3924
+ "=r"(__out[29]),
3925
+ "=r"(__out[30]),
3926
+ "=r"(__out[31])
3927
+ : "r"(__taddr), "n"(__immHalfSplitoff.value)
3928
+ : "memory");
3929
+ # else
3930
+ // Unsupported architectures will have a linker error with a semi-decent error message
3931
+ __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
3932
+ # endif
3933
+ }
3934
+ #endif // __cccl_ptx_isa >= 860
3935
+
3936
/*
// PTX:          tcgen05.ld.sync.aligned.16x32bx2.x64.b32 out, [taddr], immHalfSplitoff;
// Availability: PTX ISA 86, SM_100a, SM_101a
//
// out             : 64-element array of a 4-byte type; out[i] is bound to the i-th destination register.
// taddr           : 32-bit value used as the [taddr] address operand.
// immHalfSplitoff : compile-time constant whose value is emitted as an immediate operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[64],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);
*/
#if __cccl_ptx_isa >= 860
// Only declared here. The fallback branch below references it so that building for a target
// without the required feature set is reported at link time, with the requirement in the symbol name.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true, int _N32>
_CCCL_DEVICE static inline void
tcgen05_ld_16x32bx2(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff)
{
  // Same condition as the enable_if in the template header, repeated as a hard compile-time check.
  static_assert(sizeof(_B32) == 4, "");
# if !(_CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL)
  // Neither NVHPC nor an SM_100a / SM_101a feature macro: emit the call that fails to link.
  __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__();
# else
  // Operand map: %0..%63 -> __out[0..63] (outputs), %64 -> __taddr (register), %65 -> immediate.
  asm(
    "tcgen05.ld.sync.aligned.16x32bx2.x64.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, "
    "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, "
    "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, "
    "%60, %61, %62, %63}, [%64], %65;"
    : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]),
      "=r"(__out[4]), "=r"(__out[5]), "=r"(__out[6]), "=r"(__out[7]),
      "=r"(__out[8]), "=r"(__out[9]), "=r"(__out[10]), "=r"(__out[11]),
      "=r"(__out[12]), "=r"(__out[13]), "=r"(__out[14]), "=r"(__out[15]),
      "=r"(__out[16]), "=r"(__out[17]), "=r"(__out[18]), "=r"(__out[19]),
      "=r"(__out[20]), "=r"(__out[21]), "=r"(__out[22]), "=r"(__out[23]),
      "=r"(__out[24]), "=r"(__out[25]), "=r"(__out[26]), "=r"(__out[27]),
      "=r"(__out[28]), "=r"(__out[29]), "=r"(__out[30]), "=r"(__out[31]),
      "=r"(__out[32]), "=r"(__out[33]), "=r"(__out[34]), "=r"(__out[35]),
      "=r"(__out[36]), "=r"(__out[37]), "=r"(__out[38]), "=r"(__out[39]),
      "=r"(__out[40]), "=r"(__out[41]), "=r"(__out[42]), "=r"(__out[43]),
      "=r"(__out[44]), "=r"(__out[45]), "=r"(__out[46]), "=r"(__out[47]),
      "=r"(__out[48]), "=r"(__out[49]), "=r"(__out[50]), "=r"(__out[51]),
      "=r"(__out[52]), "=r"(__out[53]), "=r"(__out[54]), "=r"(__out[55]),
      "=r"(__out[56]), "=r"(__out[57]), "=r"(__out[58]), "=r"(__out[59]),
      "=r"(__out[60]), "=r"(__out[61]), "=r"(__out[62]), "=r"(__out[63])
    : "r"(__taddr), "n"(__immHalfSplitoff.value)
    : "memory");
# endif
}
#endif // __cccl_ptx_isa >= 860
4029
+
4030
/*
// PTX:          tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 out, [taddr], immHalfSplitoff;
// Availability: PTX ISA 86, SM_100a, SM_101a
//
// out             : 64-element array of a 4-byte type; out[i] is bound to the i-th destination register.
// taddr           : 32-bit value used as the [taddr] address operand.
// immHalfSplitoff : compile-time constant whose value is emitted as an immediate operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[64],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);
*/
#if __cccl_ptx_isa >= 860
// Only declared here. The fallback branch below references it so that building for a target
// without the required feature set is reported at link time, with the requirement in the symbol name.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true, int _N32>
_CCCL_DEVICE static inline void
tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff)
{
  // Same condition as the enable_if in the template header, repeated as a hard compile-time check.
  static_assert(sizeof(_B32) == 4, "");
# if !(_CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL)
  // Neither NVHPC nor an SM_100a / SM_101a feature macro: emit the call that fails to link.
  __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
# else
  // Operand map: %0..%63 -> __out[0..63] (outputs), %64 -> __taddr (register), %65 -> immediate.
  asm(
    "tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
    "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, "
    "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, "
    "%58, %59, %60, %61, %62, %63}, [%64], %65;"
    : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]),
      "=r"(__out[4]), "=r"(__out[5]), "=r"(__out[6]), "=r"(__out[7]),
      "=r"(__out[8]), "=r"(__out[9]), "=r"(__out[10]), "=r"(__out[11]),
      "=r"(__out[12]), "=r"(__out[13]), "=r"(__out[14]), "=r"(__out[15]),
      "=r"(__out[16]), "=r"(__out[17]), "=r"(__out[18]), "=r"(__out[19]),
      "=r"(__out[20]), "=r"(__out[21]), "=r"(__out[22]), "=r"(__out[23]),
      "=r"(__out[24]), "=r"(__out[25]), "=r"(__out[26]), "=r"(__out[27]),
      "=r"(__out[28]), "=r"(__out[29]), "=r"(__out[30]), "=r"(__out[31]),
      "=r"(__out[32]), "=r"(__out[33]), "=r"(__out[34]), "=r"(__out[35]),
      "=r"(__out[36]), "=r"(__out[37]), "=r"(__out[38]), "=r"(__out[39]),
      "=r"(__out[40]), "=r"(__out[41]), "=r"(__out[42]), "=r"(__out[43]),
      "=r"(__out[44]), "=r"(__out[45]), "=r"(__out[46]), "=r"(__out[47]),
      "=r"(__out[48]), "=r"(__out[49]), "=r"(__out[50]), "=r"(__out[51]),
      "=r"(__out[52]), "=r"(__out[53]), "=r"(__out[54]), "=r"(__out[55]),
      "=r"(__out[56]), "=r"(__out[57]), "=r"(__out[58]), "=r"(__out[59]),
      "=r"(__out[60]), "=r"(__out[61]), "=r"(__out[62]), "=r"(__out[63])
    : "r"(__taddr), "n"(__immHalfSplitoff.value)
    : "memory");
# endif
}
#endif // __cccl_ptx_isa >= 860
4123
+
4124
/*
// PTX:          tcgen05.ld.sync.aligned.16x32bx2.x128.b32 out, [taddr], immHalfSplitoff;
// Availability: PTX ISA 86, SM_100a, SM_101a
//
// out             : 128-element array of a 4-byte type; out[i] is bound to the i-th destination register.
// taddr           : 32-bit value used as the [taddr] address operand.
// immHalfSplitoff : compile-time constant whose value is emitted as an immediate operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2(
  B32 (&out)[128],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);
*/
#if __cccl_ptx_isa >= 860
// Only declared here. The fallback branch below references it so that building for a target
// without the required feature set is reported at link time, with the requirement in the symbol name.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true, int _N32>
_CCCL_DEVICE static inline void
tcgen05_ld_16x32bx2(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff)
{
  // Same condition as the enable_if in the template header, repeated as a hard compile-time check.
  static_assert(sizeof(_B32) == 4, "");
# if !(_CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL)
  // Neither NVHPC nor an SM_100a / SM_101a feature macro: emit the call that fails to link.
  __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__();
# else
  // Operand map: %0..%127 -> __out[0..127] (outputs), %128 -> __taddr (register), %129 -> immediate.
  asm(
    "tcgen05.ld.sync.aligned.16x32bx2.x128.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, "
    "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, "
    "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, "
    "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, "
    "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, "
    "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, "
    "%123, %124, %125, %126, %127}, [%128], %129;"
    : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]),
      "=r"(__out[4]), "=r"(__out[5]), "=r"(__out[6]), "=r"(__out[7]),
      "=r"(__out[8]), "=r"(__out[9]), "=r"(__out[10]), "=r"(__out[11]),
      "=r"(__out[12]), "=r"(__out[13]), "=r"(__out[14]), "=r"(__out[15]),
      "=r"(__out[16]), "=r"(__out[17]), "=r"(__out[18]), "=r"(__out[19]),
      "=r"(__out[20]), "=r"(__out[21]), "=r"(__out[22]), "=r"(__out[23]),
      "=r"(__out[24]), "=r"(__out[25]), "=r"(__out[26]), "=r"(__out[27]),
      "=r"(__out[28]), "=r"(__out[29]), "=r"(__out[30]), "=r"(__out[31]),
      "=r"(__out[32]), "=r"(__out[33]), "=r"(__out[34]), "=r"(__out[35]),
      "=r"(__out[36]), "=r"(__out[37]), "=r"(__out[38]), "=r"(__out[39]),
      "=r"(__out[40]), "=r"(__out[41]), "=r"(__out[42]), "=r"(__out[43]),
      "=r"(__out[44]), "=r"(__out[45]), "=r"(__out[46]), "=r"(__out[47]),
      "=r"(__out[48]), "=r"(__out[49]), "=r"(__out[50]), "=r"(__out[51]),
      "=r"(__out[52]), "=r"(__out[53]), "=r"(__out[54]), "=r"(__out[55]),
      "=r"(__out[56]), "=r"(__out[57]), "=r"(__out[58]), "=r"(__out[59]),
      "=r"(__out[60]), "=r"(__out[61]), "=r"(__out[62]), "=r"(__out[63]),
      "=r"(__out[64]), "=r"(__out[65]), "=r"(__out[66]), "=r"(__out[67]),
      "=r"(__out[68]), "=r"(__out[69]), "=r"(__out[70]), "=r"(__out[71]),
      "=r"(__out[72]), "=r"(__out[73]), "=r"(__out[74]), "=r"(__out[75]),
      "=r"(__out[76]), "=r"(__out[77]), "=r"(__out[78]), "=r"(__out[79]),
      "=r"(__out[80]), "=r"(__out[81]), "=r"(__out[82]), "=r"(__out[83]),
      "=r"(__out[84]), "=r"(__out[85]), "=r"(__out[86]), "=r"(__out[87]),
      "=r"(__out[88]), "=r"(__out[89]), "=r"(__out[90]), "=r"(__out[91]),
      "=r"(__out[92]), "=r"(__out[93]), "=r"(__out[94]), "=r"(__out[95]),
      "=r"(__out[96]), "=r"(__out[97]), "=r"(__out[98]), "=r"(__out[99]),
      "=r"(__out[100]), "=r"(__out[101]), "=r"(__out[102]), "=r"(__out[103]),
      "=r"(__out[104]), "=r"(__out[105]), "=r"(__out[106]), "=r"(__out[107]),
      "=r"(__out[108]), "=r"(__out[109]), "=r"(__out[110]), "=r"(__out[111]),
      "=r"(__out[112]), "=r"(__out[113]), "=r"(__out[114]), "=r"(__out[115]),
      "=r"(__out[116]), "=r"(__out[117]), "=r"(__out[118]), "=r"(__out[119]),
      "=r"(__out[120]), "=r"(__out[121]), "=r"(__out[122]), "=r"(__out[123]),
      "=r"(__out[124]), "=r"(__out[125]), "=r"(__out[126]), "=r"(__out[127])
    : "r"(__taddr), "n"(__immHalfSplitoff.value)
    : "memory");
# endif
}
#endif // __cccl_ptx_isa >= 860
4284
+
4285
/*
// PTX:          tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 out, [taddr], immHalfSplitoff;
// Availability: PTX ISA 86, SM_100a, SM_101a
//
// out             : 128-element array of a 4-byte type; out[i] is bound to the i-th destination register.
// taddr           : 32-bit value used as the [taddr] address operand.
// immHalfSplitoff : compile-time constant whose value is emitted as an immediate operand.
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, int N32>
__device__ static inline void tcgen05_ld_16x32bx2_pack_16b(
  B32 (&out)[128],
  uint32_t taddr,
  cuda::ptx::n32_t<N32> immHalfSplitoff);
*/
#if __cccl_ptx_isa >= 860
// Only declared here. The fallback branch below references it so that building for a target
// without the required feature set is reported at link time, with the requirement in the symbol name.
extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true, int _N32>
_CCCL_DEVICE static inline void
tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff)
{
  // Same condition as the enable_if in the template header, repeated as a hard compile-time check.
  static_assert(sizeof(_B32) == 4, "");
# if !(_CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL)
  // Neither NVHPC nor an SM_100a / SM_101a feature macro: emit the call that fails to link.
  __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__();
# else
  // Operand map: %0..%127 -> __out[0..127] (outputs), %128 -> __taddr (register), %129 -> immediate.
  asm(
    "tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, "
    "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, "
    "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, "
    "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, "
    "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, "
    "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, "
    "%121, %122, %123, %124, %125, %126, %127}, [%128], %129;"
    : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]),
      "=r"(__out[4]), "=r"(__out[5]), "=r"(__out[6]), "=r"(__out[7]),
      "=r"(__out[8]), "=r"(__out[9]), "=r"(__out[10]), "=r"(__out[11]),
      "=r"(__out[12]), "=r"(__out[13]), "=r"(__out[14]), "=r"(__out[15]),
      "=r"(__out[16]), "=r"(__out[17]), "=r"(__out[18]), "=r"(__out[19]),
      "=r"(__out[20]), "=r"(__out[21]), "=r"(__out[22]), "=r"(__out[23]),
      "=r"(__out[24]), "=r"(__out[25]), "=r"(__out[26]), "=r"(__out[27]),
      "=r"(__out[28]), "=r"(__out[29]), "=r"(__out[30]), "=r"(__out[31]),
      "=r"(__out[32]), "=r"(__out[33]), "=r"(__out[34]), "=r"(__out[35]),
      "=r"(__out[36]), "=r"(__out[37]), "=r"(__out[38]), "=r"(__out[39]),
      "=r"(__out[40]), "=r"(__out[41]), "=r"(__out[42]), "=r"(__out[43]),
      "=r"(__out[44]), "=r"(__out[45]), "=r"(__out[46]), "=r"(__out[47]),
      "=r"(__out[48]), "=r"(__out[49]), "=r"(__out[50]), "=r"(__out[51]),
      "=r"(__out[52]), "=r"(__out[53]), "=r"(__out[54]), "=r"(__out[55]),
      "=r"(__out[56]), "=r"(__out[57]), "=r"(__out[58]), "=r"(__out[59]),
      "=r"(__out[60]), "=r"(__out[61]), "=r"(__out[62]), "=r"(__out[63]),
      "=r"(__out[64]), "=r"(__out[65]), "=r"(__out[66]), "=r"(__out[67]),
      "=r"(__out[68]), "=r"(__out[69]), "=r"(__out[70]), "=r"(__out[71]),
      "=r"(__out[72]), "=r"(__out[73]), "=r"(__out[74]), "=r"(__out[75]),
      "=r"(__out[76]), "=r"(__out[77]), "=r"(__out[78]), "=r"(__out[79]),
      "=r"(__out[80]), "=r"(__out[81]), "=r"(__out[82]), "=r"(__out[83]),
      "=r"(__out[84]), "=r"(__out[85]), "=r"(__out[86]), "=r"(__out[87]),
      "=r"(__out[88]), "=r"(__out[89]), "=r"(__out[90]), "=r"(__out[91]),
      "=r"(__out[92]), "=r"(__out[93]), "=r"(__out[94]), "=r"(__out[95]),
      "=r"(__out[96]), "=r"(__out[97]), "=r"(__out[98]), "=r"(__out[99]),
      "=r"(__out[100]), "=r"(__out[101]), "=r"(__out[102]), "=r"(__out[103]),
      "=r"(__out[104]), "=r"(__out[105]), "=r"(__out[106]), "=r"(__out[107]),
      "=r"(__out[108]), "=r"(__out[109]), "=r"(__out[110]), "=r"(__out[111]),
      "=r"(__out[112]), "=r"(__out[113]), "=r"(__out[114]), "=r"(__out[115]),
      "=r"(__out[116]), "=r"(__out[117]), "=r"(__out[118]), "=r"(__out[119]),
      "=r"(__out[120]), "=r"(__out[121]), "=r"(__out[122]), "=r"(__out[123]),
      "=r"(__out[124]), "=r"(__out[125]), "=r"(__out[126]), "=r"(__out[127])
    : "r"(__taddr), "n"(__immHalfSplitoff.value)
    : "memory");
# endif
}
#endif // __cccl_ptx_isa >= 860
4445
+
4446
+ #endif // _CUDA_PTX_GENERATED_TCGEN05_LD_H_